In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import lightgbm as lgb
import xgboost as xgb

In [None]:
# Both competition CSVs live in the same dataset directory.
DATA_DIR = '../input/churn-risk-hackerearth'
train_path = f'{DATA_DIR}/train.csv'
test_path = f'{DATA_DIR}/test.csv'

In [None]:
# Load the training CSV and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer=train_path)
train_df.head(n=5)

In [None]:
# Columns excluded from the feature set; train_df itself is left untouched.
excluded_columns = [
    'customer_id',
    'Name',
    'joining_date',
    'referral_id',
    'security_no',
    'last_visit_time',
]
train_df2 = train_df.drop(columns=excluded_columns)

In [None]:
# Preview the frame that remains after the column drop.
train_df2.head()

In [None]:
# Distinct values that occur in the label column.
observed_scores = set(train_df2['churn_risk_score'])
list(observed_scores)

In [None]:
# Map each raw churn_risk_score value (1..5 and -1) to a class index 0..5.
churn_dict = dict(zip([1, 2, 3, 4, 5, -1], range(6)))

In [None]:
def age_classify(n):
    """Bucket an age into one of three ordinal groups.

    Returns 0 for ages below 20, 1 for ages from 20 to 40 inclusive and
    2 for ages above 40. A value that satisfies none of the comparisons
    (e.g. NaN) yields None.
    """
    if n > 40:
        return 2
    if n >= 20:
        return 1
    if n < 20:
        return 0
    # Reached only when every comparison is False, as with NaN.
    return None
    
# Replace the raw age with its bucket index from age_classify.
age_buckets = train_df2['age'].apply(age_classify)
train_df2['age'] = age_buckets

train_df2.head()

In [None]:
# Columns that keep their raw values; every other column is integer-encoded.
passthrough_columns = ['days_since_last_login', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet','avg_time_spent','churn_risk_score']

# Missing values become 0 before encoding, so 0 is treated as its own level.
train_df2 = train_df2.fillna(0)
for column in tqdm(train_df2.columns):
    if column not in passthrough_columns:
        # pd.factorize numbers the distinct values in order of first appearance.
        codes, _levels = pd.factorize(train_df2[column])
        train_df2[column] = codes

train_df2.head()

# train_df2 = train_df2.apply(lambda x : pd.factorize(x)[0])

In [None]:
# Swap the literal 'Error' marker for 0 across the whole frame, then remap the
# label column to class indices via churn_dict.
error_free = train_df2.replace({'Error': 0})
error_free['churn_risk_score'] = error_free['churn_risk_score'].replace(churn_dict)
train_df2 = error_free
train_df2.shape

In [None]:
# Preview the frame again after the 'Error' replacement and label remapping.
train_df2.head()

In [None]:
# Rows whose remapped churn_risk_score equals 4 (despite the name "churn0").
is_score_4 = train_df2['churn_risk_score'] == 4
churn0 = train_df2[is_score_4]
# plt.plot(churn0)
# Distinct membership_category codes in that subset with their counts.
freqs = np.unique(churn0['membership_category'], return_counts=True)
# plt.bar(freqs[0], freqs[1])
# Scatter of encoded membership category against encoded churn class, all rows.
plt.scatter(train_df2['membership_category'], train_df2['churn_risk_score'])

In [None]:
def NormalizeData(data):
    """Min-max scale ``data`` into the range [0, 1].

    Parameters
    ----------
    data : array-like with ``min``/``max`` methods and arithmetic support
        (e.g. a NumPy array or pandas Series).

    Returns
    -------
    ``(data - min) / (max - min)``. When the input is constant (max == min),
    zeros of the same shape are returned instead of dividing by zero, which
    would otherwise produce NaN for every element.
    """
    lowest = data.min()
    span = data.max() - lowest
    # Guard only the scalar case; non-scalar spans (e.g. per-column results
    # from a DataFrame) keep the original element-wise behavior.
    if np.ndim(span) == 0 and span == 0:
        return (data - lowest) * 0.0
    return (data - lowest) / span

In [None]:
# train_df2['avg_frequency_login_days'] = train_df2['avg_frequency_login_days'].apply(lambda x:float(x))

In [None]:
# train_df2['days_since_last_login'] = NormalizeData(train_df2['days_since_last_login'])
# train_df2['avg_transaction_value'] = NormalizeData(train_df2['avg_transaction_value'])
# train_df2['avg_frequency_login_days'] = NormalizeData(train_df2['avg_frequency_login_days'])
# train_df2['points_in_wallet'] = NormalizeData(train_df2['points_in_wallet'])
# train_df2['avg_time_spent'] = NormalizeData(train_df2['avg_time_spent'])
# train_df2.head()

In [None]:
# Every column except the last is a feature; the last column is the label.
train_matrix = train_df2.values
X = train_matrix[:, :-1]
Y = train_matrix[:, -1]

In [None]:
# Distinct label values present in Y.
set(Y)

In [None]:
# Hold out 25% of the rows for validation. The split is seeded so that the
# accuracy figures computed on X_test/Y_test are reproducible across
# Restart & Run All; an unseeded split reshuffles the rows on every run.
SPLIT_SEED = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y.astype('int'),
    test_size=0.25,
    random_state=SPLIT_SEED,
)

In [None]:
# Build the LightGBM training set with every row weighted equally.
# The previous version attached np.random.rand(...) as per-row sample weights,
# which arbitrarily down-weights training rows and makes each run differ; no
# weights are set here, so LightGBM treats all rows uniformly.
train_data = lgb.Dataset(X_train, label=Y_train)

In [None]:
# LightGBM settings: 6-class multiclass objective, evaluated by multi_error.
param = {
    'num_leaves': 100,
    'objective': 'multiclass',
    'num_class': 6,
    'metric': 'multi_error',
}
# Train for 15 boosting rounds.
lgbt = lgb.train(params=param, train_set=train_data, num_boost_round=15)

In [None]:
# One row of class scores per held-out sample; the predicted class is the
# position of the largest score in the row.
preds = lgbt.predict(X_test)
correct_count = sum(
    1
    for class_scores, true_label in zip(preds, Y_test)
    if np.argmax(class_scores) == true_label
)

# Fraction of held-out rows classified correctly.
correct_count/preds.shape[0]

In [None]:
# Second model: scikit-learn gradient boosting with 120 estimators,
# fitted on the same training split.
gbt = GradientBoostingClassifier(n_estimators=120)
gbt.fit(X=X_train, y=Y_train)

In [None]:
# Score the fitted gradient-boosting model on the held-out split.
score = gbt.score(X=X_test, y=Y_test)
score

In [None]:
# Load the test CSV and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer=test_path)
test_df.head(n=5)

In [None]:
# Same column drop as for training, then 'Error' -> 0 and missing values -> 0.
test_df2 = (
    test_df
    .drop(columns=['customer_id', 'Name', 'joining_date', 'referral_id', 'security_no', 'last_visit_time'])
    .replace({'Error': 0})
    .fillna(0)
)
# Bucket age with the same helper that was applied to the training frame.
test_df2['age'] = test_df2['age'].apply(age_classify)

In [None]:
# Columns that keep their raw values; every other column is integer-encoded.
passthrough_cols = ['days_since_last_login', 'avg_transaction_value', 'avg_frequency_login_days', 'points_in_wallet','avg_time_spent','churn_risk_score']

# Rebuild the training-time view of the encoded columns from the raw training
# frame (same drop -> age bucketing -> fillna(0) sequence used for training).
# pd.factorize numbers values by order of first appearance, so factorizing the
# test frame on its own would give the same category a different integer than
# the one the model was fitted on.
train_reference = train_df.drop(columns = ['customer_id', 'Name', 'joining_date', 'referral_id', 'security_no', 'last_visit_time'])
train_reference['age'] = train_reference['age'].apply(age_classify)
train_reference = train_reference.fillna(0)

for c in tqdm(test_df2.columns):

    if(c in passthrough_cols):
        continue
    # Category order exactly as produced when the training column was factorized.
    _, train_levels = pd.factorize(train_reference[c])
    # get_indexer returns each value's training code, or -1 for a value that
    # never occurred in the training column.
    test_df2[c] = pd.Index(train_levels).get_indexer(test_df2[c])

test_df2.head()

In [None]:
# Feature matrix for the test rows (all columns of test_df2).
X_t = test_df2.values


In [None]:
# Class-index predictions for the test rows from the gradient-boosting model.
# Note: this rebinds `preds`, which previously held the LightGBM class scores.
preds = gbt.predict(X=X_t)

In [None]:
# Shape of the test prediction array.
preds.shape

In [None]:
# Distinct class indices that were actually predicted.
np.unique(preds)

In [None]:
# Map class index 0..5 back to the raw churn_risk_score values 1..5 and -1.
rev_churn = dict(enumerate([1, 2, 3, 4, 5, -1]))

In [None]:
# Translate each predicted class index back to its raw score; an index missing
# from rev_churn raises KeyError.
final_preds = np.array(list(map(rev_churn.__getitem__, preds)))

In [None]:
# Distinct raw score values present in the final predictions.
np.unique(final_preds)

In [None]:
# Load the sample submission; its column names are reused for the output file.
ss_df = pd.read_csv(filepath_or_buffer='../input/churn-risk-hackerearth/sample_submission.csv')
ss_df.head(n=5)

In [None]:
# Pair each test customer_id with its predicted score as a two-column array.
# reshape(-1, 1) infers the row count from the data, where the previous
# version hard-coded 19919 rows and failed for any other test-set size.
customer_ids = test_df['customer_id'].values
assert len(customer_ids) == len(final_preds), "one prediction is required per test row"
sub_data = np.concatenate((customer_ids.reshape(-1, 1), final_preds.reshape(-1, 1)), axis = 1)
sub_data.shape

In [None]:
# Wrap the id/prediction array in a frame using the sample submission's headers.
sub_data_Df = pd.DataFrame(data=sub_data, columns=ss_df.columns)

In [None]:
# Inspect the first ten submission rows before writing the file.
sub_data_Df.head(n=10)

In [None]:
# Write only the columns taken from the sample submission. index=False keeps
# pandas from adding the row index as an extra unnamed leading column, which
# would not match the sample submission's header.
sub_data_Df.to_csv('submission.csv', index = False)