1. Importing Required Libraries

In [2]:
import datetime as dt
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split

2. Reading Dataset

In [3]:
# Load the training set, the test set and the sample-submission file
# from the current working directory.
train = pd.read_csv("train_wn75k28.csv")
test = pd.read_csv("test_Wf7sxXF.csv")
submission = pd.read_csv('sample_submission_2zvVjBu.csv')

3. Checking for missing values and outliers

In [4]:
# Number of missing (NaN) entries in each column of the training set.
train.isnull().sum()

id                          0
created_at                  0
campaign_var_1              0
campaign_var_2              0
products_purchased      20911
signup_date             15113
user_activity_var_1         0
user_activity_var_2         0
user_activity_var_3         0
user_activity_var_4         0
user_activity_var_5         0
user_activity_var_6         0
user_activity_var_7         0
user_activity_var_8         0
user_activity_var_9         0
user_activity_var_10        0
user_activity_var_11        0
user_activity_var_12        0
buy                         0
dtype: int64

4. Missing Value Treatment
Missing values in the `products_purchased` and `signup_date` columns account for more than half of the data points present. Hence, rather than imputing values with the mean, mode or median, 0 is used as a placeholder in `products_purchased`.

In [None]:
# products_purchased: encode a missing value as 0, then store the column as int.
for frame in (train, test):
    frame['products_purchased'] = frame['products_purchased'].fillna(0).astype('int')

# signup_date: fill missing values with ONE placeholder date, taken from the
# first created_at of the training set, and reuse it for the test set.
# Using each frame's own first created_at would encode "missing signup" as a
# different date in train and test, so the derived signup_* features would not
# line up between the data the model is fitted on and the data it scores.
# .iloc[0] selects the first row by position, independent of the index labels.
signup_placeholder = train['created_at'].iloc[0]
train['signup_date'] = train['signup_date'].fillna(signup_placeholder)
test['signup_date'] = test['signup_date'].fillna(signup_placeholder)

In [None]:
# Convert both date columns to pandas datetimes in the train and test frames.
for frame in (train, test):
    for date_col in ('created_at', 'signup_date'):
        frame[date_col] = pd.to_datetime(frame[date_col])

5. Extracting features from Datetime columns

In [None]:
# Difference between record creation and signup, kept as a timedelta for now.
for frame in (train, test):
    frame['sub_days'] = frame['created_at'].sub(frame['signup_date'])

In [None]:
# Day of the month of the signup date.
for frame in (train, test):
    frame['signup_dom'] = frame['signup_date'].dt.day

In [None]:
# Calendar month (1-12) of the signup date.
for frame in (train, test):
    frame['signup_month'] = frame['signup_date'].dt.month

In [None]:
# Calendar year of the signup date.
for frame in (train, test):
    frame['signup_year'] = frame['signup_date'].dt.year

In [None]:
# Day of the week of the signup date (pandas convention: Monday=0 ... Sunday=6).
for frame in (train, test):
    frame['signup_dow'] = frame['signup_date'].dt.dayofweek

In [None]:
# Reduce the timedelta column to a whole number of days.
# .dt.days is used instead of astype('timedelta64[D]') followed by a floor
# division: pandas 2.x rejects the 'D' resolution in astype (only s/ms/us/ns
# are supported), while older versions return floats from that cast, which
# cannot then be floor-divided by a timedelta64. .dt.days yields the floored
# day count directly on every pandas version.
train['sub_days'] = train['sub_days'].dt.days

In [None]:
# Reduce the timedelta column to a whole number of days.
# .dt.days is used instead of astype('timedelta64[D]') followed by a floor
# division: pandas 2.x rejects the 'D' resolution in astype (only s/ms/us/ns
# are supported), while older versions return floats from that cast, which
# cannot then be floor-divided by a timedelta64. .dt.days yields the floored
# day count directly on every pandas version.
test['sub_days'] = test['sub_days'].dt.days

6. Removing unnecessary features from datasets

In [None]:
# Store the day-difference feature as a plain integer column in both frames.
for frame in (test, train):
    frame['sub_days'] = frame['sub_days'].astype('int')

# The identifier and the raw date columns are not passed to the models.
non_feature_cols = ['id', 'created_at', 'signup_date']
test1 = test.drop(columns=non_feature_cols)

# Keep the label aside and remove it from the training features.
target = train['buy']
train1 = train.drop(columns=non_feature_cols + ['buy'])

7. Model Fitting and Training
a) Decision Trees

In [None]:
# Hold out part of the labelled data for validation. The seed is fixed so the
# split is the same on every run; the split size is scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(
    train1,
    target,
    random_state=0,
)

In [None]:
# Decision tree with default settings; used as the estimator for the grid search.
model1 = tree.DecisionTreeClassifier()

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values explored by the search.
params = {
    "max_depth": np.arange(1, 10),
    "min_samples_split": [100, 200, 300, 400, 500, 1000, 700, 800, 900],
}

# Cross-validated search over every combination in the grid, on the training split.
grid = GridSearchCV(estimator=model1, param_grid=params)
grid.fit(X_train, y_train)

# Best cross-validation score and the estimator that produced it.
print(grid.best_score_)
print(grid.best_estimator_)

In [None]:
# Rebuild the tree with fixed hyper-parameters.
# NOTE(review): presumably the best values reported by the grid search - confirm.
model1 = tree.DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=100,
)

In [None]:
# Train the decision tree on the training split.
model1.fit(X_train, y_train)

In [None]:
# Draw the fitted tree on an explicitly sized axes, labelling each split with its
# feature name and colouring nodes by majority class. With the default figure
# size and no feature names, the plot of this tree is too cramped to read.
fig, ax = plt.subplots(figsize=(24, 12))
tree.plot_tree(
    model1,
    feature_names=list(X_train.columns),
    filled=True,
    fontsize=8,
    ax=ax,
)
plt.show()

In [None]:
# Predicted labels of the decision tree for the held-out validation split.
pred_tree = model1.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the decision tree on the validation split.
tree_report = classification_report(y_test, pred_tree)
print(tree_report)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix (rows: true labels, columns: predictions) for the validation split.
tree_confusion = confusion_matrix(y_test, pred_tree)
print(tree_confusion)

In [None]:
# Decision-tree predictions for the competition test features.
tree_predictions = model1.predict(test1)

In [None]:
# Put the decision-tree predictions into the submission frame's 'buy' column.
submission['buy'] = tree_predictions

In [None]:
# Write the decision-tree submission. The target folder is created first:
# to_csv raises when the nested relative directory does not exist, which would
# otherwise lose the result of the preceding cells.
out_path = 'Desktop/Analytics_Vidhya/Jobathon June/S_tree.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
submission.to_csv(out_path, index=False)

b) Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid: tree depth 1-9, three split thresholds, fixed forest size.
params = {
    "max_depth": np.arange(1, 10),
    "min_samples_split": [2, 5, 10],
    "n_estimators": [100],
}

# The forest is seeded (same seed as the train/validation split) so that its
# random sampling, and therefore the selected parameters, the validation report
# and the submitted predictions, are repeatable across runs.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=params,
)
grid.fit(X_train, y_train)

# Best cross-validation score and the estimator that produced it.
print(grid.best_score_)
print(grid.best_estimator_)

In [None]:
# Validation-split predictions from the fitted random-forest grid search.
rf_val = grid.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the random forest on the validation split.
rf_report = classification_report(y_test, rf_val)
print(rf_report)

In [None]:
# Random-forest predictions for the competition test features, stored in the
# submission frame's 'buy' column.
rf_predictions = grid.predict(test1)
submission['buy'] = rf_predictions

In [None]:
# Write the random-forest submission. The target folder is created first:
# to_csv raises when the nested relative directory does not exist, which would
# otherwise lose the result of the preceding cells.
out_path = 'Desktop/Analytics_Vidhya/Jobathon June/S_rf.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
submission.to_csv(out_path, index=False)

c) AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost classifier with scikit-learn's default settings.
Abc = AdaBoostClassifier()

In [None]:
# Train the default AdaBoost model, then predict the validation split.
Abc.fit(X_train, y_train)
Abc_val = Abc.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the default AdaBoost model on the validation split.
abc_report = classification_report(y_test, Abc_val)
print(abc_report)

In [None]:
# Second AdaBoost model with an explicit learning rate of 1.1, trained on the
# training split and used to predict the validation split.
Abc2 = AdaBoostClassifier(learning_rate=1.1)
Abc2.fit(X_train, y_train)
Abc2_val = Abc2.predict(X_test)

In [None]:
# Compute the AdaBoost predictions for the competition test features here:
# nothing earlier in the notebook defines adaboost_predictions, so using it
# without this line raises NameError.
# NOTE(review): Abc2 (learning_rate=1.1) is assumed to be the intended model;
# switch to Abc if the default model should be submitted instead.
adaboost_predictions = Abc2.predict(test1)
submission['buy'] = adaboost_predictions

# Create the target folder before writing; to_csv raises when the nested
# relative directory does not exist.
out_path = 'Desktop/Analytics_Vidhya/Jobathon June/S_ada.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
submission.to_csv(out_path, index=False)

d) CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier with library defaults, trained on the training split.
cbc = CatBoostClassifier()
cbc.fit(X_train, y_train)

In [None]:
# Validate the default CatBoost model on the held-out split.
cbc_val = cbc.predict(X_test)
cbc_report = classification_report(y_test, cbc_val)
print(cbc_report)

In [None]:
# Default CatBoost predictions for the competition test features, stored in the
# submission frame's 'buy' column.
catboost_predictions = cbc.predict(test1)
submission['buy'] = catboost_predictions

# Create the target folder before writing; to_csv raises when the nested
# relative directory does not exist.
out_path = 'Desktop/Analytics_Vidhya/Jobathon June/S_cat.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
submission.to_csv(out_path, index=False)

In [None]:
# Feature importances reported by the default CatBoost model.
cbc_importances = cbc.feature_importances_
print(cbc_importances)

In [None]:
# Second CatBoost model with the learning rate set explicitly to 0.03.
cbc2 = CatBoostClassifier(learning_rate=0.03)

In [None]:
# Train the second CatBoost model on the training split.
cbc2.fit(X_train, y_train)

In [None]:
# Validate the second CatBoost model on the held-out split.
cbc2_val = cbc2.predict(X_test)
cbc2_report = classification_report(y_test, cbc2_val)
print(cbc2_report)

In [None]:
# Predictions of the second CatBoost model for the competition test features,
# stored in the submission frame's 'buy' column.
catboost_predictions2 = cbc2.predict(test1)
submission['buy'] = catboost_predictions2

# Create the target folder before writing; to_csv raises when the nested
# relative directory does not exist.
out_path = 'Desktop/Analytics_Vidhya/Jobathon June/S_cat_lr.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)
submission.to_csv(out_path, index=False)