In [None]:
# numpy and pandas imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# ML Libraries for scikit
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,\
                            precision_recall_curve,precision_score,recall_score,roc_auc_score,roc_curve,\
                            matthews_corrcoef, f1_score, make_scorer, auc
from scipy.stats import skew

#  Recursive Feature Elimination
from sklearn.feature_selection import RFE
        
# Python imports
from math import log, sqrt
import re
import collections
import warnings

# decision tree visualization related imports
import pydotplus
import graphviz

from IPython.display import Image

warnings.filterwarnings('ignore')

pd.options.display.max_columns = 400

In [None]:
train = pd.read_csv('/content/drive/MyDrive/DataSets/GLhack4/train.csv')
test = pd.read_csv('/content/drive/MyDrive/DataSets/GLhack4/test.csv')
sample= pd.read_csv('/content/drive/MyDrive/DataSets/GLhack4/sample_submission.csv')

In [None]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6512 entries, 0 to 6511
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   netgain  6512 non-null   int64
dtypes: int64(1)
memory usage: 51.0 KB


In [None]:
train.head()

Unnamed: 0,UserID,realtionship_status,industry,genre,targeted_sex,average_runtime(minutes_per_week),airtime,airlocation,ratings,expensive,money_back_guarantee,netgain
0,UI_9185,Married-civ-spouse,Pharma,Comedy,Male,40,Daytime,United-States,0.027465,Medium,No,0
1,UI_449,Never-married,Political,Comedy,Male,35,Morning,International,0.027465,Low,No,0
2,UI_9014,Never-married,Auto,Comedy,Male,40,Morning,United-States,0.027465,Low,No,0
3,UI_14987,Married-civ-spouse,Pharma,Infomercial,Male,40,Primetime,United-States,0.027465,Low,Yes,0
4,UI_25532,Married-civ-spouse,Other,Comedy,Female,50,Primetime,United-States,0.027465,Low,Yes,1


In [None]:
train.shape

(19536, 12)

In [None]:
train.isnull().sum() # there are no null values

UserID                               0
realtionship_status                  0
industry                             0
genre                                0
targeted_sex                         0
average_runtime(minutes_per_week)    0
airtime                              0
airlocation                          0
ratings                              0
expensive                            0
money_back_guarantee                 0
netgain                              0
dtype: int64

In [None]:
le = preprocessing.LabelEncoder()
le.fit(train['realtionship_status'])
list(le.classes_)
train['relationship_status'] = le.transform(train['realtionship_status']) 
train.drop('realtionship_status',axis=1,inplace=True)

le = preprocessing.LabelEncoder()
le.fit(train['industry'])
list(le.classes_)
train['industry'] = le.transform(train['industry']) 

le.fit(train['genre'])
list(le.classes_)
train['genre'] = le.transform(train['genre']) 

train['targeted_sex'] = train['targeted_sex'].replace(['Male','Female'],['1','0'])

le.fit(train['airtime'])
list(le.classes_)
train['airtime'] = le.transform(train['airtime']) 

le.fit(train['airlocation'])
list(le.classes_)
train['airlocation'] = le.transform(train['airlocation']) #try to normalize

le.fit(train['expensive'])
list(le.classes_)
train['expensive'] = le.transform(train['expensive']) 

train['money_back_guarantee'] = train['money_back_guarantee'].replace(['Yes','No'],['1','0'])

train.head(10)

Unnamed: 0,UserID,industry,genre,targeted_sex,average_runtime(minutes_per_week),airtime,airlocation,ratings,expensive,money_back_guarantee,netgain,relationship_status
0,UI_9185,4,0,1,40,0,39,0.027465,2,0,0,2
1,UI_449,5,0,1,35,1,19,0.027465,1,0,0,4
2,UI_9014,0,0,1,40,1,39,0.027465,1,0,0,4
3,UI_14987,4,3,1,40,2,39,0.027465,1,1,0,2
4,UI_25532,3,0,0,50,2,39,0.027465,1,1,1,2
5,UI_22292,4,0,1,40,0,39,0.027465,0,0,0,2
6,UI_22389,4,0,1,80,2,39,0.027465,1,1,0,2
7,UI_17,4,0,1,40,2,26,0.027465,0,1,0,2
8,UI_11739,4,0,1,40,1,39,0.027465,1,1,0,2
9,UI_16904,3,0,0,25,2,39,0.027465,0,0,0,2


In [None]:
le = preprocessing.LabelEncoder()
le.fit(test['realtionship_status'])
list(le.classes_)
test['relationship_status'] = le.transform(test['realtionship_status']) 
test.drop('realtionship_status',axis=1,inplace=True)

le = preprocessing.LabelEncoder()
le.fit(test['industry'])
list(le.classes_)
test['industry'] = le.transform(test['industry']) 

le.fit(test['genre'])
list(le.classes_)
test['genre'] = le.transform(test['genre']) 

test['targeted_sex'] = test['targeted_sex'].replace(['Male','Female'],['1','0'])

le.fit(test['airtime'])
list(le.classes_)
test['airtime'] = le.transform(test['airtime']) 

le.fit(test['airlocation'])
list(le.classes_)
test['airlocation'] = le.transform(test['airlocation']) #try to normalize

le.fit(test['expensive'])
list(le.classes_)
test['expensive'] = le.transform(test['expensive']) 

test['money_back_guarantee'] = test['money_back_guarantee'].replace(['Yes','No'],['1','0'])
test.drop('UserID',axis=1,inplace=True)
test.head(10)

Unnamed: 0,industry,genre,targeted_sex,average_runtime(minutes_per_week),airtime,airlocation,ratings,expensive,money_back_guarantee,relationship_status
0,2,3,0,40,2,38,0.027465,0,1,4
1,0,0,0,40,2,38,0.027465,1,0,0
2,4,3,1,40,2,38,0.027465,0,0,2
3,0,0,1,30,2,38,0.027465,0,1,4
4,5,0,1,70,1,38,0.027465,1,0,4
5,0,0,1,66,2,38,0.027465,1,1,4
6,4,0,1,40,2,38,0.027465,1,0,2
7,5,0,1,40,2,38,0.027465,2,0,4
8,0,0,1,15,0,38,0.027465,1,1,0
9,2,0,1,40,1,38,0.027465,1,0,4


In [None]:
features = ['industry','genre','targeted_sex','average_runtime(minutes_per_week)','airtime','airlocation','ratings','expensive','money_back_guarantee','relationship_status']
X= train[features]
y= train['netgain']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

Random Forest 

In [None]:
RF = RandomForestClassifier()
RF.fit(X_train,y_train)
y_pred2 = RF.predict(X_test)
f1_metric = f1_score(y_test,y_pred2)
print(f1_metric)

0.5368484122228879


In [None]:
datasets = pd.DataFrame(y_pred2,columns=['netgain'])
datasets.to_csv('sample_submission_RFC.csv',index=False)

VotingClassifier 

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
estimator = [] 
estimator.append(('LR',  
                  LogisticRegression())) 
estimator.append(('SVC', SVC(gamma ='auto', probability = True))) 
estimator.append(('DTC', DecisionTreeClassifier()))

vc=VotingClassifier(estimators = estimator, voting ='hard') 
vc.fit(X_train,y_train)
#model on train using all the independent values in df
vc_prediction = vc.predict(X_train)
vc_score= accuracy_score(y_train,vc_prediction)
print('voting classifier train set accuracy score :',vc_score)
#model on test using all the independent values in df
vc_prediction = vc.predict(X_test)
vc_score= accuracy_score(y_test,vc_prediction)
print('voting classifier train set accuracy score :',vc_score)

voting classifier train set accuracy score : 0.8057972869209112
voting classifier train set accuracy score : 0.7876151484135108


In [None]:
y_pred3 = vc.predict(test)
datasets = pd.DataFrame(y_pred3,columns=['netgain'])
datasets.to_csv('sample_submission_VC.csv',index=False)

In [126]:
vc=VotingClassifier(estimators = estimator, voting ='soft') 
vc.fit(X_train,y_train)
#model on train using all the independent values in df
vc_prediction = vc.predict(X_train)
vc_score= accuracy_score(y_train,vc_prediction)
print('voting classifier train set accuracy score :',vc_score)
#model on test using all the independent values in df
vc_prediction = vc.predict(X_test)
vc_score= accuracy_score(y_test,vc_prediction)
print('voting classifier test set accuracy score :',vc_score)
f1_metric = f1_score(y_test,vc_prediction)
print(f1_metric)

voting classifier train set accuracy score : 0.8313391771706443
voting classifier test set accuracy score : 0.8047094957768108
0.4584811923349894


In [None]:
y_pred4 = vc.predict(test)
datasets = pd.DataFrame(y_pred4,columns=['netgain'])
datasets.to_csv('sample_submission_VC2.csv',index=False)

Gaussian Naive Bayes

In [127]:
GNB=GaussianNB()
GNB.fit(X_train,y_train)
#model on train using all the independent values in df
GNB_prediction = GNB.predict(X_train)
GNB_score= accuracy_score(y_train,GNB_prediction)
print('GNB classifier train set accuracy score :',GNB_score)
#model on test using all the independent values in df
GNB_prediction = GNB.predict(X_test)
GNB_score= accuracy_score(y_test,GNB_prediction)
print('GNB classifier test set accuracy score :',GNB_score)
f1_metric = f1_score(y_test,GNB_prediction)
print(f1_metric)

GNB classifier train set accuracy score : 0.801842728261565
GNB classifier test set accuracy score : 0.8088047094957768
0.33717834960070986


In [None]:
y_pred5 = GNB.predict(test)
datasets = pd.DataFrame(y_pred5,columns=['netgain'])
datasets.to_csv('sample_submission_GNB.csv',index=False)

Logistic Regression

In [125]:
LR=LogisticRegression()
LR.fit(X_train,y_train)
#model on train using all the independent values in df
LR_prediction = LR.predict(X_train)
LR_score= accuracy_score(y_train,LR_prediction)
print('LR classifier train set accuracy score :',LR_score)
#model on test using all the independent values in df
LR_prediction = LR.predict(X_test)
LR_score= accuracy_score(y_test,LR_prediction)
print('LR classifier test set accuracy score :',LR_score)
f1_metric = f1_score(y_test,LR_prediction)
print(f1_metric)

LR classifier train set accuracy score : 0.7743297715784759
LR classifier test set accuracy score : 0.7783465574609675
0.2322695035460993


In [None]:
y_pred6 = LR.predict(test)
datasets = pd.DataFrame(y_pred6,columns=['netgain'])
datasets.to_csv('sample_submission_LR.csv',index=False)