In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder,StandardScaler, OneHotEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer,CountVectorizer
from sklearn.cluster import KMeans
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
import warnings
warnings.filterwarnings("ignore")

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation,BatchNormalization
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.optimizers import SGD,Adagrad
from keras.layers.advanced_activations import PReLU
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping,TensorBoard
from statistics import mean

Using TensorFlow backend.


# LOADING THE DATA

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
feat_gat_tr = pd.read_csv("/content/drive/MyDrive/talkingdata-mobile-user-demographics/gender_age_train.csv",index_col='device_id')
feat_gat_test = pd.read_csv("/content/drive/MyDrive/talkingdata-mobile-user-demographics/gender_age_test.csv",index_col='device_id')
feat_ph=pd.read_csv("/content/drive/MyDrive/talkingdata-mobile-user-demographics/phone_brand_device_model.csv")
feat_app_lab=pd.read_csv('/content/drive/MyDrive/talkingdata-mobile-user-demographics/app_labels.csv')
feat_lab_cat=pd.read_csv("/content/drive/MyDrive/talkingdata-mobile-user-demographics/label_categories.csv")
feat_ap_eve=pd.read_csv("/content/drive/MyDrive/talkingdata-mobile-user-demographics/app_events.csv", dtype={'is_active':bool})
feat_eve = pd.read_csv('/content/drive/MyDrive/talkingdata-mobile-user-demographics/events.csv',  parse_dates=['timestamp'],index_col='event_id')

In [None]:
#removing duplicate device id's
feat_ph = feat_ph.drop_duplicates('device_id',keep='first').set_index('device_id') 

In [None]:
print(feat_gat_tr.shape)
print(feat_gat_test.shape)
print(feat_ph.shape)
print(feat_app_lab.shape)
print(feat_lab_cat.shape)
print(feat_ap_eve.shape)
print(feat_eve.shape)

(74645, 3)
(112071, 0)
(186716, 2)
(459943, 2)
(930, 2)
(32473067, 4)
(3252950, 4)


# SPLITTING THE DATA

SOME DEVICES HAVE EVENTS INFORMATION AND SOME DEVICES DOES NOT HAVE EVENT INFORMTION.
1. SO WE DIVIDE THE DATA INTO TRAIN AND TEST IN BOTH EVENTS AND NO EVENTS DATA.

In [None]:
#https://docs.scipy.org/doc/numpy/reference/generated/numpy.in1d.html
feat_mas=np.in1d(feat_gat_tr.index,feat_eve["device_id"].values)
feat_gatr_eve= feat_gat_tr[feat_mas]

feat_mas=np.in1d(feat_gat_test.index,feat_eve["device_id"].values)
feat_gate_eve= feat_gat_test[feat_mas]

In [None]:
#https://docs.scipy.org/doc/numpy/reference/generated/numpy.in1d.html
feat_mas=np.in1d(feat_gat_tr.index,feat_eve["device_id"].values,invert=True)
feat_gatr_noeve= feat_gat_tr[feat_mas]

feat_mas=np.in1d(feat_gat_test.index,feat_eve["device_id"].values,invert=True)
feat_gate_noeve= feat_gat_test[feat_mas]

In [None]:
#Each row of is given by a unique integer as an identifier

feat_gat_tr['trainrow'] = np.arange(feat_gat_tr.shape[0])
feat_gat_test['testrow'] = np.arange(feat_gat_test.shape[0])

feat_gatr_eve['trainrow']=np.arange(feat_gatr_eve.shape[0])
feat_gate_eve['testrow']=np.arange(feat_gate_eve.shape[0])

feat_gatr_noeve['trainrow']=np.arange(feat_gatr_noeve.shape[0])
feat_gate_noeve['testrow']=np.arange(feat_gate_noeve.shape[0])

In [None]:
print("train data with events information:",feat_gatr_eve.shape)
print("train data without events information:",feat_gatr_noeve.shape)
print("test data with events information:",feat_gate_eve.shape)
print("test data without events information:",feat_gate_noeve.shape)

train data with events information: (23309, 4)
train data without events information: (51336, 4)
test data with events information: (35194, 1)
test data without events information: (76877, 1)


# VECTORIZING PHONE BRAND

In [None]:
feat_br_encoder = LabelEncoder().fit(feat_ph.phone_brand)
feat_ph['brand'] = feat_br_encoder.transform(feat_ph['phone_brand'])
nbrand=len(feat_br_encoder.classes_)

In [None]:
import pickle
with open('brandencoder','wb') as fp:
    pickle.dump(feat_br_encoder,fp)

# VECTORIZING PHONE MODEL

In [None]:
m = feat_ph.phone_brand.str.cat(feat_ph.device_model)
#m=feat_ph['phone_brand'].str.cat(feat_ph['device_model'])
feat_mod_encodr = LabelEncoder().fit(m)
feat_ph['model'] = feat_mod_encodr.transform(m)
nmodel=len(feat_mod_encodr.classes_)

In [None]:
import pickle
with open('feat_mod_encodr','wb') as fp:
    pickle.dump(feat_mod_encodr,fp)

# FEATURES USING APP ID'S

In [None]:
#https://www.kaggle.com/dvasyukova/a-linear-model-on-apps-and-labels
#number of times app used in a device id's
feat_app_encodr = LabelEncoder().fit(feat_ap_eve['app_id'])
feat_ap_eve['app'] = feat_app_encodr.transform(feat_ap_eve['app_id'])


napps = len(feat_app_encodr.classes_)

feat_devic_aps = (feat_ap_eve.merge(feat_eve[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])# grouping by device id and app and finding size of app
                       .merge(feat_gatr_eve[['trainrow']], how='left', left_index=True, right_index=True)#finding trainrow
                       .merge(feat_gate_eve[['testrow']], how='left', left_index=True, right_index=True)#finding testrow
                       .reset_index())
feat_devic_aps.head()

Unnamed: 0,device_id,app,size,trainrow,testrow
0,-9222956879900151005,548,18,5145.0,
1,-9222956879900151005,1096,18,5145.0,
2,-9222956879900151005,1248,26,5145.0,
3,-9222956879900151005,1545,12,5145.0,
4,-9222956879900151005,1664,18,5145.0,


In [None]:
feat_devic_aps.shape

(2369025, 5)

In [None]:
import pickle
with open('feat_app_encodr','wb') as fp:
    pickle.dump(feat_app_encodr,fp)

# FEATURES USING APP LABELS

In [None]:
feat_app_lab = feat_app_lab.loc[feat_app_lab.app_id.isin(feat_ap_eve.app_id.unique())]
feat_app_lab['app'] = feat_app_encodr.transform(feat_app_lab.app_id)
feat_lab_encodr = LabelEncoder().fit(feat_app_lab.label_id)
feat_app_lab['label'] = feat_lab_encodr.transform(feat_app_lab.label_id)
nlabels = len(feat_lab_encodr.classes_)

In [None]:
import pickle
with open('feat_lab_encodr','wb') as fp:
    pickle.dump(feat_lab_encodr,fp)

In [None]:
feat_devic_lab = (feat_devic_aps[['device_id','app']]
                .merge(feat_app_lab[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(feat_gatr_eve[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(feat_gate_eve[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())
feat_devic_lab.head()

Unnamed: 0,device_id,label,size,trainrow,testrow
0,-9222956879900151005,117,1,5145.0,
1,-9222956879900151005,120,1,5145.0,
2,-9222956879900151005,126,1,5145.0,
3,-9222956879900151005,138,2,5145.0,
4,-9222956879900151005,147,2,5145.0,


In [None]:
feat_devic_lab.shape

(4244113, 5)

# FEATURES USING TIME FEATURE

In [None]:
#we are processing timestamp feature to get hour and day and dividing into 4 bins
feat_eve['hour'] = feat_eve['timestamp'].map(lambda x:pd.to_datetime(x).hour)
feat_eve['hourbin'] = [1 if ((x>=1)&(x<=6)) else 2 if ((x>=7)&(x<=12)) else 3 if ((x>=13)&(x<=18)) else 4 for x in feat_eve['hour']]

In [None]:
feat_eve.hour=feat_eve.hour.astype(str)
feat_eve.hourbin=feat_eve.hourbin.astype(str)

In [None]:
feat_hr_join = feat_eve.groupby("device_id")["hour"].apply(lambda x: " ".join('0'+str(s) for s in x))

In [None]:
feat_hr_bin_join=feat_eve.groupby("device_id")["hourbin"].apply(lambda x: " ".join('0'+str(s) for s in x))

In [None]:
feat_days_join=feat_eve['timestamp'].dt.day_name()
feat_eve['day']=feat_days_join.map({'Sunday':0,'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6})

In [None]:
feat_days_join = feat_eve.groupby("device_id")["day"].apply(lambda x: " ".join("0"+str(s) for s in x))

# FEATURES USING LATITUDE AND LONGITUDE

In [None]:
feat_med_lat = feat_eve.groupby("device_id")["latitude"].agg('median')

In [None]:
feat_med_lon=feat_eve.groupby("device_id")["longitude"].agg('median')

WE ARE CLUSTERING MEDIAN LATITUDES AND LONGITUDES IN TO 10 CLUSTERS

In [None]:
feat_com=pd.concat([feat_med_lat, feat_med_lon], axis=1)
kmeans = KMeans(n_clusters=10, random_state=0).fit(feat_com)
feat_clustrd_geo_featrs=pd.Series(kmeans.labels_)
feat_clustrd_geo_featrs.index=feat_med_lon.index

In [None]:
kmeans.labels_

array([5, 1, 1, ..., 1, 4, 1])

In [None]:
import pickle
with open('kmeans_labels','wb') as fp:
    pickle.dump(kmeans.labels_,fp)

In [None]:
feat_clustrd_geo_featrs.index=feat_med_lon.index
print(feat_clustrd_geo_featrs.index)

Int64Index([-9222956879900151005, -9222661944218806987, -9222399302879214035,
            -9221825537663503111, -9221767098072603291, -9221079146476055829,
            -9221026417907250887, -9220830859283101130, -9220452176650064280,
            -9220329415676028483,
            ...
             9219164468944552013,  9219842210460037807,  9219937375310355234,
             9220562120895859549,  9220814716773471568,  9220914901466458680,
             9221586026451102237,  9222110179000857683,  9222355582733155698,
             9222539910510672930],
           dtype='int64', name='device_id', length=60865)


In [None]:
import pickle
with open('n_clusters','wb') as fp:
    pickle.dump(kmeans,fp)

# FEATURES BASED ON ACTIVE APPS AND APP COUNT

In [None]:
feat_aps = feat_ap_eve.groupby("event_id")["is_active"].apply(lambda x: " ".join(str(s) for s in x))

In [None]:
feat_aps.shape

(1488096,)

In [None]:
feat_eve["apps_active"] = feat_eve.index.map(apps)
feat_actv_aps_eve = feat_eve.groupby("device_id")["apps_active"].apply(lambda x: " ".join(str(s) for s in x if str(s)!='nan'))


# MODELLING

# ONE HOT ENCODING OF PHONE BRAND

In [None]:
feat_gat_tr['brand'] = feat_ph['brand']
feat_gat_test['brand'] = feat_ph['brand']

feat_Xtr_br = csr_matrix((np.ones(feat_gat_tr.shape[0]), 
                       (feat_gat_tr.trainrow, feat_gat_tr.brand)))
feat_Xte_br = csr_matrix((np.ones(feat_gat_test.shape[0]), 
                       (feat_gat_test.testrow, feat_gat_test.brand)))

In [None]:
feat_gat_tr.shape

(74645, 5)

In [None]:
feat_gat_tr['brand'] = feat_ph['brand']
feat_gat_test['brand'] = feat_ph['brand']

feat_gat_tr['brand'] = str(feat_gat_tr['brand'])
feat_gat_test['brand'] = str(feat_gat_test['brand'])

feat_vectorzr = CountVectorizer(lowercase = True)
feat_vectorzr.fit(feat_gat_tr['brand'])
feat_tr_br_onehot = feat_vectorzr.transform(feat_gat_tr['brand'].values)
feat_te_br_onehot = feat_vectorzr.transform(feat_gat_test['brand'].values)
print(feat_tr_br_onehot.shape)
print(feat_te_br_onehot.shape)

(74645, 77)
(112071, 77)


In [None]:
import pickle
with open('brand_onehot','wb') as fp:
    pickle.dump(feat_vectorzr,fp)

In [None]:
print(feat_Xtr_br.shape)
print(feat_Xte_br.shape)

(74645, 131)
(112071, 131)


# ONE HOT ENCODING OF PHONE MODEL

In [None]:
feat_gat_tr['model'] = feat_ph['model']
feat_gat_test['model'] = feat_ph['model']

feat_Xtr_mod = csr_matrix((np.ones(feat_gat_tr.shape[0]), 
                       (feat_gat_tr.trainrow, feat_gat_tr.model)))
feat_Xte_mod = csr_matrix((np.ones(feat_gat_test.shape[0]), 
                       (feat_gat_test.testrow, feat_gat_test.model)))
print(feat_Xtr_mod.shape)
print(feat_Xte_mod.shape)

(74645, 1667)
(112071, 1667)


In [None]:
feat_gat_tr['model'] = feat_ph['model']
feat_gat_test['model'] = feat_ph['model']

feat_gat_tr['model'] = str(feat_gat_tr['model'])
feat_gat_test['model'] = str(feat_gat_test['model'])

feat_vectorzr = CountVectorizer(lowercase = True)
feat_vectorzr.fit(feat_gat_tr['model'])
feat_tr_mod_onehot = feat_vectorzr.transform(feat_gat_tr['model'].values)
feat_te_mod_onehot = feat_vectorzr.transform(feat_gat_test['model'].values)
print(feat_tr_mod_onehot.shape)
print(feat_te_mod_onehot.shape)

(74645, 98)
(112071, 98)


In [None]:
import pickle
with open('model_onehot','wb') as fp:
    pickle.dump(feat_vectorzr,fp)

# ONE HOT ENCODING OF DEVICE APPS

In [None]:
d = feat_devic_aps.dropna(subset=['trainrow'])
feat_Xtr_ap = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)), 
                      shape=(feat_gat_tr.shape[0],napps))
d = feat_devic_aps.dropna(subset=['testrow'])
feat_Xte_ap = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), 
                      shape=(feat_gat_test.shape[0],napps))

print(feat_Xtr_ap.shape)
print(feat_Xte_ap.shape)

(74645, 19237)
(112071, 19237)


# ONE HOT ENCODING OF APP CATEGORY

In [None]:
d = feat_devic_lab.dropna(subset=['trainrow'])
feat_Xtr_lab = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), 
                      shape=(feat_gat_tr.shape[0],nlabels))
d = feat_devic_lab.dropna(subset=['testrow'])
feat_Xte_lab = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), 
                      shape=(feat_gat_test.shape[0],nlabels))

print(feat_Xtr_lab.shape)
print(feat_Xte_lab.shape)

(74645, 492)
(112071, 492)


In [None]:
#hstacking all the features
feat_Xtr = hstack((feat_Xtr_br, feat_Xtr_mod, feat_Xtr_ap, feat_Xtr_lab), format='csr')
feat_Xte =  hstack((feat_Xte_br, feat_Xte_mod, feat_Xte_ap, feat_Xte_lab), format='csr')
print('Train data shape:',feat_Xtr.shape)
print('Test data shape:',feat_Xte.shape)

Train data shape: (74645, 21527)
Test data shape: (112071, 21527)


In [None]:
#applying applying label encoding on target variable
feat_targt_encod = LabelEncoder().fit(feat_gat_tr.group)
feat_y = feat_targt_encod.transform(feat_gat_tr.group)
nclasses = len(feat_targt_encod.classes_)

In [None]:
import pickle
with open('classlabel','wb') as fp:
    pickle.dump(feat_y,fp)

In [None]:
#splitting data into train and validation
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

xtr, xcv, ytr, ycv = train_test_split(feat_Xtr, y,stratify=feat_y,test_size=0.15)
print(xtr.shape,ytr.shape)
print(xcv.shape,ycv.shape)

(63448, 21527) (63448,)
(11197, 21527) (11197,)


# LOGISTIC REGRESSION

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
feat_alph = [0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]
feat_cv_log_er = []
for i in feat_alph:
    print('for c = ',i)
    feat_SGD  = LogisticRegression(class_weight = 'balanced',penalty = 'l2',C = i)
    feat_clf =  feat_SGD.fit(xtr,ytr)
    feat_sgd_clib = CalibratedClassifierCV(feat_clf,method = 'sigmoid') 
    feat_sgd_clib.fit(xtr,ytr)
    feat_y_cv_pred = feat_sgd_clib.predict_proba(xcv)
    #feat_cv_log_er.append(log_loss(y_cv,feat_y_cv_pred))
    print('for c = ',i ,'the log loss is :',log_loss(ycv,feat_y_cv_pred))

for c =  1e-05
for c =  1e-05 the log loss is : 2.4240490121671416
for c =  0.0001
for c =  0.0001 the log loss is : 2.4239127140248673
for c =  0.001
for c =  0.001 the log loss is : 2.421898251247192
for c =  0.01
for c =  0.01 the log loss is : 2.4034478389221365
for c =  0.1
for c =  0.1 the log loss is : 2.408299979523952
for c =  1
for c =  1 the log loss is : 2.4181286899702306
for c =  10
for c =  10 the log loss is : 2.42395984079844
for c =  100
for c =  100 the log loss is : 2.4254481618915733
for c =  1000
for c =  1000 the log loss is : 2.425641919967717


WE CHOSE OUR BEST C TO BE 0.01

In [None]:
feat_clf = LogisticRegression(C=0.01, class_weight='balanced', multi_class='multinomial', solver='lbfgs')
feat_clf.fit(xtr, ytr)
feat_sig_clf = CalibratedClassifierCV(feat_clf, method="sigmoid")
feat_sig_clf.fit(xcv, ycv)

feat_pred_y = feat_sig_clf.predict_proba(xtr)
feat_loss=log_loss(ytr, feat_pred_y)
print("The train log loss for best C is:",feat_loss)
feat_pred_y = feat_sig_clf.predict_proba(xcv)
feat_loss=log_loss(ycv, feat_pred_y)
print("The validation log loss for best C is:",feat_loss)

The train log loss for best C is: 2.4145317866106
The validation log loss for best C is: 2.354060651561517


# MODELLING USING DEVICES WITHOUT EVENTS

In [None]:
feat_Xtr_whol = hstack((feat_Xtr_br, feat_Xtr_mod), format='csr')

feat_targt_encod = LabelEncoder().fit(feat_gat_tr.group)
feat_y = feat_targt_encod.transform(feat_gat_tr.group)

In [None]:
feat_gate_noeve['model']=feat_ph['model']
feat_gate_noeve['brand']=feat_ph['brand']

(76877, 3)

In [None]:
feat_gate_noeve_mod = csr_matrix((np.ones(feat_gate_noeve.shape[0]), 
                       (feat_gate_noeve.testrow, feat_gate_noeve.model)))

feat_gate_noeve_br= csr_matrix((np.ones(feat_gate_noeve.shape[0]), 
                       (feat_gate_noeve.testrow, feat_gate_noeve.brand)))

In [None]:
feat_Xte_no_eve=hstack((feat_gate_noeve_br, feat_gate_noeve_mod), format='csr')

In [None]:
feat_xte_no_eve_1=hstack((feat_gate_noeve_br, feat_gate_noeve_mod), format='csr')

In [None]:
print(feat_gate_noeve_mod.shape)

(76877, 1667)


In [None]:
import pickle
with open('model_onehot','rb') as fp:
  model_onehot = pickle.load(fp)

In [None]:
import pickle
with open('brand_onehot','rb') as fp:
  brand_onehot = pickle.load(fp)

In [None]:

feat_gate_noeve['model'] = feat_ph['model']
feat_gate_noeve['brand'] = feat_ph['brand']

feat_gate_noeve['model'] = str(feat_gate_noeve['model'])
feat_gate_noeve['brand'] = str(feat_gate_noeve['brand'])

feat_noeve_test_br_onehot = brand_onehot.transform(feat_gate_noeve['brand'])
feat_noeve_test_mod_onehot = model_onehot.transform(feat_gate_noeve['model'])
print(feat_noeve_test_br_onehot.shape)
print(feat_noeve_test_mod_onehot.shape)

(76877, 77)
(76877, 98)


In [None]:
feat_Xtr = hstack((feat_tr_mod_onehot,feat_tr_br_onehot)).tocsr()
feat_xte_no_eve_onehot = hstack((feat_noeve_test_mod_onehot,feat_noeve_test_br_onehot)).tocsr()
print(feat_Xtr.shape)
print(feat_Xte.shape)

(74645, 175)
(76877, 175)


In [None]:
print("xtrain shape:",feat_Xtr_wh.shape)
print("ytrain shape:",feat_y.shape)

print("xtest shape:",feat_Xte_no_eve.shape)

xtrain shape: (74645, 1798)
ytrain shape: (74645,)
xtest shape: (76877, 1798)


In [None]:
xtr, xcv, ytr, ycv = train_test_split(feat_Xtr_wh, y,stratify=feat_y,test_size=0.15,random_state=18)

In [None]:
xtr, xcv, ytr, ycv = train_test_split(feat_xtr, y,stratify=feat_y,test_size=0.15,random_state=18)

In [None]:
print(xtr.shape,ytr.shape)
print(xcv.shape,ycv.shape)

(63448, 175) (63448,)
(11197, 175) (11197,)


# LOGISTIC REGRESSION

In [None]:
feat_alph = [0.001,0.01,0.02,0.1,0.15,1,10]


for i in feat_alph:
    feat_clf = LogisticRegression(C=i, class_weight='balanced', multi_class='multinomial',solver='lbfgs')
    feat_clf.fit(xtr, ytr)
    #Using Model Calibration
    feat_sig_clf = CalibratedClassifierCV(feat_clf, method="sigmoid")
    feat_sig_clf.fit(xtr, ytr)
    feat_pred_y = feat_sig_clf.predict_proba(xcv)
    print('For values of C = ', i, "The validation log loss is:",log_loss(ycv, feat_pred_y))

For values of C =  0.001 The validation log loss is: 2.4030020103020036
For values of C =  0.01 The validation log loss is: 2.39641152579951
For values of C =  0.02 The validation log loss is: 2.3940695248817785
For values of C =  0.1 The validation log loss is: 2.3896730084069544
For values of C =  0.15 The validation log loss is: 2.3891201368497037
For values of C =  1 The validation log loss is: 2.39105667945638
For values of C =  10 The validation log loss is: 2.398847882073424


WE CHOSE OUR BEST C TO BE 0.15

In [None]:
feat_clf = LogisticRegression(C=0.15, class_weight='balanced', multi_class='multinomial', solver='lbfgs')
feat_clf.fit(xtr, ytr)
feat_sig_clf = CalibratedClassifierCV(feat_clf, method="sigmoid")
feat_sig_clf.fit(xtr, ytr)

feat_pred_y = feat_sig_clf.predict_proba(xtr)
feat_loss=log_loss(ytr, feat_pred_y)
print("The train log loss for best C is:",feat_loss)
feat_pred_y = feat_sig_clf.predict_proba(xcv)
feat_loss=log_loss(ycv, feat_pred_y)
print("The validation log loss for best C is:",feat_loss)

The train log loss for best C is: 2.362802878894095
The validation log loss for best C is: 2.3891201368497037


In [None]:
#predicting for test data
feat_noeve_pred_lr=feat_sig_clf.predict_proba(feat_Xte_no_eve)

In [None]:
#saving the model
from sklearn.externals import joblib as jobl
from joblib import dump
np.save('lr_noevents',feat_noeve_pred_lr)

# OBSERVATIONS:


FOR LOGISTIC REGRESSION MODEL TRAIN LOGLOSS IS 2.3628 AND VALIDATION LOSS IS 2.3891

# NEURAL NETWORKS

In [None]:
#https://www.kaggle.com/c/talkingdata-mobile-user-demographics/discussion/23424
def feat_noeve_nn_mod_1(input_shape):
    model = Sequential()
    model.add(Dense(256, input_dim=input_shape))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(12))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
    return model

In [None]:
feat_modsum=feat_noeve_nn_mod_1(xtr.shape[1])
feat_modsum.summary()

W0418 13:02:32.574199 32244 deprecation_wrapper.py:119] From c:\users\navee\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0418 13:02:32.589181 32244 deprecation_wrapper.py:119] From c:\users\navee\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0418 13:02:32.591156 32244 deprecation_wrapper.py:119] From c:\users\navee\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0418 13:02:32.648003 32244 deprecation_wrapper.py:119] From c:\users\navee\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is d

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               45056     
_________________________________________________________________
p_re_lu_1 (PReLU)            (None, 256)               256       
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
p_re_lu_2 (PReLU)            (None, 64)                64        
_________________________________________________________________
batch_normalization_2 (Batch (None, 64)                256       
__________

In [None]:
feat_early_stop=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

In [None]:
def feat_noeve_avg_nn_1(state):
    """
    Takes a list of Random Seeds, splits the data into Train and CV based on Seed, trains model and takes average of 
    predictions while testing  
    """
    feat_mod_list=[]
    feat_loss_list=[]
    feat_avg_cv_los=0
    for i in range(len(state)):
        xtr, xcv, ytr, ycv = train_test_split(feat_Xtr_whol, y,stratify=y,test_size=0.15,random_state=state[i])
        ytr=np_utils.to_categorical(ytr)
        ycv=np_utils.to_categorical(ycv)
        model=feat_noeve_nn_mod_1(xtr.shape[1])
        model.fit(xtr, ytr, batch_size=256, epochs=20, verbose=1, validation_data=(xcv, ycv),callbacks=[early_stop])
        model.save('saved_models/no_events/nn '+str(i+1))
        feat_pred=model.predict_proba(xcv)
        feat_cv_los=log_loss(ycv, feat_pred)
        print("Validation Log Loss of  Model in Current Run: ",feat_cv_los)
        feat_mod_list.append(model)
        feat_loss_list.append(feat_cv_los)
    feat_avg_cv_los=mean(feat_loss_list)
    print("Average CV Loss of "+str(len(state))+" Runs :",feat_avg_cv_los)
    return(feat_mod_list)

In [None]:
random_seeds=[9,18,42,86,103]
model_list_1= feat_noeve_avg_nn_1(random_seeds)

W0408 12:30:10.290519 11128 deprecation.py:323] From c:\users\navee\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Validation Log Loss of  Model in Current Run:  2.3904335856160874
Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Validation Log Loss of  Model in Current Run:  2.387496891607927
Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Validation Log Loss of  Model in Current Run:  2.385876258398855
Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Validation Log Loss of  Model in Current Run:  2.3858903664887627
Train on 63448 samples

Epoch 11/20
Validation Log Loss of  Model in Current Run:  2.3887528186292855
Average CV Loss of 5 Runs : 2.3876899841481833


In [None]:
feat_avg_pred=np.zeros((xtr.shape[0],12))
for i in range(len(model_list_1)):
    feat_tr_pred=model_list_1[i].predict_proba(xtr)
    feat_avg_pred+=feat_tr_pred
feat_avg_pred/=len(model_list_1)
print("Train Average Log-Loss: ",log_loss(ytr, feat_avg_pred))

Train Average Log-Loss:  2.3528577585587853


In [None]:
feat_avg_pred=np.zeros((xcv.shape[0],12))
for i in range(len(model_list_1)):
    feat_cv_pred=model_list_1[i].predict_proba(xcv)
    feat_avg_pred+=feat_cv_pred
feat_avg_pred/=len(model_list_1)
print("Validation Average Log-Loss: ",log_loss(ycv, feat_avg_pred))

Validation Average Log-Loss:  2.3577544682106013


In [None]:
feat_avg_pred=np.zeros((xtest_noevents.shape[0],12))
for i in range(len(model_list_1)):
    feat_te_pred=model_list_1[i].predict_proba(feat_Xte_no_eve)
    feat_avg_pred+=feat_te_pred
feat_avg_pred/=len(model_list_1)

In [None]:
#saving the model
np.save('nn1_noevents_1',feat_avg_pred)

In [None]:
def feat_noeve_avg_nn_1(state):
    """
    Takes a list of Random Seeds, splits the data into Train and CV based on Seed, trains model and takes average of 
    predictions while testing  
    """
    feat_mod_list=[]
    feat_mod_list=[]
    feat_avg_cv_los=0
    for i in range(len(state)):
        xtr, xcv, ytr, ycv = train_test_split(xtrain, y,stratify=y,test_size=0.15,random_state=state[i])
        ytr=np_utils.to_categorical(ytr)
        ycv=np_utils.to_categorical(ycv)
        model=noevents_nn_model1(xtr.shape[1])
        model.fit(xtr, ytr, batch_size=256, epochs=20, verbose=1, validation_data=(xcv, ycv),callbacks=[early_stop])
        model.save('saved_models/no_events/nn_onehot '+str(i+1))
        pred=model.predict_proba(xcv)
        feat_cv_los=log_loss(ycv, pred)
        print("Validation Log Loss of  Model in Current Run: ",feat_cv_los)
        feat_mod_list.append(model)
        feat_mod_list.append(feat_cv_los)
    feat_avg_cv_los=mean(feat_mod_list)
    print("Average CV Loss of "+str(len(state))+" Runs :",feat_avg_cv_los)
    return(feat_mod_list)

In [None]:
xtr.shape

(63448, 175)

In [None]:
random_seeds=[9,18,42,86,103]
model_list_2= feat_noeve_avg_nn_1(random_seeds)

Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Validation Log Loss of  Model in Current Run:  31.074719427568184
Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Validation Log Loss of  Model in Current Run:  30.568837554814444
Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Validation Log Loss of  Model in Current Run:  26.150270795052865
Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Validation Log Loss of  Model in Current Run:  27.09232210426058
Train on 63448 samples, validate on 11197 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch

In [None]:
feat_avg_pred=np.zeros((xtr.shape[0],12))
for i in range(len(model_list_2)):
    feat_tr_pred=model_list_2[i].predict_proba(xtr)
    feat_avg_pred+=feat_tr_pred
feat_avg_pred/=len(model_list_2)
print("Train Average Log-Loss: ",log_loss(ytr, feat_avg_pred))

Train Average Log-Loss:  14.917648857027595


In [None]:
feat_avg_pred=np.zeros((xcv.shape[0],12))
for i in range(len(model_list_2)):
    feat_cv_pred=model_list_2[i].predict_proba(xcv)
    feat_avg_pred+=feat_cv_pred
feat_avg_pred/=len(model_list_2)
print("Validation Average Log-Loss: ",log_loss(ycv, feat_avg_pred))

Validation Average Log-Loss:  14.920084094153626


In [None]:
feat_xte_no_eve_onehot.shape

(76877, 175)

In [None]:
feat_avg_pred=np.zeros((xtest_noevents_onehot.shape[0],12))
for i in range(len(model_list_2)):
    feat_te_pred=model_list_2[i].predict_proba(feat_xte_no_eve_onehot)
    feat_avg_pred+=feat_te_pred
feat_avg_pred/=len(model_list_2)

OBSERVATIONS:
USING NEURAL NETWORK WE GOT TRAIN LOSS OF 2.3528 AND TEST LOSS OF 2.3577

# MODEL 2

In [None]:
#https://www.kaggle.com/c/talkingdata-mobile-user-demographics/discussion/23424
def feat_noeve_nn_mod_2(input_dim,output_dim, learRate=0.0025):
    
    model = Sequential()
    model.add(Dense(500, input_shape=(input_dim,), init='uniform'))
    model.add(PReLU(init='zero'))
    model.add(Dropout(0.82))
    model.add(Dense(output_dim, init='uniform'))
    model.add(Activation('softmax'))
    opt = Adagrad(lr=learRate, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model

In [None]:
model_sum=feat_noeve_nn_mod_2(xtr.shape[1],12)
model_sum.summary()

W0408 12:35:07.644370 11128 nn_ops.py:4224] Large dropout rate: 0.82 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 500)               899500    
_________________________________________________________________
p_re_lu_13 (PReLU)           (None, 500)               500       
_________________________________________________________________
dropout_13 (Dropout)         (None, 500)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 12)                6012      
_________________________________________________________________
activation_7 (Activation)    (None, 12)                0         
Total params: 906,012
Trainable params: 906,012
Non-trainable params: 0
_________________________________________________________________


In [None]:
early_stop=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

In [None]:
def feat_noeve_avg_nn_2(state):
    """
    Takes a list of Random Seeds, splits the data into Train and CV based on Seed, trains model and takes average of 
    predictions while testing  
    """
    feat_mod_list=[]
    feat_los_list=[]
    feat_avg_cv_los=0
    for i in range(len(state)):
        xtr, xcv, ytr, ycv = train_test_split(Xtrain_whole, y,stratify=y,test_size=0.15,random_state=state[i])
        ytr=np_utils.to_categorical(ytr)
        ycv=np_utils.to_categorical(ycv)
        model=noevents_nn_model2(xtr.shape[1],12)
        #logdir = os.path.join("logs","noevents_nn1."+str(i+1))
        #t_callback=TensorBoard(log_dir=logdir)
        model.fit(xtr, ytr, batch_size=256, epochs=30, verbose=1, validation_data=(xcv, ycv),callbacks=[early_stop])
        feat_pred=model.predict_proba(xcv)
        feat_cv_los=log_loss(ycv, feat_pred)
        print("Validation Log Loss of  Model in Current Run: ",feat_cv_los)
        feat_mod_list.append(model)
        feat_los_list.append(feat_cv_los)
    feat_avg_cv_los=mean(feat_los_list)
    print("Average CV Loss of "+str(len(state))+" Runs :",feat_avg_cv_los)
    return(feat_mod_list)

In [None]:
random_seeds=[9,18,42,86,103]
model_list_2= noevents_average_nn_2(random_seeds)

W0408 12:35:07.760055 11128 nn_ops.py:4224] Large dropout rate: 0.82 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Train on 63448 samples, validate on 11197 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


W0408 12:37:17.340040 11128 nn_ops.py:4224] Large dropout rate: 0.82 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Validation Log Loss of  Model in Current Run:  2.393906689524874
Train on 63448 samples, validate on 11197 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


W0408 12:39:25.408527 11128 nn_ops.py:4224] Large dropout rate: 0.82 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Validation Log Loss of  Model in Current Run:  2.390445421803921
Train on 63448 samples, validate on 11197 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


W0408 12:41:34.231742 11128 nn_ops.py:4224] Large dropout rate: 0.82 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Validation Log Loss of  Model in Current Run:  2.3882187361524307
Train on 63448 samples, validate on 11197 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Validation Log Loss of  Model in Current Run:  2.389341872515844
Train on 63448 samples, validate on 11197 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30


Epoch 29/30
Epoch 30/30
Validation Log Loss of  Model in Current Run:  2.391342581548126
Average CV Loss of 5 Runs : 2.390651060309039


In [None]:
feat_avg_pred=np.zeros((xtr.shape[0],12))
for i in range(len(model_list_2)):
    feat_tr_pred=model_list_2[i].predict_proba(xtr)
    feat_avg_pred+=feat_tr_pred
feat_avg_pred/=len(model_list_2)
print("Train Average Log-Loss: ",log_loss(ytr, feat_avg_pred))

Train Average Log-Loss:  2.3770776429186817


In [None]:
feat_avg_pred=np.zeros((xcv.shape[0],12))
for i in range(len(model_list_2)):
    feat_cv_pred=model_list_2[i].predict_proba(xcv)
    feat_avg_pred+=feat_cv_pred
feat_avg_pred/=len(model_list_2)
print("Validation Average Log-Loss: ",log_loss(ycv, feat_avg_pred))

Validation Average Log-Loss:  2.3788470152956647


In [None]:
feat_avg_pred=np.zeros((xtest_noevents.shape[0],12))
for i in range(len(model_list_2)):
    test_pred=model_list_2[i].predict_proba(feat_Xte_no_eve)
    feat_avg_pred+=test_pred
feat_avg_pred/=len(model_list_2)

In [None]:
#saving the model
np.save('nn2_noevents_1',feat_avg_pred)

OBSERVATIONS:
1.THE TRAIN AND VALIDATION LOSS FOR THE MODEL ARE 2.377 AND 2.378 RESPEECTIVELY.

# XGBOOST

In [None]:
#https://www.kaggle.com/c/talkingdata-mobile-user-demographics/discussion/23424
feat_xgb = XGBClassifier(n_estimators=350, n_jobs=-1,learning_rate=0.05, colsample_bytree=0.7, max_depth=5,subsample=0.7,objective='multi:softprob',num_class=12,eval_metric='mlogloss')
feat_xgb.fit(xtr, ytr)
#Using Model Calibration
feat_clf = CalibratedClassifierCV(feat_xgb, method="sigmoid")
feat_clf.fit(xtr, ytr)

feat_pred_y=feat_clf.predict_proba(xtr)
print("Train Log Loss :",log_loss(ytr, feat_pred_y))


feat_pred_y=feat_clf.predict_proba(xcv)
print("Validation Log Loss :",log_loss(ycv, feat_pred_y))

Train Log Loss : 2.3718148085004658
Validation Log Loss : 2.3929110310146204


In [None]:
feat_no_eve_pred_lr=feat_clf.predict_proba(feat_Xte_no_eve)

In [None]:
#saving the model
np.save('xgb_noevents_1.npy',feat_no_eve_pred_lr)

OBSERVATIONS:
THE TRAIN AND VALIDATION LOSS ARE 2.3718 AND 2.3929 RESPECTIVELY.
THESE ARE NOT AS GOOD AS THE NEURAL NETWORK MODEL.

# MODELLING USING DEVICES WITH EVENTS

# ONE HOT ENCODING OF PHONE BRAND

In [None]:
feat_gatr_eve['brand']=feat_ph['brand']
feat_gate_eve['brand']=feat_ph['brand']


#https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
feat_Xtr_eve_br = csr_matrix((np.ones(feat_gatr_eve.shape[0]), # Number of Rows/Devices
                       (feat_gatr_eve.trainrow, feat_gatr_eve.brand)),shape=(feat_gatr_eve.shape[0],nbrand))
feat_Xte_eve_br = csr_matrix((np.ones(feat_gate_eve.shape[0]), # Number of Rows/Devices
                       (feat_gate_eve.testrow, feat_gate_eve.brand)),shape=(feat_gate_eve.shape[0],nbrand))

print("Train Brand One-hot Shape: ",feat_Xtr_eve_br.shape)
print("Test Brand One-hot Shape: ",feat_Xtr_eve_br.shape)

Train Brand One-hot Shape:  (23309, 131)
Test Brand One-hot Shape:  (35194, 131)


#  ONE HOT ENCODING OF PHONE MODEL

In [None]:
feat_gatr_eve['model']=feat_ph['model']
feat_gate_eve['model']=feat_ph['model']

feat_Xtr_eve_mod = csr_matrix((np.ones(feat_gatr_eve.shape[0]), 
                       (feat_gatr_eve.trainrow, feat_gatr_eve.model)),shape=(feat_gatr_eve.shape[0],nmodel))

feat_Xte_eve_mod = csr_matrix((np.ones(feat_gate_eve.shape[0]), 
                       (feat_gate_eve.testrow, feat_gate_eve.model)),shape=(feat_gate_eve.shape[0],nmodel))
print("Train Brand One-hot Shape: ",feat_Xtr_eve_mod.shape)
print("Test Brand One-hot Shape: ",feat_Xte_eve_mod.shape)

Train Brand One-hot Shape:  (23309, 1667)
Test Brand One-hot Shape:  (35194, 1667)


# ONE HOT ENCODING OF DEVICE APPS

In [None]:
#Since the Deviceapps has both train and test columns merged to create Train Apps One-Hot we will Drop all Nan of Train Row
#Once we remove Nan in Train Rows we will get the Apps in Train Data and we create CSR Matrix for those rows
d = feat_devic_aps.dropna(subset=['trainrow'])
feat_Xtr_eve_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)), 
                      shape=(feat_gatr_eve.shape[0],napps))

#Since the Deviceapps has both train and test columns merged to create Test Apps One-Hot we will Drop all Nan of Test Row
#Once we remove Nan in Test Rows we will get the Apps in Test Data and we create CSR Matrix for those rows
d = feat_devic_aps.dropna(subset=['testrow'])
feat_Xte_eve_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), 
                      shape=(feat_gate_eve.shape[0],napps))
print("Train Event Apps One-hot Shape: ",feat_Xtr_eve_app.shape)
print("Test Event Apps One-hot Shape: ",feat_Xte_eve_app.shape)

Train Event Apps One-hot Shape:  (23309, 19237)
Test Event Apps One-hot Shape:  (35194, 19237)


# ONE HOT ENCODING OF DEVICE LABELS

In [None]:
#Since the Devicelabels has both train and test columns merged to create Train Labels One-Hot we will Drop all Nan of Train Row
#Once we remove Nan in Train Rows we will get the Labels in Train Data and we create CSR Matrix for those rows
d = feat_devic_lab.dropna(subset=['trainrow'])
feat_Xtr_eve_lab = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), 
                      shape=(feat_gatr_eve.shape[0],nlabels))
#Since the Devicelabels has both train and test columns merged to create Test Labels One-Hot we will Drop all Nan of Test Row
#Once we remove Nan in Test Rows we will get the Labels in Test Data and we create CSR Matrix for those rows
d = feat_devic_lab.dropna(subset=['testrow'])
feat_Xte_eve_lab = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), 
                      shape=(feat_gate_eve.shape[0],nlabels))
print("Train Event Labels One-hot Shape: ",feat_Xtr_eve_lab.shape)
print("Test Event Labels One-hot Shape: ",feat_Xte_eve_lab.shape)

Train Event Labels One-hot Shape:  (23309, 492)
Test Event Labels One-hot Shape:  (35194, 492)


# TFIDF  FEATURES FOR HOURS

In [None]:
feat_gatr_eve["hourjoin"]=feat_gatr_eve.index.map(hourjoin)
feat_gate_eve["hourjoin"]=feat_gate_eve.index.map(hourjoin)

feat_vectorzr=TfidfVectorizer()
feat_vectorzr.fit(feat_gatr_eve['hourjoin'].values)

feat_Xtr_horjoin_tfidf = feat_vectorzr.transform(feat_gatr_eve['hourjoin'].values)
feat_Xte_horjoin_tfidf = feat_vectorzr.transform(feat_gate_eve['hourjoin'].values)

print("Train Event Hours TF-IDF Shape: ",feat_Xtr_horjoin_tfidf.shape)
print("Test Event Hours TF-IDF Shape: ",feat_Xtr_horjoin_tfidf.shape)

Train Event Hours TF-IDF Shape:  (23309, 24)
Test Event Hours TF-IDF Shape:  (35194, 24)


In [None]:
import pickle
with open('hour_tfidf','wb') as fp:
    pickle.dump(feat_vectorzr,fp)

#  BOW FOR HOURS

In [None]:
feat_gatr_eve["hourjoin"]=feat_gatr_eve.index.map(hourjoin)
feat_gate_eve["hourjoin"]=feat_gate_eve.index.map(hourjoin)

feat_vectorzr=CountVectorizer()
feat_vectorzr.fit(feat_gatr_eve['hourjoin'].values)

feat_X_tr_hrjoin_onehot = feat_vectorzr.transform(feat_gatr_eve['hourjoin'].values)
feat_X_te_hrjoin_onehot = feat_vectorzr.transform(feat_gate_eve['hourjoin'].values)
print("After vectorizations")
print("Train Event Hours One-hot Shape: ",feat_X_tr_hrjoin_onehot.shape)
print("Test Event Hours One-hot Shape: ",feat_X_te_hrjoin_onehot.shape)

After vectorizations
Train Event Hours One-hot Shape:  (23309, 24)
Test Event Hours One-hot Shape:  (35194, 24)


In [None]:
import pickle
with open('hour_bow','wb') as fp:
    pickle.dump(feat_vectorzr,fp)

# ONE HOT ENCODING OF HOUR BIN

In [None]:
feat_gatr_eve["hourbinjoin"]=feat_gatr_eve.index.map(hourbinjoin)
feat_gate_eve["hourbinjoin"]=feat_gate_eve.index.map(hourbinjoin)

feat_vectorzr=CountVectorizer(binary=True)
feat_vectorzr.fit(feat_gatr_eve['hourbinjoin'].values)

feat_X_tr_hrbinjoin_onehot = feat_vectorzr.transform(feat_gatr_eve['hourbinjoin'].values)
feat_X_te_hrbinjoin_onehot = feat_vectorzr.transform(feat_gate_eve['hourbinjoin'].values)

print("Train Event Hours One-hot Shape: ",feat_X_tr_hrbinjoin_onehot.shape)
print("Test Event Hours One-hot Shape: ",feat_X_te_hrbinjoin_onehot.shape)

Train Event Hours One-hot Shape:  (23309, 4)
Test Event Hours One-hot Shape:  (35194, 4)


In [None]:
import pickle
with open('hour_bin_bow','wb') as fp:
    pickle.dump(feat_vectorzr,fp)

# TFIDF FEATURES FOR DAY

In [None]:
feat_gatr_eve["daysjoin"]=feat_gatr_eve.index.map(daysjoin)
feat_gate_eve["daysjoin"]=feat_gate_eve.index.map(daysjoin)

feat_vectorzr=TfidfVectorizer()
feat_vectorzr.fit(feat_gatr_eve['daysjoin'].values)

feat_X_tr_daysjoin_tfidf = feat_vectorzr.transform(feat_gatr_eve['daysjoin'].values)
feat_X_te_daysjoin_tfidf = feat_vectorzr.transform(feat_gate_eve['daysjoin'].values)
print("After vectorizations")
print("Train Event days TF-IDF Shape: ",feat_X_tr_daysjoin_tfidf.shape)
print("Test Event days TF-IDF Shape: ",feat_X_te_daysjoin_tfidf.shape)

After vectorizations
Train Event days TF-IDF Shape:  (23309, 7)
Test Event days TF-IDF Shape:  (35194, 7)


In [None]:
import pickle
with open('day_tfidf','wb') as fp:
    pickle.dump(feat_vectorzr,fp)

# STANDARDIZING LATITUDE AND LONGITUDE

In [None]:
feat_gatr_eve["latitude"]=feat_gatr_eve.index.map(median_lat)
feat_gate_eve["latitude"]=feat_gate_eve.index.map(median_lat)

feat_scalr=StandardScaler()
feat_scalr.fit(feat_gatr_eve['latitude'].values.reshape(-1,1))

feat_Xtr_eve_lat = feat_scalr.transform(feat_gatr_eve['latitude'].values.reshape(-1,1))
feat_Xte_eve_lat = feat_scalr.transform(feat_gate_eve['latitude'].values.reshape(-1,1))

print("Train Event Latitude Standardized Shape: ",feat_Xtr_eve_lat.shape)
print("Test Event Latitude Standardized  Shape: ",feat_Xte_eve_lat.shape)

Train Event Latitude Standardized Shape:  (23309, 1)
Test Event Latitude Standardized  Shape:  (35194, 1)


In [None]:
import pickle
with open('lat_scaler','wb') as fp:
    pickle.dump(feat_scalr,fp)

In [None]:
feat_gatr_eve["longitude"]=feat_gatr_eve.index.map(feat_med_lon)
feat_gate_eve["longitude"]=feat_gate_eve.index.map(feat_med_lon)

feat_scalr=StandardScaler()
feat_scalr.fit(feat_gatr_eve['longitude'].values.reshape(-1,1))

feat_Xtr_eve_lon = feat_scalr.transform(feat_gatr_eve['longitude'].values.reshape(-1,1))
feat_Xte_eve_lon = feat_scalr.transform(feat_gate_eve['longitude'].values.reshape(-1,1))

print("Train Event longitude Standardized Shape: ",feat_Xtr_eve_lon.shape)
print("Test Event longitude Standardized  Shape: ",feat_Xtr_eve_lon.shape)

Train Event longitude Standardized Shape:  (23309, 1)
Test Event longitude Standardized  Shape:  (35194, 1)


In [None]:
import pickle
with open('lon_scaler','wb') as fp:
    pickle.dump(feat_scalr,fp)

# ONE HOT ENCODING OF CLUSTERED FEATURES

In [None]:
feat_gatr_eve["locationbin"]=feat_gatr_eve.index.map(clustered_geo_features)
feat_gate_eve["locationbin"]=feat_gate_eve.index.map(clustered_geo_features)

#feat_gatr_eve.locationbin=feat_gatr_eve.locationbin.astype(str)
#feat_gate_eve.locationbin=feat_gate_eve.locationbin.astype(str)

feat_vectorzr= OneHotEncoder()
feat_vectorzr.fit(feat_gatr_eve['locationbin'].values.reshape(-1,1))

feat_X_tr_clus = feat_vectorzr.transform(feat_gatr_eve['locationbin'].values.reshape(-1,1))
feat_X_te_clus = feat_vectorzr.transform(feat_gate_eve['locationbin'].values.reshape(-1,1))

print("Train Event locationbin Shape: ",feat_X_tr_clus.shape)
print("Test Event locationbin Shape: ",feat_X_te_clus.shape)

Train Event locationbin Shape:  (23309, 10)
Test Event locationbin Shape:  (35194, 10)


In [None]:
import pickle
with open('clustered_features','wb') as fp:
    pickle.dump(feat_vectorzr,fp)

# TFIDF FEATURE FOR APP IS_ACTIVE

In [None]:
#Mapping The Values the values to train and test dataframes
feat_gatr_eve['apps_active']=feat_gatr_eve.index.map(active_apps_events)
feat_gate_eve['apps_active']=feat_gate_eve.index.map(active_apps_events)

feat_vectorzr=TfidfVectorizer()
feat_vectorzr.fit(feat_gatr_eve['apps_active'].values)

feat_X_tr_activ = feat_vectorzr.transform(feat_gatr_eve['apps_active'].values)
feat_X_te_activ = feat_vectorzr.transform(feat_gate_eve['apps_active'].values)

print("Train Apps Active TF-IDF Shape: ",feat_X_tr_activ.shape)
print("Test Apps Active TF-IDF Shape: ",feat_X_te_activ.shape)

Train Apps Active TF-IDF Shape:  (23309, 2)
Test Apps Active TF-IDF Shape:  (35194, 2)


In [None]:
import pickle
with open('isactive_tfidf','wb') as fp:
    pickle.dump(feat_vectorzr,fp)

In [None]:
#creating final data matrix
feat_Xtr_eve=hstack((feat_Xtr_eve_br,feat_Xtr_eve_mod,feat_Xtr_eve_lab,feat_Xtr_horjoin_tfidf,feat_X_tr_hrbinjoin_onehot,feat_X_tr_daysjoin_tfidf,feat_Xtr_eve_lat,feat_Xtr_eve_lon,feat_Xtr_eve_app,feat_X_tr_activ,feat_X_tr_clus),format='csr')

feat_Xte_eve =hstack((feat_Xte_eve_br,feat_Xte_eve_mod,feat_Xte_eve_lab,feat_Xte_horjoin_tfidf,feat_X_te_hrbinjoin_onehot,feat_X_te_daysjoin_tfidf,feat_Xte_eve_lat,feat_Xte_eve_lon,feat_Xte_eve_app,feat_X_te_activ,feat_X_te_clus),format='csr')

print(feat_Xtr_eve.shape)
print(feat_Xte_eve.shape)

(23309, 21576)
(35194, 21576)


In [None]:
#label encoding target variable
feat_targt_encod = LabelEncoder().fit(feat_gatr_eve.group)
y = feat_targt_encod.transform(feat_gatr_eve.group)

In [None]:
print("xtrain shape:",feat_Xtr_eve.shape)
print("ytrain shape:",feat_y.shape)

print("xtest shape:",feat_Xte_eve.shape)

xtrain shape: (23309, 21576)
ytrain shape: (23309,)
xtest shape: (35194, 21576)


In [None]:
xtr, xcv, ytr, ycv = train_test_split(feat_Xtr_eve, y,stratify=feat_y,test_size=0.2,random_state=9)

In [None]:
#one hot encoding target variable
ytr=np_utils.to_categorical(ytr)
ycv=np_utils.to_categorical(ycv)

# NEURAL NETWORK 1

In [None]:
def feat_eve_nn_mod1(input_dim,output_dim):
    model = Sequential()
    model.add(Dropout(0.15, input_shape=(input_dim,)))
    model.add(Dense(240, init='uniform'))
    model.add(PReLU(init='zero'))
    model.add(Dropout(0.8))
    model.add(Dense(240, init='uniform'))
    model.add(PReLU(init='zero', weights=None))
    model.add(Dropout(0.35))
    model.add(Dense(260, init='uniform'))
    model.add(PReLU(init='zero', weights=None))
    model.add(Dropout(0.40))
    model.add(Dense(output_dim, init='uniform'))
    model.add(Activation('softmax'))

    opt = Adagrad(lr=0.008, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model

In [None]:
model_sum=feat_eve_nn_mod1(xtr.shape[1],12)
model_sum.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_19 (Dropout)         (None, 21576)             0         
_________________________________________________________________
dense_31 (Dense)             (None, 240)               5178480   
_________________________________________________________________
p_re_lu_19 (PReLU)           (None, 240)               240       
_________________________________________________________________
dropout_20 (Dropout)         (None, 240)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 240)               57840     
_________________________________________________________________
p_re_lu_20 (PReLU)           (None, 240)               240       
_________________________________________________________________
dropout_21 (Dropout)         (None, 240)               0         
__________

In [None]:
early_stop=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)

In [None]:
def feat_eve_avg_nn_1(state):
    """
    Takes a list of Random Seeds, splits the data into Train and CV based on Seed, trains model and takes average of 
    predictions while testing  
    """
    feat_mod_list=[]
    feat_los_list=[]
    feat_avg_cv_los=0
    for i in range((state)):
        model=feat_eve_nn_mod1(xtr.shape[1],12)
        model.fit(xtr, ytr, batch_size=149, epochs=20, verbose=1, validation_data=(xcv, ycv),callbacks=[early_stop])
        model.save('saved_models/events/nn1'+str(i+1))
        eat_pred=model.predict_proba(xcv)
        feat_cv_los=log_loss(ycv, eat_pred)
        print("Validation Log Loss of  Model in Current Run: ",feat_cv_los)
        feat_mod_list.append(model)
        feat_los_list.append(feat_cv_los)
    feat_avg_cv_los=mean(feat_los_list)
    print("Average CV Loss of "+str((state))+" Runs :",feat_avg_cv_los)
    return(feat_mod_list)

In [None]:
model_list_2=feat_eve_avg_nn_1(10)

Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Validation Log Loss of  Model in Current Run:  1.9136059269573138
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Log Loss of  Model in Current Run:  1.9367125339164373
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20


Epoch 20/20
Validation Log Loss of  Model in Current Run:  1.9237789996264
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Log Loss of  Model in Current Run:  1.9296700733280108
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Log Loss of  Model in Current Run:  1.9203065615975894
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
E

Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Log Loss of  Model in Current Run:  1.9258740875150422
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Log Loss of  Model in Current Run:  1.9209416287566417
Average CV Loss of 10 Runs : 1.9234806484970717


In [None]:
feat_avg_pred=np.zeros((xtr.shape[0],12))
for i in range(len(model_list_2)):
    feat_tr_pred=model_list_2[i].predict_proba(xtr)
    feat_avg_pred+=feat_tr_pred
feat_avg_pred/=len(model_list_2)
print("Train Average Log-Loss: ",log_loss(ytr, feat_avg_pred))

Train Average Log-Loss:  1.5406584936512548


In [None]:
feat_avg_pred=np.zeros((xcv.shape[0],12))
for i in range(len(model_list_2)):
    feat_cv_pred=model_list_2[i].predict_proba(xcv)
    feat_avg_pred+=feat_cv_pred
feat_avg_pred/=len(model_list_2)
print("Validation Average Log-Loss: ",log_loss(ycv, feat_avg_pred))

Validation Average Log-Loss:  1.9074406784271256


In [None]:
feat_avg_pred=np.zeros((X_test_events.shape[0],12))
for i in range(len(model_list_2)):
    feat_te_pred=model_list_2[i].predict_proba(X_test_events)
    feat_avg_pred+=feat_te_pred
feat_avg_pred/=len(model_list_2)

In [None]:
np.save('nn1_events_1',feat_avg_pred)

OBSERVATIONS:
1.THE TRAIN AND VALIDATION LOSSES ARE 1.5406 AND 1.9074 RESPECTIVELY.
2. BOTH TRAIN AND VALIDATIONN LOSSES DECREASED AS WE ADDED MORE FEATURES.


# NEURAL NETWORK 2

In [None]:
def feat_eve_nn_mod_2(input_dim,output_dim):
    model = Sequential()
    model.add(Dropout(0.4, input_shape=(input_dim,)))
    model.add(Dense(75))
    model.add(PReLU())
    model.add(Dropout(0.30))
    model.add(Dense(50, init='normal', activation='tanh'))
    model.add(PReLU())
    model.add(Dropout(0.20))
    model.add(Dense(output_dim, init='normal', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adadelta', metrics=['accuracy'])
    return model

In [None]:
model_sum=feat_eve_nn_mod_2(xtr.shape[1],12)
model_sum.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_63 (Dropout)         (None, 21576)             0         
_________________________________________________________________
dense_75 (Dense)             (None, 75)                1618275   
_________________________________________________________________
p_re_lu_52 (PReLU)           (None, 75)                75        
_________________________________________________________________
dropout_64 (Dropout)         (None, 75)                0         
_________________________________________________________________
dense_76 (Dense)             (None, 50)                3800      
_________________________________________________________________
p_re_lu_53 (PReLU)           (None, 50)                50        
_________________________________________________________________
dropout_65 (Dropout)         (None, 50)                0         
__________

In [None]:
def feat_eve_avg_nn_2(state):

    model_list=[]
    loss_list=[]
    feat_avg_cv_los=0
    for i in range((state)):
        model=events_nn_model2(xtr.shape[1],12)
        model.fit(xtr, ytr, batch_size=149, epochs=20, verbose=1, validation_data=(xcv, ycv),callbacks=[early_stop])
        model.save('saved_models/events/nn2'+str(i+1))
        feat_pred=model.predict_proba(xcv)
        feat_cv_los=log_loss(ycv, feat_pred)
        print("Validation Log Loss of  Model in Current Run: ",feat_cv_los)
        feat_mod_list.append(model)
        feat_los_list.append(feat_cv_los)
    feat_avg_cv_los=mean(feat_los_list)
    print("Average CV Loss of "+str((state))+" Runs :",feat_avg_cv_los)
    return(feat_mod_list)

In [None]:
model_list_2=feat_eve_avg_nn_2(10)

Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Validation Log Loss of  Model in Current Run:  1.9168982927262757
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Validation Log Loss of  Model in Current Run:  1.9146637250839105
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Validation Log Loss of  Model in Current Run:  1.9148988088457843
Train on 18647 samples

Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Validation Log Loss of  Model in Current Run:  1.9115930246656696
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation Log Loss of  Model in Current Run:  1.9148608428341785
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Validation Log Loss of  Model in Current Run:  1.918339006581988
Train on 18647 samples, validate on 4662 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20

In [None]:
feat_avg_pred=np.zeros((xtr.shape[0],12))
for i in range(len(model_list_2)):
    feat_tr_pred=model_list_2[i].predict_proba(xtr)
    feat_avg_pred+=feat_tr_pred
feat_avg_pred/=len(model_list_2)
print("Train Average Log-Loss: ",log_loss(ytr, feat_avg_pred))

Train Average Log-Loss:  1.7068032853037864


In [None]:
feat_avg_pred=np.zeros((xcv.shape[0],12))
for i in range(len(model_list_2)):
    feat_cv_pred=model_list_2[i].predict_proba(xcv)
    feat_avg_pred+=feat_cv_pred
feat_avg_pred/=len(model_list_2)
print("Validation Average Log-Loss: ",log_loss(ycv, feat_avg_pred))

Validation Average Log-Loss:  1.9012935504651483


In [None]:
feat_avg_pred=np.zeros((X_test_events.shape[0],12))
for i in range(len(model_list_2)):
    feat_te_pred=model_list_2[i].predict_proba(feat_Xte_eve)
    feat_avg_pred+=feat_te_pred
feat_avg_pred/=len(model_list_2)

In [None]:
np.save('nn2_events_1',feat_avg_pred)

OBSERVATIONS:
1. THE TRAIN AND VALIDATION LOSSES ARE 1.7068 AND 1.9012 RESPECTIVELY.

# XGBOOST

In [None]:
ytr.shape

(18647, 12)

In [None]:
xtr, xcv, ytr, ycv = train_test_split(feat_Xtr_eve, y,stratify=feat_y,test_size=0.2,random_state=9)

In [None]:
ytr.shape

(18647,)

In [None]:
feat_xgb = XGBClassifier(n_estimators=350, n_jobs=-1,learning_rate=0.05, colsample_bytree=0.7, max_depth=5,subsample=0.7,objective='multi:softprob',num_class=12,eval_metric='mlogloss')
feat_xgb.fit(xtr, ytr)
#Using Model Calibration
feat_clf = CalibratedClassifierCV(feat_xgb, method="sigmoid")
feat_clf.fit(xtr, ytr)

feat_pred_y=feat_clf.predict_proba(xtr)
print("Train Log Loss :",log_loss(ytr, feat_pred_y))


feat_pred_y=feat_clf.predict_proba(xcv)
print("Validation Log Loss :",log_loss(ycv, feat_pred_y))

Train Log Loss : 1.2839666002195145
Validation Log Loss : 2.057339870807861


In [None]:
feat_eve_pred_xgb=clf.predict_proba(feat_Xte_eve)

In [None]:
np.save('xgb_events_1.npy',feat_eve_pred_xgb)

OBSERVATIONS:
 THE TRAIN AND  VALIDATION LOSSES ARE 1.2839 AND 2.0573 RESPECTIVELY.

# LOGISTIC REGRESSION

In [None]:
# Train a Logistic regression+Calibration model using text features whicha re on-hot encoded
feat_alph = [0.001,0.01,0.02,0.1,0.15,1,10]


for i in feat_alph:
    feat_clf = LogisticRegression(C=i, class_weight='balanced', multi_class='multinomial',solver='lbfgs')
    feat_clf.fit(xtr, ytr)
    #Using Model Calibration
    feat_sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    feat_sig_clf.fit(xtr, ytr)
    predict_y = feat_sig_clf.predict_proba(xcv)
    print('For values of C = ', i, "The validation log loss is:",log_loss(ycv, feat_pred_y))

For values of C =  0.001 The validation log loss is: 2.0982561817893224
For values of C =  0.01 The validation log loss is: 2.019844834880824
For values of C =  0.02 The validation log loss is: 2.0160115578199274
For values of C =  0.1 The validation log loss is: 2.043084585773688
For values of C =  0.15 The validation log loss is: 2.055436921152977
For values of C =  1 The validation log loss is: 2.1074612177933965
For values of C =  10 The validation log loss is: 2.13818998200252


WE CHOSE OUR BEST C TO BE 0.02

In [None]:
feat_clf = LogisticRegression(C=0.02, class_weight='balanced', multi_class='multinomial', solver='lbfgs')
feat_clf.fit(xtr, ytr)
feat_sig_clf = CalibratedClassifierCV(feat_clf, method="sigmoid")
feat_sig_clf.fit(xtr, ytr)

feat_pred_y = feat_sig_clf.predict_proba(xtr)
feat_loss=log_loss(ytr, feat_pred_y)
print("The train log loss for best C is:",feat_loss)
feat_pred_y = feat_sig_clf.predict_proba(xcv)
feat_loss=log_loss(ycv, feat_pred_y)
print("The validation log loss for best C is:",feat_loss)

The train log loss for best C is: 1.840631737548809
The validation log loss for best C is: 2.0160115578199274


In [None]:
feat_eve_pred_lr=clf.predict_proba(feat_Xte_eve)

In [None]:
#saving the model
np.save('lr_events_1.npy',feat_eve_pred_lr)

OBSERVATIONS:
  WE GOT TRAIN AND VALIDATION LOSS AS 1.84 AND 2.0160 RESPECTIVELY.

# MODEL ENSEMBLING

# MACHINE LEARNING MODELS

WE USE LOGISTIC REGRESSION  AND XGBOOST WITH EVENTS AND WITHOUTS DATA AND WE CONCATENATE THE RESULTS.

In [None]:
feat_lr1=np.load("lr_noevents.npy")
feat_lr2=np.load("lr_events_1.npy")

feat_xgb1=np.load("xgb_noevents_1.npy")
feat_xgb2=np.load("xgb_events_1.npy")

In [None]:
w1=0.5
w2=0.5
w3=0.3
w4=0.5

feat_test1=(w1*lr1)+(w2*xgb1)

feat_test2=(w3*lr2)+(w4*xgb2)

In [None]:
feat_gat_tr=pd.read_csv('gender_age_train.csv',index_col = 'device_id')
feat_targt_encod = LabelEncoder().fit(feat_gat_tr.group)
feat_y = feat_targt_encod.transform(feat_gat_tr.group)
nclasses = len(feat_targt_encod.classes_)

In [None]:
feat_pred1 = pd.DataFrame(feat_test1, index = feat_gate_noeve.index, columns=feat_targt_encod.classes_)
feat_pred2 = pd.DataFrame(feat_test2, index = feat_gate_noeve.index, columns=feat_targt_encod.classes_)
feat_fin_pred=pd.concat([feat_pred1,feat_pred2], axis=0)
feat_fin_pred.shape

(112071, 12)

In [None]:
feat_fin_pred.to_csv('ml_final.csv',index=True)

#  ENSEMBLING NEURAL NETS

In [None]:

feat_noeve_nn_1=np.load("nn1_noevents_1.npy")
feat_noeve_nn_2=np.load("nn2_noevents_1.npy")

feat_eve_nn_1=np.load("nn1_events_1.npy")
feat_eve_nn_2=np.load("nn2_events_1.npy")

WE ARE TAKING ONLY NEURAL NETWORK 1 FOR DEVICES WITHOUT EVENTS AND FOR DEVICES WITH EVENTS WE ARE TAKING AVERAGE OF BOTH NETWORKS.

In [None]:
w1=0.5
w2=0.5

feat_test1=(1*feat_noeve_nn_1)

feat_test1=(0.5*feat_eve_nn_1)+(0.5*feat_eve_nn_2)

In [None]:
feat_gat_tr=pd.read_csv('gender_age_train.csv',index_col = 'device_id')

In [None]:
feat_targt_encod = LabelEncoder().fit(feat_gat_tr.group)
feat_y = feat_targt_encod.transform(feat_gat_tr.group)
nclasses = len(feat_targt_encod.classes_)

In [None]:
feat_pred1 = pd.DataFrame(test1, index = gatest_noevents.index, columns=targetencoder.classes_)
feat_pred2 = pd.DataFrame(test2, index = gatest_events.index, columns=targetencoder.classes_)
feat_fin_pred1=pd.concat([feat_pred1,feat_pred2], axis=0)
feat_fin_pred1.shape

(112071, 12)

In [None]:
feat_fin_pred1.to_csv('dl_sub_1.csv',index=True)

# RESULT

In [None]:
from prettytable import PrettyTable

feat_res = PrettyTable()
feat_res.field_names = ["Model", "Data", "TRAIN LOSS"," Validation loss"]
feat_res.add_row(["Logistic Regression", "without events",  2.3628,2.3891])
feat_res.add_row(["XGboost", "without events",  2.3718,2.3929])
feat_res.add_row(["Avg Neural Network-1", "without events",  2.3528,2.3577])
feat_res.add_row(["Avg Neural Network-2", "without events",  2.3770,2.3788])

feat_res.add_row(["Logistic Regression", "WITH events",  1.8406,2.0160])
feat_res.add_row(["XGboost", "WITH events", 1.2839,2.0573])
feat_res.add_row(["Avg Neural Network-1", "WITH events",  1.5406,1.9074])
feat_res.add_row(["Avg Neural Network-2", "WITH events",  1.7068,1.9012])
feat_res.add_row(['LOGISTIC REGRESSION','FULL DATA',2.4145,2.3540])

print(feat_res)

+----------------------+----------------+------------+------------------+
|        Model         |      Data      | TRAIN LOSS |  Validation loss |
+----------------------+----------------+------------+------------------+
| Logistic Regression  | without events |   2.3628   |      2.3891      |
|       XGboost        | without events |   2.3718   |      2.3929      |
| Avg Neural Network-1 | without events |   2.3528   |      2.3577      |
| Avg Neural Network-2 | without events |   2.377    |      2.3788      |
| Logistic Regression  |  WITH events   |   1.8406   |      2.016       |
|       XGboost        |  WITH events   |   1.2839   |      2.0573      |
| Avg Neural Network-1 |  WITH events   |   1.5406   |      1.9074      |
| Avg Neural Network-2 |  WITH events   |   1.7068   |      1.9012      |
| LOGISTIC REGRESSION  |   FULL DATA    |   2.4145   |      2.354       |
+----------------------+----------------+------------+------------------+


# REFERENCES

1.https://www.kaggle.com/dvasyukova/a-linear-model-on-apps-and-labels
2.https://www.kaggle.com/c/talkingdata-mobile-user-demographics/discussion/23424