# GENERAL FEATURES FOR KERAS MODELS

In [None]:
import os
import pickle
import sys
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse, io
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [1]:
# Mount Google Drive into the Colab runtime; the CSV paths used below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## LOADING DATA

In [None]:
# Folder on the mounted Drive that holds every CSV of the dataset.
DATA_DIR = '/content/drive/MyDrive/talkingdata-mobile-user-demographics'

# Train / test device tables (one row per device_id).
feat_gat_tr = pd.read_csv(os.path.join(DATA_DIR, 'gender_age_train.csv'))
feat_gat_test = pd.read_csv(os.path.join(DATA_DIR, 'gender_age_test.csv'))

# Phone brand / model per device. Duplicate device_ids are dropped (first row kept)
# so that joining on device_id cannot multiply rows.
feat_ph = pd.read_csv(os.path.join(DATA_DIR, 'phone_brand_device_model.csv'))
feat_ph = feat_ph.drop_duplicates('device_id', keep='first')

# Event log; the timestamp column is parsed into datetimes while reading.
# NOTE(review): `infer_datetime_format` is deprecated in pandas 2.x - drop it when upgrading.
feat_eve = pd.read_csv(
    os.path.join(DATA_DIR, 'events.csv'),
    parse_dates=['timestamp'],
    infer_datetime_format=True,
)

# Apps seen per event; the two flag columns are read directly as booleans.
feat_ap_eve = pd.read_csv(
    os.path.join(DATA_DIR, 'app_events.csv'),
    dtype={'is_installed': bool, 'is_active': bool},
)

# App -> label mapping and the label category lookup table.
feat_app_lab = pd.read_csv(os.path.join(DATA_DIR, 'app_labels.csv'))
feat_lab_cat = pd.read_csv(os.path.join(DATA_DIR, 'label_categories.csv'))

## CREATE HAS_EVENTS FEATURES

In [None]:
# Distinct device ids that occur in the events table.
feat_s = feat_eve.device_id.unique()
# Create the has_events feature for train and test: 1 when the device id occurs
# in the events table, 0 otherwise. `isin` is a vectorised membership test and
# avoids a linear scan of the id array for every row.
feat_gat_tr['has_events'] = feat_gat_tr.device_id.isin(feat_s).astype(int)
feat_gat_test['has_events'] = feat_gat_test.device_id.isin(feat_s).astype(int)

In [None]:
# Tag every device with its row position (trainrow / testrow), then attach the
# phone brand / model columns through a default (inner) merge on device_id.
feat_gat_tr = (
    feat_gat_tr
    .assign(trainrow=np.arange(len(feat_gat_tr)))
    .merge(feat_ph, on='device_id')
)
feat_gat_test = (
    feat_gat_test
    .assign(testrow=np.arange(len(feat_gat_test)))
    .merge(feat_ph, on='device_id')
)

## BOW OF BRAND

In [None]:
# Fit a single encoder on train + test brands so both matrices share one column space.
feat_br_encoder = LabelEncoder()
feat_br_encoder.fit(np.append(feat_gat_tr.phone_brand.values, feat_gat_test.phone_brand.values))
# Persist the fitted encoder; the context manager guarantees the file is closed.
with open('brandencoder.sav', 'wb') as encoder_file:
    pickle.dump(feat_br_encoder, encoder_file)
n_brands = len(feat_br_encoder.classes_)

# Convert brand to integer labels. This overwrites the original brand strings,
# so the cell cannot be re-run without reloading the data first.
feat_gat_tr['phone_brand'] = feat_br_encoder.transform(feat_gat_tr['phone_brand'])
feat_gat_test['phone_brand'] = feat_br_encoder.transform(feat_gat_test['phone_brand'])
# Column indices of the sparse matrices.
feat_row = feat_gat_tr['phone_brand'].astype(int)
feat_row2 = feat_gat_test['phone_brand'].astype(int)

# One-hot sparse matrices of brand: entry (row id, brand label) = 1.
# The shape is passed explicitly so train and test always have n_brands columns,
# even when the highest label only occurs in one of the two sets.
feat_Xtr_br = csr_matrix(
    (np.ones(feat_gat_tr.shape[0]), (feat_gat_tr.trainrow, feat_row)),
    shape=(feat_gat_tr.shape[0], n_brands),
)
feat_Xte_br = csr_matrix(
    (np.ones(feat_gat_test.shape[0]), (feat_gat_test.testrow, feat_row2)),
    shape=(feat_gat_test.shape[0], n_brands),
)
print('Brand features: train shape {}, test shape {}'.format(feat_Xtr_br.shape, feat_Xte_br.shape))

Brand features: train shape (74645, 131), test shape (112071, 131)


## BOW OF MODEL

In [None]:
# The model key is brand name + model name, built from the phone table where
# both columns are still the original strings.
m = feat_ph.phone_brand.str.cat(feat_ph.device_model)

feat_mod_encodr = LabelEncoder().fit(m)
# Persist the fitted encoder; the context manager guarantees the file is closed.
with open('modelencoder.sav', 'wb') as encoder_file:
    pickle.dump(feat_mod_encodr, encoder_file)
n_models = len(feat_mod_encodr.classes_)

# Convert model to labels. The labels are looked up per device_id from the phone
# table rather than rebuilt from feat_gat_tr['phone_brand']: that column is
# overwritten with integer brand labels earlier in the notebook, so `.str.cat`
# on it fails. device_id is unique in feat_ph (duplicates were dropped on load),
# so the lookup is unambiguous.
feat_model_by_device = pd.Series(feat_mod_encodr.transform(m), index=feat_ph.device_id.values)
feat_gat_tr['model'] = feat_gat_tr.device_id.map(feat_model_by_device)
feat_gat_test['model'] = feat_gat_test.device_id.map(feat_model_by_device)
# Column indices of the sparse matrices.
feat_row = feat_gat_tr['model'].astype(int)
feat_row2 = feat_gat_test['model'].astype(int)

# One-hot sparse matrices of model: entry (row id, model label) = 1.
# Explicit shape keeps train and test at n_models columns each.
feat_Xtr_mod = csr_matrix(
    (np.ones(feat_gat_tr.shape[0]), (feat_gat_tr.trainrow, feat_row)),
    shape=(feat_gat_tr.shape[0], n_models),
)
feat_Xte_mod = csr_matrix(
    (np.ones(feat_gat_test.shape[0]), (feat_gat_test.testrow, feat_row2)),
    shape=(feat_gat_test.shape[0], n_models),
)
print('Model features: train shape {}, test shape {}'.format(feat_Xtr_mod.shape, feat_Xte_mod.shape))


Model features: train shape (74645, 1667), test shape (112071, 1667)


## TRAIN AND TEST DATASETS OF BRAND + MODEL FEATURES (ALL / WITH-EVENTS / NO-EVENTS DEVICES)

In [None]:
# Index labels of the train devices with and without events.
tr_rows_with_events = feat_gat_tr.index[feat_gat_tr.has_events == 1]
tr_rows_without_events = feat_gat_tr.index[feat_gat_tr.has_events == 0]

# Brand and model blocks side by side: all devices, then each subset.
feat_Xtr_all_br_mod = hstack((feat_Xtr_br, feat_Xtr_mod))
feat_Xtr_eve_br_mod = hstack((
    feat_Xtr_br[tr_rows_with_events, :],
    feat_Xtr_mod[tr_rows_with_events, :],
))
feat_Xtr_noeve_br_mod = hstack((
    feat_Xtr_br[tr_rows_without_events, :],
    feat_Xtr_mod[tr_rows_without_events, :],
))

In [None]:
# Index labels of the test devices with and without events.
te_rows_with_events = feat_gat_test.index[feat_gat_test.has_events == 1]
te_rows_without_events = feat_gat_test.index[feat_gat_test.has_events == 0]

# Brand and model blocks side by side: all devices, then each subset.
feat_Xte_all_br_mod = hstack((feat_Xte_br, feat_Xte_mod))
feat_Xte_eve_br_mod = hstack((
    feat_Xte_br[te_rows_with_events, :],
    feat_Xte_mod[te_rows_with_events, :],
))
feat_Xte_noeve_br_mod = hstack((
    feat_Xte_br[te_rows_without_events, :],
    feat_Xte_mod[te_rows_without_events, :],
))

## BOW FOR APPS

In [None]:
# Index the events by event_id and both device tables by device_id.
feat_eve = feat_eve.set_index('event_id')
feat_gat_tr, feat_gat_test = (
    frame.set_index('device_id') for frame in (feat_gat_tr, feat_gat_test)
)

In [None]:
feat_ap_encodr = LabelEncoder().fit(feat_ap_eve.app_id)
# Convert app ids to integer labels.
feat_ap_eve['app'] = feat_ap_encodr.transform(feat_ap_eve.app_id)
# Persist the fitted encoder; the context manager guarantees the file is closed.
with open('appencoder.sav', 'wb') as encoder_file:
    pickle.dump(feat_ap_encodr, encoder_file)
napps = len(feat_ap_encodr.classes_)

# Attach the device id to every app event, count the events per (device, app)
# pair, then look up the train / test row associated with each device id.
# The left merges leave NaN in trainrow / testrow for devices absent from that set.
feat_devic_aps = (feat_ap_eve.merge(feat_eve[['device_id']], how='left', left_on='event_id', right_index=True)
                       .groupby(['device_id', 'app'])['app'].agg(['size'])
                       .merge(feat_gat_tr[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(feat_gat_test[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())

# Sparse bag-of-apps matrices: entry (row id, app label) = 1.
# The NaNs from the left merges make the row columns float, so after dropping
# the missing rows they are cast back to int before being used as indices.
d = feat_devic_aps.dropna(subset=['trainrow'])
feat_Xtr_ap = csr_matrix((np.ones(d.shape[0]), (d.trainrow.astype(int), d.app)),
                      shape=(feat_gat_tr.shape[0], napps))
d = feat_devic_aps.dropna(subset=['testrow'])
feat_Xte_ap = csr_matrix((np.ones(d.shape[0]), (d.testrow.astype(int), d.app)),
                      shape=(feat_gat_test.shape[0], napps))
print('Apps data: train shape {}, test shape {}'.format(feat_Xtr_ap.shape, feat_Xte_ap.shape))



Apps data: train shape (74645, 19237), test shape (112071, 19237)


## BOW FOR LABELS

In [None]:
# Keep only the labels of apps that occur in the app events. The explicit copy
# makes the column assignments below act on an independent frame instead of a
# slice of the original (avoids SettingWithCopyWarning).
feat_app_lab = feat_app_lab.loc[feat_app_lab.app_id.isin(feat_ap_eve.app_id.unique())].copy()
# Convert app ids to the integer labels of the app encoder.
feat_app_lab['app'] = feat_ap_encodr.transform(feat_app_lab.app_id)

feat_lab_encodr = LabelEncoder().fit(feat_app_lab.label_id)
# Persist the fitted encoder; the context manager guarantees the file is closed.
with open('labelcoder.sav', 'wb') as encoder_file:
    pickle.dump(feat_lab_encodr, encoder_file)
feat_app_lab['label'] = feat_lab_encodr.transform(feat_app_lab.label_id)
nlabels = len(feat_lab_encodr.classes_)

# Map every (device, app) pair to the labels of that app, count the rows per
# (device, label) pair, then look up the train / test row of each device id.
# The left merges leave NaN in trainrow / testrow for devices absent from that set.
feat_devic_lab = (feat_devic_aps[['device_id', 'app']]
                .merge(feat_app_lab[['app', 'label']])
                .groupby(['device_id', 'label'])['app'].agg(['size'])
                .merge(feat_gat_tr[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(feat_gat_test[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())

# Sparse bag-of-labels matrices: entry (row id, label) = 1.
# Row columns are float because of the NaNs above; cast to int after dropping them.
d = feat_devic_lab.dropna(subset=['trainrow'])
feat_Xtr_lab = csr_matrix((np.ones(d.shape[0]), (d.trainrow.astype(int), d.label)),
                      shape=(feat_gat_tr.shape[0], nlabels))
d = feat_devic_lab.dropna(subset=['testrow'])
feat_Xte_lab = csr_matrix((np.ones(d.shape[0]), (d.testrow.astype(int), d.label)),
                      shape=(feat_gat_test.shape[0], nlabels))
print('Labels data: train shape {}, test shape {}'.format(feat_Xtr_lab.shape, feat_Xte_lab.shape))

Labels data: train shape (74645, 492), test shape (112071, 492)


In [None]:
# Report the shapes of the train and test app matrices.
apps_shape_message = 'Apps data: train shape {}, test shape {}'.format(
    feat_Xtr_ap.shape,
    feat_Xte_ap.shape,
)
print(apps_shape_message)


Apps data: train shape (74645, 19237), test shape (112071, 19237)


## TRAIN AND TEST FOR ALL DATA

In [None]:
# Feature blocks in column order: brand, model, apps, labels.
train_blocks = (feat_Xtr_br, feat_Xtr_mod, feat_Xtr_ap, feat_Xtr_lab)
test_blocks = (feat_Xte_br, feat_Xte_mod, feat_Xte_ap, feat_Xte_lab)

# Concatenate the blocks column-wise and store the result in CSR format.
feat_Xtr_all = hstack(train_blocks, format='csr')
feat_Xte_all = hstack(test_blocks, format='csr')

In [None]:
# Display the summary repr of the stacked train matrix (shape, dtype, stored elements).
feat_Xtr_all

<74645x21527 sparse matrix of type '<class 'numpy.float64'>'
	with 2707712 stored elements in Compressed Sparse Row format>

In [None]:
# Move the index of both device tables back into a regular column, leaving a
# default integer index on each frame.
feat_gat_test, feat_gat_tr = (
    frame.reset_index() for frame in (feat_gat_test, feat_gat_tr)
)

In [None]:
# Index labels of the devices flagged with has_events == 1.
tr_event_rows = feat_gat_tr.index[feat_gat_tr.has_events == 1]
te_event_rows = feat_gat_test.index[feat_gat_test.has_events == 1]

# Rows of the full feature matrices that belong to those devices.
feat_Xtr_eve = feat_Xtr_all[tr_event_rows, :]
feat_Xte_eve = feat_Xte_all[te_event_rows, :]

In [None]:
# Display the index labels of the train devices that have events.
feat_gat_tr.index[feat_gat_tr.has_events==1]