# for the model on embeddings

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, log_loss, f1_score

In [2]:
with open('features/both_.pkl', 'rb') as f:
    both_feature = pickle.load(f)
both_feature.dropna(inplace = True)
both_feature.reset_index(inplace = True)
both_feature.drop(both_feature.columns[0], axis=1,inplace = True)
both_feature.drop(labels = both_feature[both_feature.ethnicity.eq('other')].index,inplace =True)

both_feature["ethnicity"].replace({"asian":"asian/hispanic","hispanic":"asian/hispanic"}, inplace=True)

In [3]:
both_feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,gender,ethnicity
0,0.013014,0.009834,0.032839,0.068315,-0.044134,0.005859,0.030566,-0.059018,0.036496,0.053272,...,-0.066769,-0.017998,-0.029152,-0.030382,-0.008150,-0.041132,-0.010102,-0.002104,female,black
1,0.008530,0.028461,0.005318,0.067734,0.055008,-0.019989,0.035583,0.017593,0.042480,0.036743,...,-0.105774,-0.055145,-0.059845,-0.024338,0.026859,-0.021318,-0.035750,0.001282,female,black
2,0.008398,0.013814,0.026279,0.059204,-0.035989,-0.000249,0.020220,-0.052055,0.024714,0.036616,...,-0.055981,-0.005376,-0.019954,-0.021236,-0.003274,-0.037816,-0.010013,0.001785,female,black
3,-0.005295,0.035029,0.029599,0.045612,-0.039439,-0.005130,0.001551,-0.074083,0.006280,0.015165,...,-0.040974,0.026522,-0.008702,-0.027052,0.006684,-0.037407,-0.026165,0.019598,male,white
4,0.003955,0.025402,0.031767,0.076054,-0.050051,0.003772,0.016501,-0.075485,0.021640,0.035710,...,-0.061038,0.008354,-0.023504,-0.034003,0.001051,-0.040405,-0.017716,0.009700,female,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18723,-0.026155,0.025003,0.053989,0.060878,-0.019046,0.011664,-0.004236,-0.051848,-0.015985,0.040313,...,-0.033202,0.036435,-0.016155,-0.021178,-0.019340,-0.034092,-0.039857,0.073136,female,asian/hispanic
18724,0.014630,0.018524,0.029416,0.095410,-0.050798,0.008691,0.021943,-0.074146,0.041119,0.063127,...,-0.082996,0.005862,-0.040526,-0.034719,-0.010274,-0.040530,0.001933,-0.005288,female,black
18725,-0.012798,0.026301,0.013004,0.019854,-0.027733,-0.003602,-0.003783,-0.050443,-0.002319,-0.000319,...,-0.012225,0.024374,-0.000841,-0.022976,0.001680,-0.018816,-0.024448,0.020594,male,asian/hispanic
18726,-0.031794,0.042449,0.076959,0.065255,-0.023424,0.039190,-0.009896,-0.074889,-0.002837,0.062427,...,-0.022544,0.052662,0.006829,-0.021299,0.001269,-0.058646,-0.046015,0.090401,female,white


In [4]:
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(both_feature.drop(columns = ['gender','ethnicity']), both_feature['gender'], test_size=0.2, random_state=43)

In [5]:
X_gender = pd.concat([X_train_gender,X_test_gender],ignore_index = True)
y_gender = pd.concat([y_train_gender,y_test_gender],ignore_index = True)

In [7]:
parameters = {'max_depth':[70,80,90,100],'n_estimators':[100,500],'criterion':['gini', 'entropy']}

rf = GridSearchCV(RandomForestClassifier(),parameters)
rf.fit(X_gender, y_gender)

rf.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=70, n_estimators=500)

In [8]:
parameters = {'penalty': ['l1','l2']}
                   
svm_Linear = GridSearchCV(LinearSVC(dual = False,tol=1e-3,max_iter = 10000),parameters)
svm_Linear.fit(X_gender, y_gender)
svm_Linear.best_estimator_

LinearSVC(dual=False, max_iter=10000, penalty='l1', tol=0.01)

In [8]:
parameters = {'max_depth':[3,5,6] }
                   
GBC = GridSearchCV(GradientBoostingClassifier(n_estimators = 100),parameters)
GBC.fit(X_gender, y_gender)
GBC.best_estimator_

GradientBoostingClassifier(max_depth=6)

In [9]:
X_train_eth, X_test_eth, y_train_eth, y_test_eth = train_test_split(both_feature.drop(columns = ['gender','ethnicity']), both_feature['ethnicity'], test_size=0.2, random_state=43)

In [10]:
X_eth = pd.concat([X_train_eth,X_test_eth],ignore_index = True)
y_eth = pd.concat([y_train_eth,y_test_eth],ignore_index = True)

In [13]:
parameters = {'max_depth':[70,80,90,100],'n_estimators':[100,500],'criterion':['gini', 'entropy']}

rf = GridSearchCV(RandomForestClassifier(class_weight = 'balanced'),parameters)
rf.fit(X_eth, y_eth)
rf.best_estimator_

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=90, n_estimators=500)

In [14]:
parameters = {'penalty': ['l1','l2']}
                   
svmLinear = GridSearchCV(LinearSVC(dual = False,class_weight = 'balanced',tol=1e-2,max_iter = 15000),parameters)
svmLinear.fit(X_eth, y_eth)
svmLinear.best_estimator_

LinearSVC(class_weight='balanced', dual=False, max_iter=15000, penalty='l1',
          tol=0.01)

In [12]:
parameters = {'max_depth':[3,5,7] }
                   
GBC = GridSearchCV(GradientBoostingClassifier(),parameters)
GBC.fit(X_eth, y_eth)
GBC.best_estimator_

GradientBoostingClassifier(max_depth=7)

# for BOE

In [13]:
parent_df_path = 'usage_per_user/'

with open(parent_df_path+'joh_df.pkl', 'rb') as f:
    joh_per_usage = pickle.load(f)
    
with open(parent_df_path+'lon_df.pkl', 'rb') as f:
    lon_per_usage = pickle.load(f)
    
with open(parent_df_path+'nyc_df.pkl', 'rb') as f:
    nyc_per_usage = pickle.load(f)
    
with open(parent_df_path+'ran_df.pkl', 'rb') as f:
    ran_per_usage = pickle.load(f)


all_usage = pd.concat([joh_per_usage,lon_per_usage,nyc_per_usage,ran_per_usage],ignore_index = True)

In [14]:
all_usage.drop(labels = all_usage[all_usage.ethnicity.eq('other')].index,inplace =True)
all_usage.reset_index(inplace = True,drop = True)

all_usage["ethnicity"].replace({"asian":"asian/hispanic","hispanic":"asian/hispanic"}, inplace=True)
all_usage

Unnamed: 0,#️⃣,*️⃣,0️⃣,1️⃣,2️⃣,3️⃣,4️⃣,5️⃣,6️⃣,7️⃣,...,🧺,🧻,🧼,🧽,🧾,🧿,tweets_contain_emoji,total_tweets,gender,ethnicity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,62.0,111.0,female,black
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,female,black
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,566.0,945.0,female,black
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,251.0,444.0,male,white
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,410.0,887.0,female,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,117.0,3121.0,female,asian/hispanic
18685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,551.0,3072.0,female,black
18686,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1055.0,2482.0,male,asian/hispanic
18687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,905.0,2752.0,female,white


In [15]:
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(all_usage.drop(columns = ['gender','ethnicity']), all_usage['gender'], test_size=0.2, random_state=43)

In [16]:
X_gender = pd.concat([X_train_gender,X_test_gender],ignore_index = True)
y_gender = pd.concat([y_train_gender,y_test_gender],ignore_index = True)

In [14]:
parameters = {'max_depth':[70,80,90,100],'criterion':['gini', 'entropy']}

rf = GridSearchCV(RandomForestClassifier(n_estimators = 500),parameters)
rf.fit(X_gender, y_gender)

rf.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=100, n_estimators=500)

In [15]:
parameters = {'penalty': ['l1','l2']}
                   
svm_Linear = GridSearchCV(LinearSVC(dual = False,tol=1e-2,max_iter = 10000),parameters)
svm_Linear.fit(X_gender, y_gender)
svm_Linear.best_estimator_

LinearSVC(dual=False, max_iter=10000, tol=0.01)

In [18]:
parameters = {'max_depth':[5,6,7] }

GBC = GridSearchCV(GradientBoostingClassifier(loss = 'deviance',criterion='friedman_mse',max_features = None),parameters)
GBC.fit(X_gender, y_gender)
GBC.best_estimator_

GradientBoostingClassifier(max_depth=7)

In [19]:
X_train_eth, X_test_eth, y_train_eth, y_test_eth = train_test_split(all_usage.drop(columns = ['gender','ethnicity']), all_usage['ethnicity'], test_size=0.2, random_state=43)

In [20]:
X_eth = pd.concat([X_train_eth,X_test_eth],ignore_index = True)
y_eth = pd.concat([y_train_eth,y_test_eth],ignore_index = True)

In [20]:
parameters = {'max_depth':[70,80,90,100],'n_estimators':[100,500],'criterion':['gini', 'entropy']}

rf = GridSearchCV(RandomForestClassifier(n_estimators=500,class_weight = 'balanced'),parameters)
rf.fit(X_eth, y_eth)
rf.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=70, n_estimators=500)

In [None]:
parameters = {'penalty': ['l1','l2']}
                   
svmLinear = GridSearchCV(LinearSVC(dual = False,class_weight = 'balanced',max_iter = 30000),parameters)
svmLinear.fit(X_eth, y_eth)
svmLinear.best_estimator_

In [21]:
parameters = {'max_depth':[3,5,7] }
                   
GBC = GridSearchCV(GradientBoostingClassifier(loss = 'deviance',criterion='friedman_mse',max_features = None),parameters)
GBC.fit(X_eth, y_eth)
GBC.best_estimator_

GradientBoostingClassifier(max_depth=7)