In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, log_loss, f1_score

In [2]:
with open('features/emoji_.pkl', 'rb') as f:
    emoji_feature = pickle.load(f)
emoji_feature.dropna(inplace = True)
emoji_feature.reset_index(inplace = True)
emoji_feature.drop(emoji_feature.columns[0], axis=1,inplace = True)
emoji_feature.drop(labels = emoji_feature[emoji_feature.ethnicity.eq('other')].index,inplace =True)

emoji_feature["ethnicity"].replace({"asian":"asian/hispanic","hispanic":"asian/hispanic"}, inplace=True)

In [3]:
emoji_feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,gender,ethnicity
0,0.004226,-0.002817,0.011229,0.018623,-0.015355,-0.001441,0.016362,-0.017704,0.018424,0.023368,...,-0.021683,-0.011142,-0.015675,-0.006240,-0.001424,-0.011179,-0.003141,-0.002532,female,black
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,female,black
2,0.005541,0.000912,0.007761,0.012003,-0.010353,-0.002257,0.010546,-0.011081,0.011848,0.014820,...,-0.014255,-0.006583,-0.009162,-0.003765,-0.000930,-0.009514,-0.000386,-0.001888,female,black
3,0.003081,0.001683,0.004433,0.006107,-0.004185,-0.002462,0.006122,-0.007030,0.004502,0.008522,...,-0.007279,-0.003286,-0.002924,-0.001866,-0.001460,-0.004097,-0.000618,0.001767,male,white
4,0.003516,0.004004,0.003460,0.006687,-0.005137,-0.000591,0.007233,-0.005730,0.005390,0.009294,...,-0.007139,-0.003960,-0.004223,-0.000970,-0.001362,-0.004952,-0.000343,0.001728,female,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18723,0.000206,0.000191,0.000088,0.000268,-0.000282,-0.000044,0.000288,-0.000296,0.000313,0.000373,...,-0.000332,-0.000216,-0.000070,-0.000089,0.000038,-0.000133,-0.000098,0.000075,female,asian/hispanic
18724,0.001287,0.000611,0.001316,0.002660,-0.002421,-0.000274,0.002620,-0.002359,0.003515,0.003583,...,-0.003185,-0.001810,-0.002549,-0.000327,-0.000470,-0.002148,0.000353,-0.000268,female,black
18725,0.001664,0.001339,0.000907,0.003272,-0.002534,-0.000137,0.003157,-0.002857,0.002045,0.003899,...,-0.003888,-0.003029,-0.001336,-0.000477,0.000641,-0.002275,-0.000490,0.001062,male,asian/hispanic
18726,0.001888,0.001292,0.000712,0.002527,-0.002434,0.000339,0.002136,-0.001983,0.001951,0.003216,...,-0.002468,-0.001618,-0.000914,-0.000323,0.000486,-0.001355,-0.000173,0.001139,female,white


In [4]:
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(emoji_feature.drop(columns = ['gender','ethnicity']), emoji_feature['gender'], test_size=0.2, random_state=43)

In [5]:
svm_linear = LinearSVC(tol=1e-3,dual = False,max_iter=10000,penalty='l1').fit(X_train_gender, y_train_gender) 
report_SVMlinear = classification_report(svm_linear.predict(X_test_gender),y_test_gender,digits = 3)
print(report_SVMlinear)

              precision    recall  f1-score   support

      female      0.539     0.739     0.624      1378
        male      0.806     0.632     0.708      2360

    accuracy                          0.671      3738
   macro avg      0.672     0.685     0.666      3738
weighted avg      0.707     0.671     0.677      3738



In [6]:
rf = RandomForestClassifier(criterion='entropy', max_depth=70, n_estimators=500).fit(X_train_gender, y_train_gender)
report_rf = classification_report(rf.predict(X_test_gender),y_test_gender,digits = 3)
print(report_rf)

              precision    recall  f1-score   support

      female      0.705     0.725     0.715      1835
        male      0.728     0.708     0.718      1903

    accuracy                          0.716      3738
   macro avg      0.717     0.717     0.716      3738
weighted avg      0.717     0.716     0.716      3738



In [7]:
GBC = GradientBoostingClassifier(n_estimators=300,max_depth = 7).fit(X_train_gender, y_train_gender)
report_GBC = classification_report(GBC.predict(X_test_gender),y_test_gender,digits = 3)
print(report_GBC)

              precision    recall  f1-score   support

      female      0.715     0.735     0.725      1835
        male      0.737     0.717     0.727      1903

    accuracy                          0.726      3738
   macro avg      0.726     0.726     0.726      3738
weighted avg      0.726     0.726     0.726      3738



In [8]:
X_train_eth, X_test_eth, y_train_eth, y_test_eth = train_test_split(emoji_feature.drop(columns = ['gender','ethnicity']), emoji_feature['ethnicity'], test_size=0.2, random_state=43)

In [9]:
svm_linear = LinearSVC(class_weight='balanced', dual=False, max_iter=15000, penalty='l1',tol=1e-2).fit(X_train_eth, y_train_eth)
report_SVMlinear = classification_report(svm_linear.predict(X_test_eth),y_test_eth,digits = 3,zero_division = 0)
print(report_SVMlinear)

                precision    recall  f1-score   support

asian/hispanic      0.077     0.324     0.124        71
         black      0.602     0.827     0.697      1124
         white      0.900     0.671     0.769      2543

      accuracy                          0.711      3738
     macro avg      0.526     0.607     0.530      3738
  weighted avg      0.795     0.711     0.735      3738



In [10]:
rf = RandomForestClassifier(class_weight='balanced', criterion='entropy',max_depth=90, n_estimators=500).fit(X_train_eth, y_train_eth)
report_rf = classification_report(rf.predict(X_test_eth),y_test_eth,digits = 3,zero_division = 0)
print(report_rf)

                precision    recall  f1-score   support

asian/hispanic      0.237     0.357     0.285       199
         black      0.715     0.794     0.752      1389
         white      0.832     0.733     0.780      2150

      accuracy                          0.736      3738
     macro avg      0.595     0.628     0.606      3738
  weighted avg      0.757     0.736     0.743      3738



In [11]:
GBC = GradientBoostingClassifier(n_estimators=300,max_depth = 7).fit(X_train_eth, y_train_eth)
report_GBC = classification_report(GBC.predict(X_test_eth),y_test_eth,digits = 3)
print(report_GBC)

                precision    recall  f1-score   support

asian/hispanic      0.293     0.746     0.421       118
         black      0.743     0.810     0.775      1414
         white      0.874     0.751     0.808      2206

      accuracy                          0.773      3738
     macro avg      0.637     0.769     0.668      3738
  weighted avg      0.806     0.773     0.783      3738

