In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, log_loss, f1_score


with open('features/text_.pkl', 'rb') as f:
    text_feature = pickle.load(f)
text_feature.reset_index(inplace = True,drop = True)

In [2]:
text_feature.drop(labels = text_feature[text_feature.ethnicity.eq('other')].index,inplace =True)
text_feature.reset_index(inplace = True,drop = True)

text_feature["ethnicity"].replace({"asian":"asian/hispanic","hispanic":"asian/hispanic"}, inplace=True)
text_feature

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,292,293,294,295,296,297,298,299,gender,ethnicity
0,0.008789,0.012651,0.021610,0.049692,-0.028779,0.007299,0.014204,-0.041314,0.018072,0.029904,...,-0.045086,-0.006856,-0.013477,-0.024141,-0.006726,-0.029954,-0.006961,0.000429,female,black
1,0.008530,0.028461,0.005318,0.067734,0.055008,-0.019989,0.035583,0.017593,0.042480,0.036743,...,-0.105774,-0.055145,-0.059845,-0.024338,0.026859,-0.021318,-0.035750,0.001282,female,black
2,0.002857,0.012902,0.018518,0.047201,-0.025636,0.002009,0.009673,-0.040974,0.012866,0.021796,...,-0.041725,0.001207,-0.010792,-0.017471,-0.002344,-0.028302,-0.009627,0.003673,female,black
3,-0.008376,0.033346,0.025166,0.039505,-0.035254,-0.002669,-0.004571,-0.067053,0.001779,0.006642,...,-0.033694,0.029808,-0.005777,-0.025186,0.008144,-0.033310,-0.025547,0.017832,male,white
4,0.000440,0.021398,0.028307,0.069367,-0.044914,0.004363,0.009267,-0.069754,0.016249,0.026416,...,-0.053900,0.012315,-0.019281,-0.033033,0.002413,-0.035452,-0.017373,0.007971,female,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18684,-0.026361,0.024812,0.053901,0.060609,-0.018764,0.011707,-0.004524,-0.051552,-0.016298,0.039940,...,-0.032870,0.036651,-0.016086,-0.021088,-0.019377,-0.033959,-0.039759,0.073062,female,asian/hispanic
18685,0.013343,0.017913,0.028100,0.092750,-0.048378,0.008966,0.019323,-0.071787,0.037604,0.059544,...,-0.079811,0.007672,-0.037977,-0.034392,-0.009804,-0.038382,0.001580,-0.005020,female,black
18686,-0.014462,0.024962,0.012097,0.016582,-0.025199,-0.003464,-0.006939,-0.047585,-0.004364,-0.004218,...,-0.008337,0.027403,0.000495,-0.022498,0.001039,-0.016541,-0.023958,0.019532,male,asian/hispanic
18687,-0.033682,0.041157,0.076247,0.062728,-0.020990,0.038851,-0.012032,-0.072906,-0.004788,0.059211,...,-0.020076,0.054280,0.007743,-0.020976,0.000783,-0.057290,-0.045841,0.089262,female,white


In [3]:
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(text_feature.drop(columns = ['gender','ethnicity']), text_feature['gender'], test_size=0.2, random_state=43)

In [4]:
svm_linear = LinearSVC(tol=1e-3,dual = False,max_iter=10000,penalty='l1').fit(X_train_gender, y_train_gender) 
report_SVMlinear = classification_report(svm_linear.predict(X_test_gender),y_test_gender,digits = 3)
print(report_SVMlinear)

              precision    recall  f1-score   support

      female      0.814     0.800     0.807      1920
        male      0.793     0.807     0.800      1818

    accuracy                          0.803      3738
   macro avg      0.803     0.803     0.803      3738
weighted avg      0.804     0.803     0.803      3738



In [5]:
rf = RandomForestClassifier(criterion='entropy', max_depth=70, n_estimators=500).fit(X_train_gender, y_train_gender)
report_rf = classification_report(rf.predict(X_test_gender),y_test_gender,digits = 3)
print(report_rf)

              precision    recall  f1-score   support

      female      0.778     0.783     0.781      1876
        male      0.780     0.776     0.778      1862

    accuracy                          0.779      3738
   macro avg      0.779     0.779     0.779      3738
weighted avg      0.779     0.779     0.779      3738



In [6]:
GBC = GradientBoostingClassifier(n_estimators=300,max_depth = 7).fit(X_train_gender, y_train_gender)
report_GBC = classification_report(GBC.predict(X_test_gender),y_test_gender,digits = 3)
print(report_GBC)

              precision    recall  f1-score   support

      female      0.802     0.802     0.802      1887
        male      0.798     0.798     0.798      1851

    accuracy                          0.800      3738
   macro avg      0.800     0.800     0.800      3738
weighted avg      0.800     0.800     0.800      3738



# Ethnicity prediction

In [7]:
X_train_eth, X_test_eth, y_train_eth, y_test_eth = train_test_split(text_feature.drop(columns = ['gender','ethnicity']), text_feature['ethnicity'], test_size=0.2, random_state=43)

In [8]:
svm_linear = LinearSVC(class_weight='balanced', dual=False, max_iter=15000, penalty='l1',tol=1e-2).fit(X_train_eth, y_train_eth) 
report_SVMlinear = classification_report(svm_linear.predict(X_test_eth),y_test_eth,digits = 3,zero_division = 0)
print(report_SVMlinear)

                precision    recall  f1-score   support

asian/hispanic      0.607     0.511     0.555       356
         black      0.858     0.877     0.868      1509
         white      0.854     0.864     0.859      1873

      accuracy                          0.836      3738
     macro avg      0.773     0.751     0.761      3738
  weighted avg      0.832     0.836     0.834      3738



In [9]:
rf = RandomForestClassifier(class_weight='balanced', criterion='entropy',max_depth=90, n_estimators=500).fit(X_train_eth, y_train_eth)
report_rf = classification_report(rf.predict(X_test_eth),y_test_eth,digits = 3,zero_division = 0)
print(report_rf)

                precision    recall  f1-score   support

asian/hispanic      0.457     0.806     0.583       170
         black      0.801     0.871     0.835      1419
         white      0.909     0.802     0.852      2149

      accuracy                          0.828      3738
     macro avg      0.722     0.826     0.757      3738
  weighted avg      0.848     0.828     0.833      3738



In [10]:
GBC = GradientBoostingClassifier(n_estimators=300,max_depth = 7).fit(X_train_eth, y_train_eth)
report_GBC = classification_report(GBC.predict(X_test_eth),y_test_eth,digits = 3)
print(report_GBC)

                precision    recall  f1-score   support

asian/hispanic      0.473     0.816     0.599       174
         black      0.843     0.889     0.865      1462
         white      0.915     0.825     0.868      2102

      accuracy                          0.850      3738
     macro avg      0.744     0.843     0.777      3738
  weighted avg      0.866     0.850     0.854      3738

