In [1]:
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, log_loss, f1_score

In [2]:
with open('features/both_.pkl', 'rb') as f:
    both_feature = pickle.load(f)
both_feature.dropna(inplace = True)
both_feature.reset_index(inplace = True)
both_feature.drop(both_feature.columns[0], axis=1,inplace = True)
both_feature.drop(labels = both_feature[both_feature.ethnicity.eq('other')].index,inplace =True)

both_feature["ethnicity"].replace({"asian":"asian/hispanic","hispanic":"asian/hispanic"}, inplace=True)

In [3]:
X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(both_feature.drop(columns = ['gender','ethnicity']), both_feature['gender'], test_size=0.2, random_state=43)

In [4]:
svm_linear = LinearSVC(tol=1e-3,dual = False,max_iter=10000,penalty='l1').fit(X_train_gender, y_train_gender) 
report_SVMlinear = classification_report(svm_linear.predict(X_test_gender),y_test_gender,digits = 3)
print(report_SVMlinear)

              precision    recall  f1-score   support

      female      0.819     0.801     0.810      1930
        male      0.793     0.811     0.802      1808

    accuracy                          0.806      3738
   macro avg      0.806     0.806     0.806      3738
weighted avg      0.806     0.806     0.806      3738



In [5]:
rf = RandomForestClassifier(criterion='entropy', max_depth=70, n_estimators=500).fit(X_train_gender, y_train_gender)
report_rf = classification_report(rf.predict(X_test_gender),y_test_gender,digits = 3)
print(report_rf)

              precision    recall  f1-score   support

      female      0.786     0.786     0.786      1887
        male      0.782     0.782     0.782      1851

    accuracy                          0.784      3738
   macro avg      0.784     0.784     0.784      3738
weighted avg      0.784     0.784     0.784      3738



In [6]:
GBC = GradientBoostingClassifier(n_estimators=300,max_depth = 7).fit(X_train_gender, y_train_gender)
report_GBC = classification_report(GBC.predict(X_test_gender),y_test_gender,digits = 3)
print(report_GBC)

              precision    recall  f1-score   support

      female      0.809     0.807     0.808      1893
        male      0.802     0.805     0.804      1845

    accuracy                          0.806      3738
   macro avg      0.806     0.806     0.806      3738
weighted avg      0.806     0.806     0.806      3738



In [7]:
X_train_eth, X_test_eth, y_train_eth, y_test_eth = train_test_split(both_feature.drop(columns = ['gender','ethnicity']), both_feature['ethnicity'], test_size=0.2, random_state=43)

In [8]:
svm_linear = LinearSVC(class_weight='balanced', dual=False, max_iter=15000, penalty='l1',tol=1e-2).fit(X_train_eth, y_train_eth)
report_SVMlinear = classification_report(svm_linear.predict(X_test_eth),y_test_eth,digits = 3,zero_division = 0)
print(report_SVMlinear)

                precision    recall  f1-score   support

asian/hispanic      0.610     0.534     0.569       343
         black      0.856     0.878     0.867      1505
         white      0.863     0.865     0.864      1890

      accuracy                          0.840      3738
     macro avg      0.776     0.759     0.767      3738
  weighted avg      0.837     0.840     0.838      3738



In [9]:
rf = RandomForestClassifier(class_weight='balanced', criterion='entropy',max_depth=90, n_estimators=500).fit(X_train_eth, y_train_eth)
report_rf = classification_report(rf.predict(X_test_eth),y_test_eth,digits = 3,zero_division = 0)
print(report_rf)

                precision    recall  f1-score   support

asian/hispanic      0.450     0.844     0.587       160
         black      0.796     0.874     0.833      1405
         white      0.914     0.797     0.852      2173

      accuracy                          0.828      3738
     macro avg      0.720     0.838     0.757      3738
  weighted avg      0.850     0.828     0.833      3738



In [12]:
# increse the max_depth will get better performance
GBC = GradientBoostingClassifier(n_estimators=300,max_depth = 7).fit(X_train_eth, y_train_eth)
report_GBC = classification_report(GBC.predict(X_test_eth),y_test_eth,digits = 3)
print(report_GBC)

                precision    recall  f1-score   support

asian/hispanic      0.480     0.766     0.590       188
         black      0.843     0.894     0.868      1455
         white      0.918     0.831     0.872      2095

      accuracy                          0.852      3738
     macro avg      0.747     0.830     0.777      3738
  weighted avg      0.867     0.852     0.856      3738

