# Predict the demographics of Twitter users based on the emojis

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, log_loss, f1_score

In [2]:
parent_df_path = 'usage_per_user/'

with open(parent_df_path+'joh_df.pkl', 'rb') as f:
    joh_per_usage = pickle.load(f)
    
with open(parent_df_path+'lon_df.pkl', 'rb') as f:
    lon_per_usage = pickle.load(f)
    
with open(parent_df_path+'nyc_df.pkl', 'rb') as f:
    nyc_per_usage = pickle.load(f)
    
with open(parent_df_path+'ran_df.pkl', 'rb') as f:
    ran_per_usage = pickle.load(f)


all_usage = pd.concat([joh_per_usage,lon_per_usage,nyc_per_usage,ran_per_usage],ignore_index = True)

In [3]:
all_usage.drop(labels = all_usage[all_usage.ethnicity.eq('other')].index,inplace =True)
all_usage.reset_index(inplace = True,drop = True)

all_usage["ethnicity"].replace({"asian":"asian/hispanic","hispanic":"asian/hispanic"}, inplace=True)
all_usage

Unnamed: 0,#️⃣,*️⃣,0️⃣,1️⃣,2️⃣,3️⃣,4️⃣,5️⃣,6️⃣,7️⃣,...,🧺,🧻,🧼,🧽,🧾,🧿,tweets_contain_emoji,total_tweets,gender,ethnicity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,62.0,111.0,female,black
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,female,black
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,566.0,945.0,female,black
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,251.0,444.0,male,white
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,410.0,887.0,female,black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,117.0,3121.0,female,asian/hispanic
18685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,551.0,3072.0,female,black
18686,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1055.0,2482.0,male,asian/hispanic
18687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,905.0,2752.0,female,white


# Train/test set split: 80%/20% 

In [4]:
emoji_usage = all_usage.drop(columns = ['tweets_contain_emoji','total_tweets','gender','ethnicity'])

X_train_gender, X_test_gender, y_train_gender, y_test_gender = train_test_split(emoji_usage, all_usage['gender'], test_size=0.2, random_state=43)

# Gender predictions:

In [5]:
svm_linear = LinearSVC(dual = False,tol=1e-2,max_iter = 10000,penalty = 'l2').fit(X_train_gender, y_train_gender) #l2
report_SVMlinear = classification_report(svm_linear.predict(X_test_gender),y_test_gender,digits = 3)
print(report_SVMlinear)

              precision    recall  f1-score   support

      female      0.692     0.823     0.752      1586
        male      0.849     0.730     0.785      2152

    accuracy                          0.770      3738
   macro avg      0.770     0.777     0.768      3738
weighted avg      0.782     0.770     0.771      3738



In [6]:
rf = RandomForestClassifier(criterion='entropy', max_depth=100, n_estimators=500).fit(X_train_gender, y_train_gender)
report_rf = classification_report(rf.predict(X_test_gender),y_test_gender,digits = 3)
print(report_rf)

              precision    recall  f1-score   support

      female      0.793     0.791     0.792      1892
        male      0.786     0.788     0.787      1846

    accuracy                          0.789      3738
   macro avg      0.789     0.789     0.789      3738
weighted avg      0.789     0.789     0.789      3738



In [7]:
GBC = GradientBoostingClassifier(n_estimators=300,max_depth = 7).fit(X_train_gender, y_train_gender)
report_GBC = classification_report(GBC.predict(X_test_gender),y_test_gender,digits = 3)
print(report_GBC)

              precision    recall  f1-score   support

      female      0.787     0.819     0.802      1814
        male      0.822     0.791     0.806      1924

    accuracy                          0.804      3738
   macro avg      0.805     0.805     0.804      3738
weighted avg      0.805     0.804     0.804      3738



# Ethnicity predictions:

In [8]:
X_train_eth, X_test_eth, y_train_eth, y_test_eth = train_test_split(all_usage.drop(columns = ['tweets_contain_emoji','total_tweets','gender','ethnicity']), all_usage['ethnicity'], test_size=0.2, random_state=32)

In [9]:
svm_linear = LinearSVC(class_weight = 'balanced',tol=1e-2,penalty = 'l2',dual = False,max_iter=20000).fit(X_train_eth, y_train_eth)
report_SVMlinear = classification_report(svm_linear.predict(X_test_eth),y_test_eth,digits = 3,zero_division = 0)
print(report_SVMlinear)

                precision    recall  f1-score   support

asian/hispanic      0.411     0.524     0.461       248
         black      0.773     0.914     0.838      1281
         white      0.906     0.783     0.840      2209

      accuracy                          0.811      3738
     macro avg      0.697     0.740     0.713      3738
  weighted avg      0.828     0.811     0.814      3738



In [10]:
rf = RandomForestClassifier(n_estimators=500, criterion='entropy',max_depth=70,class_weight = 'balanced').fit(X_train_eth, y_train_eth)
report_rf = classification_report(rf.predict(X_test_eth),y_test_eth,digits = 3)
print(report_rf)

                precision    recall  f1-score   support

asian/hispanic      0.304     0.828     0.444       116
         black      0.821     0.897     0.858      1385
         white      0.928     0.792     0.855      2237

      accuracy                          0.832      3738
     macro avg      0.684     0.839     0.719      3738
  weighted avg      0.869     0.832     0.843      3738



In [11]:
GBC = GradientBoostingClassifier(n_estimators=300,max_depth = 7).fit(X_train_eth, y_train_eth)
report_GBC = classification_report(GBC.predict(X_test_eth),y_test_eth,digits = 3)
print(report_GBC)

                precision    recall  f1-score   support

asian/hispanic      0.383     0.766     0.511       158
         black      0.844     0.900     0.871      1420
         white      0.921     0.814     0.864      2160

      accuracy                          0.845      3738
     macro avg      0.716     0.827     0.749      3738
  weighted avg      0.869     0.845     0.852      3738

