In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import eng_to_ipa as p
import math
from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

## Load the data

In [2]:
df = pd.read_csv("avgRatings_annotated.csv")
df.drop(df.columns[7:22], axis = 1, inplace = True)
df.head()

Unnamed: 0,name,age,polarity,name_type,rating.mean_age,rating.mean_gender,rating.mean_valence,gender
0,Adelaide,old,,real,-0.617647,45.727273,,female
1,Adelina,,bad,real,,47.771429,31.621622,female
2,Alasdair,young,,real,18.709677,-35.657143,,male
3,Alastor,old,,madeup,13.8125,-38.833333,,male
4,Alecto,old,,madeup,3.59375,-35.722222,,female


In [3]:
for i in range(len(df["name"])):
    if df["polarity"].loc[i] == "good":
        df["polarity"].loc[i] = 1
    elif df["polarity"].loc[i] == "bad" or df["polarity"].loc[i] == "ambiguous":
        df["polarity"].loc[i] = 0
        
for i in range(len(df["name"])):
    if df["gender"].loc[i] == "female":
        df["gender"].loc[i] = 1
    elif df["gender"].loc[i] == "male":
        df["gender"].loc[i] = 0
        
for i in range(len(df["name"])):
    if df["age"].loc[i] == "young":
        df["age"].loc[i] = 1
    elif df["age"].loc[i] == "old":
        df["age"].loc[i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


## Polarity

### Classification

In [5]:
pol_df = df.loc[df['polarity'].notnull(), ["name",  "name_type", "polarity", "rating.mean_valence"]]
pol_df.reset_index(drop=True, inplace=True)

In [9]:
POL_majority = stats.mode(pol_df["polarity"])

In [27]:
ypred = np.array([int(POL_majority[0]) for i in range(len(pol_df["polarity"]))], dtype=int)

In [29]:
ytest = np.array(pol_df["polarity"], dtype=int)

In [31]:
print("Accuracy: ", accuracy_score(ytest, ypred), 
      "Precision: ", precision_score(ytest, ypred), 
      "Recall: ", recall_score(ytest, ypred), 
      "Confusion matrix: ", confusion_matrix(ypred, ytest), sep="\n")

Accuracy: 
0.5555555555555556
Precision: 
0.5555555555555556
Recall: 
1.0
Confusion matrix: 
[[ 0  0]
 [28 35]]


### Regression

In [44]:
POL_meanrate = np.mean(pol_df["rating.mean_valence"])

In [45]:
ypred = np.array([POL_meanrate for i in range(len(pol_df["rating.mean_valence"]))])

In [46]:
ytest = np.array(pol_df["rating.mean_valence"])

In [47]:
print("MAE: ", mean_absolute_error(ytest, ypred), 
      "Med-AE: ", median_absolute_error(ytest, ypred), 
      "MSE: ", mean_squared_error(ytest, ypred), 
      "R2: ", r2_score(ypred, ytest), sep="\n")

MAE: 
16.62611052678118
Med-AE: 
15.840030464487667
MSE: 
383.469156035298
R2: 
-1.2152622645428952e+32


## Gender

### Classification

In [32]:
gen_df = df.loc[df['gender'].notnull(), ["name", "name_type","gender", "rating.mean_gender"]]
gen_df.reset_index(drop=True, inplace=True)

In [34]:
GEN_majority = stats.mode(gen_df["gender"])

In [35]:
ypred = np.array([int(GEN_majority[0]) for i in range(len(gen_df["gender"]))], dtype=int)

In [36]:
ytest = np.array(gen_df["gender"], dtype=int)

In [37]:
print("Accuracy: ", accuracy_score(ytest, ypred), 
      "Precision: ", precision_score(ytest, ypred), 
      "Recall: ", recall_score(ytest, ypred), 
      "Confusion matrix: ", confusion_matrix(ypred, ytest), sep="\n")

Accuracy: 
0.5139664804469274
Precision: 
0.5139664804469274
Recall: 
1.0
Confusion matrix: 
[[ 0  0]
 [87 92]]


### Regression

In [49]:
GEN_meanrate = np.mean(gen_df["rating.mean_gender"])

In [50]:
ypred = np.array([GEN_meanrate for i in range(len(gen_df["rating.mean_gender"]))])

In [51]:
ytest = np.array(gen_df["rating.mean_gender"])

In [52]:
print("MAE: ", mean_absolute_error(ytest, ypred), 
      "Med-AE: ", median_absolute_error(ytest, ypred), 
      "MSE: ", mean_squared_error(ytest, ypred), 
      "R2: ", r2_score(ypred, ytest), sep="\n")

MAE: 
34.02235217245305
Med-AE: 
38.77600400362458
MSE: 
1361.7698708822234
R2: 
-3.142539298946287e+36


## Age

### Classification

In [39]:
age_df = df.loc[df['age'].notnull(), ["name", "name_type", "age", "rating.mean_age"]]
age_df.reset_index(drop=True, inplace=True)

In [40]:
AGE_majority = stats.mode(age_df["age"])

In [41]:
ypred = np.array([int(AGE_majority[0]) for i in range(len(age_df["age"]))], dtype=int)

In [42]:
ytest = np.array(age_df["age"], dtype=int)

In [43]:
print("Accuracy: ", accuracy_score(ytest, ypred), 
      "Precision: ", precision_score(ytest, ypred), 
      "Recall: ", recall_score(ytest, ypred), 
      "Confusion matrix: ", confusion_matrix(ypred, ytest), sep="\n")

Accuracy: 
0.5042016806722689
Precision: 
0.5042016806722689
Recall: 
1.0
Confusion matrix: 
[[ 0  0]
 [59 60]]


### Regression

In [53]:
AGE_meanrate = np.mean(age_df["rating.mean_age"])

In [54]:
ypred = np.array([AGE_meanrate for i in range(len(age_df["rating.mean_age"]))])

In [55]:
ytest = np.array(age_df["rating.mean_age"])

In [56]:
print("MAE: ", mean_absolute_error(ytest, ypred), 
      "Med-AE: ", median_absolute_error(ytest, ypred), 
      "MSE: ", mean_squared_error(ytest, ypred), 
      "R2: ", r2_score(ypred, ytest), sep="\n")

MAE: 
17.326701560748777
Med-AE: 
16.456430965974498
MSE: 
411.1824024024797
R2: 
0.0
