In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
import plotly.express as px

In [20]:
# Load the preprocessed corpus. With keep_default_na=False and an explicit
# na_values list, only the literal '_' is parsed as NaN; blank text cells are
# kept as empty strings instead of becoming NaN.
df = pd.read_csv(
    'data/preprocessed_emo.csv',
    na_values=['_'],
    keep_default_na=False,
)
df

Unnamed: 0,id,split,V,A,D,raw,lemma
0,['110CYL068_1036_1079'],train,3.00,3.00,3.20,last letter,last letter
1,['110CYL068_1079_1110'],test,2.80,3.10,2.80,,
2,['110CYL068_1127_1130'],train,3.00,3.00,3.00,,
3,['110CYL068_1137_1188'],train,3.44,3.00,3.22,people public assistance,people public assistance
4,['110CYL068_1189_1328'],train,3.55,3.27,3.46,class mire welfare system family,class mire welfare system family
...,...,...,...,...,...,...,...
10057,['wwf12_4531_4624'],train,3.00,3.50,3.00,constant reminder preserve wildlife,constant reminder preserve wildlife
10058,['wwf12_501_591'],train,3.80,3.40,3.60,appreciation last membership contribution,appreciation last membership contribution
10059,['wwf12_592_691'],train,3.00,3.00,3.10,today support pivotal year,today support pivotal year
10060,['wwf12_702_921'],train,3.33,3.44,3.44,fact urge strengthen dedication life generous ...,fact urge strengthen dedication life generous ...


In [21]:
# Interactive 3-D scatter of the three score columns (V, A, D),
# with one colour per value of the 'split' column.
fig = px.scatter_3d(data_frame=df, x='V', y='A', z='D', color='split')
fig.show()

In [22]:
# Partition rows by the precomputed 'split' label. Rows carrying any other
# label end up in neither frame.
df_train = df[df['split'].eq('train')]
df_test = df[df['split'].eq('test')]

In [23]:
# Experiment 1: features are the 'lemma' text column; targets are the three
# score columns V, A and D (predicted jointly as a multi-output regression).
X_train, y_train = df_train['lemma'], df_train[['V', 'A', 'D']]
X_test, y_true = df_test['lemma'], df_test[['V', 'A', 'D']]

In [24]:
# Learn the TF-IDF vocabulary and weights from the training texts only, and
# keep the fitted vectorizer so the test texts can be projected with it later.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=X_train)

In [25]:
# Tune a random forest on the TF-IDF features: grid over n_estimators
# (100, 200, 300) and max_depth (2, 3, 4), cv=5, scored by negated mean
# absolute error. random_state is fixed so that the search, the refitted
# model and the reported MAE are reproducible across kernel restarts.
gsc = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid={
        'n_estimators': list(range(100, 301, 100)),
        'max_depth': list(range(2, 5)),
    },
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=0,
    n_jobs=-1,
)
grid_result = gsc.fit(X, y_train)
best_params = grid_result.best_params_
# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so reuse that model instead of training it again.
forest = grid_result.best_estimator_
y_pred_tree = forest.predict(vectorizer.transform(X_test))
# Per-target MAE on the test split, in column order V, A, D.
mean_absolute_error(y_true, y_pred_tree, multioutput='raw_values')

array([0.24353364, 0.18975106, 0.15576934])

In [26]:
# Tune a k-nearest-neighbours regressor on the same TF-IDF features:
# n_neighbors from 2 to 50, cv=5, scored by negated mean absolute error.
gsc1 = GridSearchCV(
    KNeighborsRegressor(),
    {'n_neighbors': list(range(2, 51))},
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=0,
)
grid_result = gsc1.fit(X, y_train)
best_params = grid_result.best_params_
# Train a fresh model with the selected neighbourhood size on all training rows.
knn = KNeighborsRegressor(n_neighbors=best_params["n_neighbors"]).fit(X, y_train)
X_test_tfidf = vectorizer.transform(X_test)
y_pred_knn = knn.predict(X_test_tfidf)
# Per-target MAE on the test split, in column order V, A, D.
mean_absolute_error(y_true, y_pred_knn, multioutput='raw_values')

array([0.2576646, 0.1968864, 0.1620104])

In [27]:
# Experiment 2: same targets, but the features now come from the 'raw' text
# column (presumably the un-lemmatized tokens -- confirm against preprocessing).
# This rebinds X_train / X_test that earlier cells built from 'lemma'.
X_train, y_train = df_train['raw'], df_train[['V', 'A', 'D']]
X_test, y_true = df_test['raw'], df_test[['V', 'A', 'D']]

In [28]:
# Fit a new TF-IDF vectorizer on the current training texts. This replaces
# the `vectorizer` and `X` objects created by the earlier vectorizing cell.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=X_train)

In [29]:
# Tune a random forest on the current TF-IDF features: grid over n_estimators
# (100, 200, 300) and max_depth (2, 3, 4), cv=5, scored by negated mean
# absolute error. random_state is fixed so that the search, the refitted
# model and the reported MAE are reproducible across kernel restarts.
gsc = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid={
        'n_estimators': list(range(100, 301, 100)),
        'max_depth': list(range(2, 5)),
    },
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=0,
    n_jobs=-1,
)
grid_result = gsc.fit(X, y_train)
best_params = grid_result.best_params_
# GridSearchCV refits the best configuration on the full training data by
# default (refit=True), so reuse that model instead of training it again.
forest = grid_result.best_estimator_
y_pred_tree = forest.predict(vectorizer.transform(X_test))
# Per-target MAE on the test split, in column order V, A, D.
mean_absolute_error(y_true, y_pred_tree, multioutput='raw_values')

array([0.24422492, 0.18988407, 0.15595239])

In [30]:
# Tune a k-nearest-neighbours regressor on the current TF-IDF features:
# n_neighbors from 2 to 50, cv=5, scored by negated mean absolute error.
gsc1 = GridSearchCV(
    KNeighborsRegressor(),
    {'n_neighbors': list(range(2, 51))},
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=0,
)
grid_result = gsc1.fit(X, y_train)
best_params = grid_result.best_params_
# Train a fresh model with the selected neighbourhood size on all training rows.
knn = KNeighborsRegressor(n_neighbors=best_params["n_neighbors"]).fit(X, y_train)
X_test_tfidf = vectorizer.transform(X_test)
y_pred_knn = knn.predict(X_test_tfidf)
# Per-target MAE on the test split, in column order V, A, D.
mean_absolute_error(y_true, y_pred_knn, multioutput='raw_values')

array([0.2493048, 0.1937092, 0.1578696])