### Sequential Regression model

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.multioutput import RegressorChain, MultiOutputRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score

data_01=pd.read_csv('Emo_Bank_VAD.csv')

print(data_01.columns)
print('')
print(data_01.shape)
print(data_01.head())
print('')
print(data_01.dtypes)

Index(['id', 'split', 'V', 'A', 'D', 'text'], dtype='object')

(9906, 6)
                    id  split     V     A     D  \
0  110CYL068_1036_1079  train  3.00  3.00  3.20   
1  110CYL068_1079_1110   test  2.80  3.10  2.80   
2  110CYL068_1127_1130  train  3.00  3.00  3.00   
3  110CYL068_1137_1188  train  3.44  3.00  3.22   
4  110CYL068_1189_1328  train  3.55  3.27  3.46   

                                                text  
0        Remember what she said in my last letter? "  
1                          If I wasn't working here.  
2                                                .."  
3  Goodwill helps people get off of public assist...  
4  Sherry learned through our Future Works class ...  

id        object
split     object
V        float64
A        float64
D        float64
text      object
dtype: object


In [2]:
#First regression: Predict "V" from "text"

# train, validation and test split dataset
x_train, x_test, y_train, y_test = train_test_split(data_01["text"], data_01["V"], test_size = 0.1, shuffle=True, random_state = 1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.1, shuffle=True, random_state = 1) 
print("Data shapes:", x_train.shape, x_val.shape, x_test.shape, y_train.shape, y_val.shape, y_test.shape)


# Create a pipeline that includes TfidfVectorizer and Ridge Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=7000)),  
    ('ridge', Ridge(alpha=1)) 
])

# Fit the pipeline on the training data
pipeline.fit(x_train, y_train)

# Make predictions
y_pred_train_v = pipeline.predict(x_train)
y_pred_val_v = pipeline.predict(x_val)
y_pred_test_v = pipeline.predict(x_test)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Evaluate the model
print()
print("Evaluation metrics")
print("Train RMSE for Valence :", round(rmse(y_train, y_pred_train_v),2))
print("Train MSE for Valence :", round(mean_squared_error(y_train, y_pred_train_v),2))
print("Train MAE for Valence:", round(mean_absolute_error(y_train, y_pred_train_v),2))
print()
print("Validation RMSE for Valence:", round(rmse(y_val, y_pred_val_v),2))
print("Validation MSE for Valence:", round(mean_squared_error(y_val, y_pred_val_v),2))
print("Validation MAE for Valence:", round(mean_absolute_error(y_val, y_pred_val_v),2))
print()
print("Test RMSE for Valence:", round(rmse(y_test, y_pred_test_v),2))
print("Test MSE for Valence:", round(mean_squared_error(y_test, y_pred_test_v),2))
print("Test MAE for Valence:", round(mean_absolute_error(y_test, y_pred_test_v),2))


Data shapes: (8023,) (892,) (991,) (8023,) (892,) (991,)

Evaluation metrics
Train RMSE for Valence : 0.21
Train MSE for Valence : 0.04
Train MAE for Valence: 0.15

Validation RMSE for Valence: 0.3
Validation MSE for Valence: 0.09
Validation MAE for Valence: 0.22

Test RMSE for Valence: 0.3
Test MSE for Valence: 0.09
Test MAE for Valence: 0.22


In [3]:
#Compare predictions with true values

# Training set 
train_comparison = pd.DataFrame({
    'Actual_V': y_train,
    'Predicted_V': np.round(y_pred_train_v, 2)})

# Validation set 
val_comparison = pd.DataFrame({
    'Actual_V': y_val,
    'Predicted_V': np.round(y_pred_val_v, 2)
})

# Test set 
test_comparison = pd.DataFrame({
    'Actual_V': y_test,
    'Predicted_V': np.round(y_pred_test_v, 2)
})


print("Training Set Comparison:")
print(train_comparison.head())  

print("\nValidation Set Comparison:")
print(val_comparison.head())  

print("\nTest Set Comparison:")
print(test_comparison.head()) 


# Save the comparison tables to CSV files
train_comparison.to_csv("train_comparison_seq.csv", index=False)
val_comparison.to_csv("val_comparison_chain_seq.csv", index=False)
test_comparison.to_csv("test_comparison_chain_seq.csv", index=False)

Training Set Comparison:
      Actual_V  Predicted_V
5214      3.00         3.02
4784      2.50         2.70
708       2.30         2.71
6369      3.56         3.47
2135      3.00         3.00

Validation Set Comparison:
      Actual_V  Predicted_V
7794      3.10         3.18
6770      2.90         3.03
3444      3.00         2.50
1767      3.11         2.91
8128      3.00         2.95

Test Set Comparison:
      Actual_V  Predicted_V
8687       2.9         3.05
7972       2.6         2.88
1628       3.0         2.97
8699       2.9         2.85
5648       3.0         3.07


In [4]:
#Predict "A" for "text" and predictions of "V"

# Create DataFrames with text and the predicted "V" values
train_data = pd.DataFrame({
    'text': x_train,
    'predicted_V': y_pred_train_v,
    'A': data_01.loc[x_train.index, 'A']
})
val_data = pd.DataFrame({
    'text': x_val,
    'predicted_V': y_pred_val_v,
    'A': data_01.loc[x_val.index, 'A']
})
test_data = pd.DataFrame({
    'text': x_test,
    'predicted_V': y_pred_test_v,
    'A': data_01.loc[x_test.index, 'A']
})

# Combine text and predicted_V as features
def transform_features(data, tfidf_vectorizer):
    text_features = tfidf_vectorizer.transform(data['text'])
    predicted_V_features = np.expand_dims(data['predicted_V'].values, axis=1)
    return np.hstack((text_features.toarray(), predicted_V_features))

# Fit TF-IDF vectorizer on the text data
tfidf_vectorizer = TfidfVectorizer(max_features=7000)
tfidf_vectorizer.fit(data_01["text"]) 

# Transform features
X_train_a = transform_features(train_data, tfidf_vectorizer)
X_val_a = transform_features(val_data, tfidf_vectorizer)
X_test_a = transform_features(test_data, tfidf_vectorizer)

# Train Ridge Regression model for predicting "A"
y_train_a = train_data['A']
y_val_a = val_data['A']
y_test_a = test_data['A']

pipeline_a = Ridge(alpha=1)
pipeline_a.fit(X_train_a, y_train_a)

# Make predictions
y_pred_train_a = pipeline_a.predict(X_train_a)
y_pred_val_a = pipeline_a.predict(X_val_a)
y_pred_test_a = pipeline_a.predict(X_test_a)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Evaluate the model
print()
print("Evaluation metrics")
print("Train RMSE for predicting Arousal:", round(rmse(y_train_a, y_pred_train_a), 2))
print("Train MSE for predicting Arousal:", round(mean_squared_error(y_train_a, y_pred_train_a), 2))
print("Train MAE for predicting Arousal:", round(mean_absolute_error(y_train_a, y_pred_train_a), 2))
print()
print("Validation RMSE for predicting Arousal:", round(rmse(y_val_a, y_pred_val_a), 2))
print("Validation MSE for predicting Arousal:", round(mean_squared_error(y_val_a, y_pred_val_a), 2))
print("Validation MAE for predicting Arousal:", round(mean_absolute_error(y_val_a, y_pred_val_a), 2))
print()
print("Test RMSE for predicting Arousal:", round(rmse(y_test_a, y_pred_test_a), 2))
print("Test MSE for predicting Arousal:", round(mean_squared_error(y_test_a, y_pred_test_a), 2))
print("Test MAE for predicting Arousal:", round(mean_absolute_error(y_test_a, y_pred_test_a), 2))



Evaluation metrics
Train RMSE for predicting Arousal: 0.18
Train MSE for predicting Arousal: 0.03
Train MAE for predicting Arousal: 0.14

Validation RMSE for predicting Arousal: 0.25
Validation MSE for predicting Arousal: 0.06
Validation MAE for predicting Arousal: 0.19

Test RMSE for predicting Arousal: 0.26
Test MSE for predicting Arousal: 0.07
Test MAE for predicting Arousal: 0.19


In [8]:
# Training set 
train_comparison = pd.DataFrame({
    'Actual_A': y_train_a,
    'Predicted_A': np.round(y_pred_train_a, 2)})

# Validation set 
val_comparison = pd.DataFrame({
    'Actual_A': y_val_a,
    'Predicted_A': np.round(y_pred_val_a, 2)
})

# Test set 
test_comparison = pd.DataFrame({
    'Actual_A': y_test_a,
    'Predicted_A': np.round(y_pred_test_a, 2)
})


print("Training Set Comparison:")
print(train_comparison.head())  

print("\nValidation Set Comparison:")
print(val_comparison.head())  

print("\nTest Set Comparison:")
print(test_comparison.head()) 


# Save the comparison tables to CSV files
train_comparison.to_csv("train_comparison_seqVA.csv", index=False)
val_comparison.to_csv("val_comparison_chain_seqVA.csv", index=False)
test_comparison.to_csv("test_comparison_chain_seqVA.csv", index=False)

Training Set Comparison:
      Actual_A  Predicted_A
5214      2.70         2.93
4784      3.10         3.05
708       3.10         3.17
6369      3.33         3.22
2135      3.00         3.01

Validation Set Comparison:
      Actual_A  Predicted_A
7794      3.10         3.23
6770      2.90         3.03
3444      3.43         3.01
1767      3.00         2.96
8128      3.20         3.01

Test Set Comparison:
      Actual_A  Predicted_A
8687      2.20         3.17
7972      3.20         3.02
1628      2.86         2.93
8699      2.70         2.93
5648      2.78         2.98


In [9]:
#Predict "D" from "text" and predictions of "V" and "A"

# Create DataFrames with text, predicted "V", predicted "A", and the target "D"
train_data = pd.DataFrame({
    'text': x_train,
    'predicted_V': y_pred_train_v,
    'predicted_A': y_pred_train_a,
    'D': data_01.loc[x_train.index, 'D'] 
})
val_data = pd.DataFrame({
    'text': x_val,
    'predicted_V': y_pred_val_v,
    'predicted_A': y_pred_val_a,
    'D': data_01.loc[x_val.index, 'D']  
})
test_data = pd.DataFrame({
    'text': x_test,
    'predicted_V': y_pred_test_v,
    'predicted_A': y_pred_test_a,
    'D': data_01.loc[x_test.index, 'D']  
})

# Fit TF-IDF vectorizer on the entire text data
tfidf_vectorizer = TfidfVectorizer(max_features=7000)
tfidf_vectorizer.fit(data_01["text"])  

# Define a function to combine text features with predicted_V and predicted_A
def transform_features(data, tfidf_vectorizer):
    text_features = tfidf_vectorizer.transform(data['text'])
    predicted_V_features = np.expand_dims(data['predicted_V'].values, axis=1)
    predicted_A_features = np.expand_dims(data['predicted_A'].values, axis=1)
    return np.hstack((text_features.toarray(), predicted_V_features, predicted_A_features))

# Transform features for training, validation, and test sets
X_train_final = transform_features(train_data, tfidf_vectorizer)
X_val_final = transform_features(val_data, tfidf_vectorizer)
X_test_final = transform_features(test_data, tfidf_vectorizer)

# Train a new Ridge Regression model to predict "D"
y_train_d = train_data['D']
y_val_d = val_data['D']
y_test_d = test_data['D']

pipeline_d = Ridge(alpha=1)
pipeline_d.fit(X_train_final, y_train_d)

# Make predictions
y_pred_train_d = pipeline_d.predict(X_train_final)
y_pred_val_d = pipeline_d.predict(X_val_final)
y_pred_test_d = pipeline_d.predict(X_test_final)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Evaluate the new model
print("Evaluation metrics")
print("Train RMSE for predicting 'D':", round(rmse(y_train_d, y_pred_train_d), 2))
print("Train MSE for predicting 'D':", round(mean_squared_error(y_train_d, y_pred_train_d), 2))
print("Train MAE for predicting 'D':", round(mean_absolute_error(y_train_d, y_pred_train_d), 2))
print()
print("Validation RMSE for predicting 'D':", round(rmse(y_val_d, y_pred_val_d), 2))
print("Validation MSE for predicting 'D':", round(mean_squared_error(y_val_d, y_pred_val_d), 2))
print("Validation MAE for predicting 'D':", round(mean_absolute_error(y_val_d, y_pred_val_d), 2))
print()
print("Test RMSE for predicting 'D':", round(rmse(y_test_d, y_pred_test_d), 2))
print("Test MSE for predicting 'D':", round(mean_squared_error(y_test_d, y_pred_test_d), 2))
print("Test MAE for predicting 'D':", round(mean_absolute_error(y_test_d, y_pred_test_d), 2))



Evaluation metrics
Train RMSE for predicting 'D': 0.15
Train MSE for predicting 'D': 0.02
Train MAE for predicting 'D': 0.11

Validation RMSE for predicting 'D': 0.22
Validation MSE for predicting 'D': 0.05
Validation MAE for predicting 'D': 0.17

Test RMSE for predicting 'D': 0.22
Test MSE for predicting 'D': 0.05
Test MAE for predicting 'D': 0.17


In [11]:
# Compare predicted and real values for each set

#Training set 
train_comparison = pd.DataFrame({
    'Actual_D': y_train_d,
    'Predicted_D': np.round(y_pred_train_d, 2)  
})

# Validation set comparison
val_comparison = pd.DataFrame({
    'Actual_D': y_val_d,
    'Predicted_D': np.round(y_pred_val_d, 2)  
})

# Test set comparison
test_comparison = pd.DataFrame({
    'Actual_D': y_test_d,
    'Predicted_D': np.round(y_pred_test_d, 2)  
})

# Display the tables
print("Training Set Comparison:")
print(train_comparison.head())  

print("\nValidation Set Comparison:")
print(val_comparison.head())  

print("\nTest Set Comparison:")
print(test_comparison.head())  

# Save the comparison tables to CSV files
train_comparison.to_csv("train_comparison_seqVAD.csv", index=False)
val_comparison.to_csv("val_comparison_chain_seqVAD.csv", index=False)
test_comparison.to_csv("test_comparison_chain_seqVAD.csv", index=False)

Training Set Comparison:
      Actual_D  Predicted_D
5214      3.10         3.12
4784      2.80         2.92
708       2.80         2.87
6369      3.11         3.15
2135      3.00         3.04

Validation Set Comparison:
      Actual_D  Predicted_D
7794      3.80         3.00
6770      2.90         3.05
3444      3.43         2.96
1767      3.22         2.96
8128      3.10         3.02

Test Set Comparison:
      Actual_D  Predicted_D
8687       2.9         3.03
7972       2.9         3.09
1628       3.0         3.07
8699       3.1         2.95
5648       3.0         3.13
