# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.preprocessing import StandardScaler

# Data

In [2]:
# Name of the target column in each dataset's conversation-level output.
csop_target = 'zscore_efficiency'
csopII_target = 'efficiency'

# The shared config file lists, per dataset, the columns to drop before modelling.
with open('../../config.json', 'rb') as file:
    config = json.load(file)

csop_cols_to_ignore = config['csop']['cols_to_ignore']
csopII_cols_to_ignore = config['csopII']['cols_to_ignore']

In [3]:
# Load the csop conversation-level output and drop the columns listed in the config.
csop_raw = pd.read_csv('../../output/csop_output_conversation_level.csv')
csop_all_features = csop_raw.drop(csop_cols_to_ignore, axis=1)

# The target scaler is fitted on the csop target only and then reused (transform, not
# fit_transform) for csopII, so both targets are standardised with csop's mean and std.
# NOTE(review): the two datasets use different target columns (csop_target vs
# csopII_target) -- confirm they are on comparable scales before sharing one scaler.
target_scaler = StandardScaler()
csop_all_features['target'] = target_scaler.fit_transform(csop_raw[csop_target].to_numpy().reshape(-1, 1))

csopII_raw = pd.read_csv('../../output/csopII_output_conversation_level.csv')
csopII_all_features = csopII_raw.drop(csopII_cols_to_ignore, axis=1)
csopII_all_features['target'] = target_scaler.transform(csopII_raw[csopII_target].to_numpy().reshape(-1, 1))

# Keep only the columns present in both frames, in csop's column order.
# list(set(a) & set(b)) would give an order that depends on string hash randomisation and
# therefore changes between kernel restarts; column order affects which features each tree
# samples, so the model would not be reproducible even with a fixed random_state.
csopII_columns = set(csopII_all_features.columns)
features = [col for col in csop_all_features.columns if col in csopII_columns]

# Missing feature values are replaced with 0 in both datasets.
csop = csop_all_features[features].copy().fillna(0)
csopII = csopII_all_features[features].copy().fillna(0)

In [4]:
# csop is split 90/10 into train/validation; csopII is held out entirely as the test set.
X, y = csop.drop(['target'], axis=1), csop['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
X_test, y_test = csopII.drop(['target'], axis=1), csopII['target']

# Fit the feature scaler on the training split only, then apply it to validation and test.
# The scaler returns bare arrays, so the frames are rebuilt with BOTH the original column
# names and the original row index: y_train / y_val keep the shuffled index produced by
# train_test_split, and without index= the X frames would get a fresh 0..n-1 index, making
# any index-aligned pandas operation between X and y silently pair the wrong rows.
feature_scaler = StandardScaler()
X_train = pd.DataFrame(
    feature_scaler.fit_transform(X_train.to_numpy()), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    feature_scaler.transform(X_val.to_numpy()), columns=X_val.columns, index=X_val.index
)
X_test = pd.DataFrame(
    feature_scaler.transform(X_test.to_numpy()), columns=X_test.columns, index=X_test.index
)

X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((882, 382), (98, 382), (481, 382), (882,), (98,), (481,))

# Helper Functions

In [5]:
def calc_metrics(dataset, model):
    """Score a fitted regressor on one dataset and print a one-line summary.

    Parameters
    ----------
    dataset : sequence of length 2
        ``(X, y)``: the features handed to ``model.predict`` and the true targets.
    model : object
        Any estimator exposing ``predict(X)``.

    Returns
    -------
    dict
        Keys ``'r2'``, ``'mae'``, ``'mse'`` and ``'rmse'``, each a Python float
        rounded to 4 decimal places.
    """
    X, y = dataset
    # Predict once and reuse the result for every metric.
    y_pred = model.predict(X)

    r2_raw = r2_score(y_true=y, y_pred=y_pred)
    mae_raw = mean_absolute_error(y_true=y, y_pred=y_pred)
    mse_raw = mean_squared_error(y_true=y, y_pred=y_pred)
    # Take the root of the unrounded MSE so rounding error is not compounded.
    rmse_raw = np.sqrt(mse_raw)

    # Built-in round() on float() works whether the metric returned a NumPy scalar
    # or a plain Python float (the latter has no .round method).
    r2 = round(float(r2_raw), 4)
    mae = round(float(mae_raw), 4)
    mse = round(float(mse_raw), 4)
    rmse = round(float(rmse_raw), 4)

    print('METRICS')
    print(f'R2: {r2}\tMAE: {mae}\tRMSE: {rmse}')
    return {
        'r2': r2, 'mae': mae,
        'mse': mse, 'rmse': rmse
    }

# Model 1: Single Model with all Features

In [6]:
# Baseline: one random forest (library defaults, fixed seed) trained on every shared feature.
model1 = RandomForestRegressor(random_state=42)
model1.fit(X=X_train, y=y_train)

In [7]:
# In-sample scores for model1, computed on the training split.
model1_train_metrics = calc_metrics(model=model1, dataset=(X_train, y_train))

METRICS
R2: 0.3741	MAE: 0.5327	RMSE: 0.7862


In [8]:
# Scores for model1 on the validation split.
model1_val_metrics = calc_metrics(model=model1, dataset=(X_val, y_val))

METRICS
R2: 0.2933	MAE: 0.6759	RMSE: 0.8829


# Model Based on Feature Correlations

# Model Based on Feature Functions