# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Data

In [2]:
with open('../../config.json', 'rb') as file:
    config = json.load(file)
csop_cols_to_ignore = config['csop']['cols_to_ignore']
csopII_cols_to_ignore = config['csopII']['cols_to_ignore']
csop_target, csopII_target = 'zscore_efficiency', 'efficiency'

In [3]:
csop_raw = pd.read_csv('../../output/csop_output_conversation_level.csv')
csop_all_features = csop_raw.drop(csop_cols_to_ignore, axis=1)
target_scaler = StandardScaler()
csop_all_features['target'] = target_scaler.fit_transform(csop_raw[csop_target].to_numpy().reshape(-1, 1))

csopII_raw = pd.read_csv('../../output/csopII_output_conversation_level.csv')
csopII_all_features = csopII_raw.drop(csopII_cols_to_ignore, axis=1)
csopII_all_features['target'] = target_scaler.transform(csopII_raw[csopII_target].to_numpy().reshape(-1, 1))

features = list(set(csop_all_features.columns).intersection(set(csopII_all_features.columns)))
csop, csopII = csop_all_features[features].copy(), csopII_all_features[features].copy()
csop, csopII = csop.fillna(0), csopII.fillna(0)

In [4]:
X, y = csop.drop(['target'], axis=1), csop['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
X_test, y_test = csopII.drop(['target'], axis=1), csopII['target']

feature_scaler = StandardScaler()
X_train = pd.DataFrame(feature_scaler.fit_transform(X_train.to_numpy()), columns=X_train.columns)
X_val = pd.DataFrame(feature_scaler.transform(X_val.to_numpy()), columns=X_val.columns)
X_test = pd.DataFrame(feature_scaler.transform(X_test.to_numpy()), columns=X_test.columns)

X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((882, 382), (98, 382), (481, 382), (882,), (98,), (481,))

# Helper Functions

In [5]:
def calc_metrics(dataset, model):
    X, y = dataset
    r2 = r2_score(y_true=y, y_pred=model.predict(X)).round(4)
    mae = mean_absolute_error(y_true=y, y_pred=model.predict(X)).round(4)
    mse = mean_squared_error(y_true=y, y_pred=model.predict(X)).round(4)
    rmse = np.sqrt(mse).round(4)
    print('METRICS')
    print(f'R2: {r2}\tMAE: {mae}\tRMSE: {rmse}')
    return {
        'r2': r2, 'mae': mae, 
        'mse': mse, 'rmse': rmse
    }

# Model 1: Single Model with all Features

In [6]:
model1 = RandomForestRegressor(random_state=42)
model1.fit(X_train, y_train)

In [7]:
model1_train_metrics = calc_metrics(dataset=[X_train, y_train], model=model1)

METRICS
R2: 0.3751	MAE: 0.5318	RMSE: 0.7856


In [8]:
model1_val_metrics = calc_metrics(dataset=[X_val, y_val], model=model1)

METRICS
R2: 0.2945	MAE: 0.6744	RMSE: 0.8821


# Model 2: Model Based on Groupings by Feature Correlations

In [9]:
corr_df = X_train.corr()

In [10]:
seen = set()
groups = []
for row in tqdm(X_train.columns, total=X_train.shape[1]):
    candidate_group = corr_df.loc[row, :][corr_df.loc[row, :]>0.9].index.to_list()
    group = []
    for col in candidate_group:
        if col not in seen: group.append(col)
    if len(group) != 0: groups.append(group)

100%|██████████| 382/382 [00:00<00:00, 5075.55it/s]


In [11]:
train_predictions_list, val_predictions_list = [], []
for group in tqdm(groups, total=len(groups)):
    X_train_group, X_val_group = X_train[group], X_val[group]
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train_group, y_train)
    train_preds, val_preds = model.predict(X_train_group), model.predict(X_val_group)
    train_predictions_list.append(train_preds)
    val_predictions_list.append(val_preds)

100%|██████████| 360/360 [00:34<00:00, 10.44it/s]


In [12]:
X_train_groups = pd.DataFrame(np.c_[train_predictions_list].T, columns=['group'+str(idx) for idx in range(len(train_predictions_list))])
X_val_groups = pd.DataFrame(np.c_[val_predictions_list].T, columns=['group'+str(idx) for idx in range(len(val_predictions_list))])

In [13]:
model2 = RandomForestRegressor(random_state=42)
model2.fit(X_train_groups, y_train)

In [14]:
model2_train_metrics = calc_metrics(dataset=[X_train_groups, y_train], model=model2)

METRICS
R2: 0.4015	MAE: 0.4763	RMSE: 0.7688


In [15]:
model2_val_metrics = calc_metrics(dataset=[X_val_groups, y_val], model=model2)

METRICS
R2: 0.1484	MAE: 0.7397	RMSE: 0.9692


# Model 3: Model Based on Grouping by Feature Functions

In [16]:
feature_groupings_raw = pd.read_excel('feature_groupings_06_20_23.xlsx')
groups = feature_groupings_raw.columns[2:]
groups

Index(['Communication Frequency', 'Information / Knowledge Exchange',
       'Linguistic Features', 'Emotion', 'Cognitive Processes',
       'Perceptual Processes', 'Social Processes', 'Biological Processes',
       'Personal Concerns', 'Accommodation/\nMimicry', 'Temporal', 'Question',
       'Hedging', 'Politeness', 'Equality'],
      dtype='object')

In [17]:
rows = []
for idx, row in tqdm(feature_groupings_raw.iterrows(), total=feature_groupings_raw.shape[0]):
    if row['Level'] == 'Chat':
        for prefix in ['min_', 'max_', 'stdev_', 'average_']:
            feature_name = prefix + row['Feature Name']
            row_to_append = [feature_name] + row[groups].to_list()
            rows.append(row_to_append)
    else:
        feature_name = row['Feature Name']
        row_to_append = [feature_name] + row[groups].to_list()
        rows.append(row_to_append)
feature_groupings = pd.DataFrame(
    rows, 
    columns= [
        'Feature Name', 'Communication Frequency', 'Information / Knowledge Exchange',
        'Linguistic Features', 'Emotion', 'Cognitive Processes',
        'Perceptual Processes', 'Social Processes', 'Biological Processes',
        'Personal Concerns', 'Accommodation/\nMimicry', 'Temporal', 'Question',
        'Hedging', 'Politeness', 'Equality'
    ]
)

100%|██████████| 99/99 [00:00<00:00, 1966.43it/s]


In [18]:
group_features_list = []
for group_name in feature_groupings.columns[1:]:
    group_features = feature_groupings[feature_groupings[group_name].astype(bool)]['Feature Name'].to_list()
    group_features_to_append = []
    for feature in group_features:
        if feature in X_train.columns:
            group_features_to_append.append(feature)
    if len(group_features_to_append) != 0:
        group_features_list.append(group_features_to_append)

In [19]:
train_predictions_list, val_predictions_list = [], []
for group_features in group_features_list:
    X_train_group, X_val_group = X_train[group_features], X_val[group_features]
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train_group, y_train)
    train_preds, val_preds = model.predict(X_train_group), model.predict(X_val_group)
    train_predictions_list.append(train_preds)
    val_predictions_list.append(val_preds)

In [20]:
X_train_groups = pd.DataFrame(np.c_[train_predictions_list].T, columns=['group'+str(idx) for idx in range(len(train_predictions_list))])
X_val_groups = pd.DataFrame(np.c_[val_predictions_list].T, columns=['group'+str(idx) for idx in range(len(val_predictions_list))])

In [21]:
model3 = RandomForestRegressor(random_state=42)
model3.fit(X_train_groups, y_train)

In [22]:
model3_train_metrics = calc_metrics(dataset=[X_train_groups, y_train], model=model3)

METRICS
R2: 0.4007	MAE: 0.4771	RMSE: 0.7694


In [23]:
model3_val_metrics = calc_metrics(dataset=[X_val_groups, y_val], model=model3)

METRICS
R2: 0.1986	MAE: 0.7197	RMSE: 0.9402
