# Imports

In [16]:
from model_builder import ModelBuilder
import random
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import colorsys
from scipy.interpolate import splrep, BSpline # for Spline graphs
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import seaborn as sns
plt.rcParams["font.family"] = "Times New Roman"
import math
import warnings
warnings.filterwarnings("ignore")

In [17]:
# Load the per-dataset settings from config.json (path is relative to the
# notebook's working directory).
with open("config.json", "rb") as json_file:
    config = json.load(json_file)
# Show the parsed configuration as the cell output.
config

{'becker': {'filename': 'beckerestimation_output_conversation_level.csv',
  'cols_to_ignore': ['conversation_num',
   'mean_pre_discussion_error',
   'mean_post_discussion_error',
   'mean_pre_discussion_error_pct',
   'mean_post_discussion_error_pct',
   'question',
   'chatrooms',
   'trial_indx']},
 'csop': {'filename': 'csop_output_conversation_level.csv',
  'cols_to_ignore': ['conversation_num',
   'batch_num',
   'round_num',
   'round_index',
   'task_index',
   'complexity',
   'type',
   'social_perceptiveness',
   'skill',
   'normalized_score',
   'zscore_score',
   'zscore_round_duration',
   'zscore_efficiency']},
 'csopII': {'filename': 'csopII_output_conversation_level.csv',
  'cols_to_ignore': ['conversation_num',
   'batch_num',
   'vis_img',
   'int_verb',
   'ort_img',
   'rep_man',
   'soc_pers',
   'team_size',
   'difficulty',
   'score',
   'duration',
   'efficiency',
   'timestamp']},
 'dat': {'filename': 'DAT_output_conversation_level.csv',
  'cols_to_ignore':

# Per-Task Linear Models
This notebook contains univariate linear models that show how different independent variables relate to the dependent variable(s) of interest, for each task.


In [26]:
def convert_metrics_dict_to_sorted_df(data):
    """Turn a ``{feature: metrics}`` mapping into a table ranked by R^2.

    Parameters
    ----------
    data : dict
        Maps each feature name to its metrics. Every entry must provide
        'r2', 'mae', 'mse' and 'rmse'.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns 'feature', 'r2', 'mae', 'mse'
        and 'rmse' (in that order), sorted from highest to lowest 'r2'.
    """
    ranked = (
        pd.DataFrame(data)
        .T                                      # one row per feature
        .reset_index()                          # feature names become a column
        .rename(columns={'index': 'feature'})
        .sort_values(by='r2', ascending=False)  # best R^2 first
    )
    return ranked[['feature', 'r2', 'mae', 'mse', 'rmse']]

In [33]:
def repeated_kfold_cv_simplelinear(model, k = 10, seed = 19104):
    """Average single-feature model metrics over ``k`` repeated random splits.

    For each of ``k`` repetitions the function asks ``model`` for a new
    split (``val_size = 0.2``, ``test_size = None``) using its own random
    state, then trains one model per feature of that split through
    ``model.train_simple_model`` and records the returned metrics.

    NOTE(review): every repetition requests a new random split instead of
    rotating through k disjoint folds, so this looks like repeated hold-out
    validation rather than classic k-fold CV -- confirm against
    ``get_split_datasets``.

    Parameters
    ----------
    model : object
        Must expose ``X`` and ``X_train`` (DataFrames), ``baseline_model``,
        ``get_split_datasets(...)`` and ``train_simple_model(...)``. The
        latter must return a mapping with 'train' and 'val' entries, each
        holding one set of metrics (a dict or Series).
    k : int, default 10
        Number of repetitions (random splits).
    seed : int, default 19104
        Seed for the global ``random`` module, from which the per-repetition
        random states are drawn.

    Returns
    -------
    (dict, dict)
        ``(train_means, validation_means)``. Each maps a feature name to a
        pandas Series of metrics averaged over the repetitions in which the
        feature appeared. Features that never appeared map to all-NaN
        metrics.
    """
    metrics = ['r2', 'mae', 'mse', 'rmse']

    def _mean_metrics(records):
        # Average one feature's metric records; NaN when it was never fitted.
        if not records:
            return pd.Series(float("nan"), index=metrics)
        frame = pd.DataFrame(records)
        # Keep the standard metrics first, followed by any extra ones reported.
        extra_columns = [column for column in frame.columns if column not in metrics]
        return frame.reindex(columns=metrics + extra_columns).mean()

    # Assumes the datasets were already split before this function is called.
    # Results are gathered in plain lists and turned into a DataFrame once at
    # the end: DataFrame.append was removed in pandas 2.0 and growing a frame
    # row by row is quadratic.
    per_feature_records = {
        feature: {"train": [], "test": []} for feature in model.X.columns
    }

    random.seed(seed) # set seed for reproducibility
    random_states_list = [random.randint(100, 1000000) for _ in range(k)]

    for random_state in random_states_list:
        # create an entirely different train/validation split
        model.get_split_datasets(model.baseline_model, val_size = 0.2, test_size = None, random_state = random_state)

        # Snapshot the feature names: the split is regenerated inside the loop,
        # so iterating over the live X_train object would be fragile.
        for feature in list(model.X_train):
            # train a model on just this one feature
            evaluation_metrics = model.train_simple_model(model.baseline_model, feature_subset = [feature])

            # setdefault tolerates a feature that is in the split but not in model.X
            records = per_feature_records.setdefault(feature, {"train": [], "test": []})
            records["train"].append(evaluation_metrics['train'])
            records["test"].append(evaluation_metrics['val'])

            # reset the split, as training modifies the underlying datasets
            model.get_split_datasets(model.baseline_model, val_size = 0.2, test_size = None, random_state = random_state)

    # Collapse the per-repetition records to one mean per metric and feature.
    final_feature_metrics_train = {
        feature: _mean_metrics(records["train"])
        for feature, records in per_feature_records.items()
    }
    final_feature_metrics_test = {
        feature: _mean_metrics(records["test"])
        for feature, records in per_feature_records.items()
    }

    return final_feature_metrics_train, final_feature_metrics_test


# Jury

### Full Data (100%)

In [31]:
# Define the basic model
juries_model = ModelBuilder(dataset_names = ["juries"])
juries_model.select_target(target=["majority_pct"])
juries_model.define_model(model_type = 'linear')
juries_model.get_split_datasets(juries_model.baseline_model, val_size = 0.2, test_size = None)

Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done


In [34]:
# Run the repeated-split evaluation with its default k and seed; the result is
# a pair of per-feature mean-metric dictionaries (train, validation).
final_feature_metrics_train,final_feature_metrics_test=repeated_kfold_cv_simplelinear(juries_model)

Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done
Checking Holdout Sets...Creating Holdout Sets...
Cleaning Up Columns...
Done

In [None]:
# Convert the per-feature mean metrics into tables sorted by descending R^2,
# one for the training results and one for the validation results.
jury_train_metrics=convert_metrics_dict_to_sorted_df(final_feature_metrics_train)
jury_test_metrics=convert_metrics_dict_to_sorted_df(final_feature_metrics_test)

In [None]:
# Global seaborn plotting context for every figure below.
# NOTE(review): the second call appears to reset all context parameters
# (including the axes.labelsize set by the first call), which would make the
# first call a no-op -- confirm which setting is intended.
sns.set_context("paper", rc={"axes.labelsize":20})
sns.set_context("talk", font_scale=1.4)

def plot_single_linear_model(conversation_data, x_vars, y_vars, num_top_plots=None):
    """Scatter-plot (x, y) pairs and optionally rank univariate linear fits.

    Parameters
    ----------
    conversation_data : pandas.DataFrame
        Table containing every column named in ``x_vars`` and ``y_vars``.
    x_vars : iterable of str or pandas.DataFrame
        Predictor column names. A DataFrame is accepted, in which case its
        column labels are used (iterating a DataFrame yields its columns).
    y_vars : iterable of str
        Response column names.
    num_top_plots : int or None, default None
        If an int, an ordinary least-squares line is fitted for every (x, y)
        pair, the pairs are ranked by in-sample R^2, and only the best
        ``num_top_plots`` pairs are plotted with their fitted line in red.
        If None, every pair is plotted as a plain scatter plot and no model
        is fitted.

    Returns
    -------
    pandas.DataFrame
        Columns 'x_var', 'y_var', 'r_squared', 'mse', 'mae'. Holds the top
        ``num_top_plots`` pairs sorted by descending R^2, or no rows when
        ``num_top_plots`` is None.
    """
    # Normalise to lists of column names. len() of a DataFrame counts rows,
    # which previously sized the figure by the number of conversations.
    x_names = list(x_vars)
    y_names = list(y_vars)
    metric_columns = ['x_var', 'y_var', 'r_squared', 'mse', 'mae']

    fitted_lines = {}  # (x_var, y_var) -> (x_data, y_pred) for the red fit line

    if num_top_plots is None:
        # No ranking requested: plot everything, return an empty metrics table
        # (the previous code raised UnboundLocalError on return here).
        metrics_df = pd.DataFrame(columns=metric_columns)
        pairs_to_plot = [(x_var, y_var) for y_var in y_names for x_var in x_names]
    else:
        all_metrics = []
        for y_var in y_names:
            y_data = conversation_data[y_var].values
            for x_var in x_names:
                x_data = conversation_data[x_var].values.reshape(-1, 1)

                # Fit and evaluate a one-feature linear regression (in-sample).
                model = LinearRegression()
                model.fit(x_data, y_data)
                y_pred = model.predict(x_data)

                all_metrics.append((
                    x_var,
                    y_var,
                    r2_score(y_data, y_pred),
                    mean_squared_error(y_data, y_pred),
                    mean_absolute_error(y_data, y_pred),
                ))
                fitted_lines[(x_var, y_var)] = (x_data, y_pred)

        metrics_df = pd.DataFrame(all_metrics, columns=metric_columns)
        metrics_df = metrics_df.sort_values(by='r_squared', ascending=False).head(num_top_plots)
        # Plot exactly the pairs reported in the returned table, best first.
        pairs_to_plot = list(zip(metrics_df['x_var'], metrics_df['y_var']))

    if pairs_to_plot:
        # Size the grid by the number of panels actually drawn.
        num_plots_per_row = min(5, len(pairs_to_plot))
        num_rows_needed = math.ceil(len(pairs_to_plot) / num_plots_per_row)
        fig, axes = plt.subplots(
            num_rows_needed,
            num_plots_per_row,
            figsize=(num_plots_per_row * 5, num_rows_needed * 5),
            squeeze=False,
        )
        flat_axes = axes.ravel()

        for ax, (x_var, y_var) in zip(flat_axes, pairs_to_plot):
            sns.scatterplot(x=x_var, y=y_var, data=conversation_data, ax=ax)
            if (x_var, y_var) in fitted_lines:
                x_data, y_pred = fitted_lines[(x_var, y_var)]
                ax.plot(x_data, y_pred, color='red')

        # Hide the unused cells of the last grid row.
        for unused_ax in flat_axes[len(pairs_to_plot):]:
            unused_ax.set_visible(False)

        fig.tight_layout()
        plt.show()

    return metrics_df

In [None]:
# Jury, full data: regress 'target_std' on each remaining column of `.conv`
# (both target columns are excluded from the predictors) and keep the 10 best
# single-feature fits by R^2.
r2_jury_simple_models = plot_single_linear_model(juries_model.conv, juries_model.conv.drop(["target_raw", "target_std"], axis = 1), ["target_std"], num_top_plots=10)

In [None]:
# Display the ranked single-feature metrics for the jury task (full data).
r2_jury_simple_models

### 75% of Data

In [None]:
# Jury task using output_dir '../output/first_75/'
# (presumably features built from the first 75% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
juries_model75 = ModelBuilder(dataset_names = ["juries"], output_dir = '../output/first_75/')
juries_model75.select_target(target=["majority_pct"])
juries_model75.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_jury_simple_models75 = plot_single_linear_model(juries_model75.conv, juries_model75.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_jury_simple_models75

### 50% Data

In [None]:
# Jury task using output_dir '../output/first_50/'
# (presumably features built from the first 50% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
juries_model50 = ModelBuilder(dataset_names = ["juries"], output_dir = '../output/first_50/')
juries_model50.select_target(target=["majority_pct"])
juries_model50.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_jury_simple_models50 = plot_single_linear_model(juries_model50.conv, juries_model50.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_jury_simple_models50

### 25% of Data

In [None]:
# Jury task using output_dir '../output/first_25/'
# (presumably features built from the first 25% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
juries_model25 = ModelBuilder(dataset_names = ["juries"], output_dir = '../output/first_25/')
juries_model25.select_target(target=["majority_pct"])
juries_model25.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_jury_simple_models25 = plot_single_linear_model(juries_model25.conv, juries_model25.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_jury_simple_models25

# CSOP (blended)

### Full Data (100%)

In [None]:
# Blend the 'csop' and 'csopII' datasets into one model.
# The target list presumably gives one target column per dataset, in the same
# order as dataset_names -- confirm against ModelBuilder.select_target.
# NOTE(review): model_type 'rf' is configured although this notebook fits
# linear models -- confirm it is intended.
csop_blended_model = ModelBuilder(dataset_names = ["csop", "csopII"])
csop_blended_model.select_target(target=["zscore_efficiency", "efficiency"])
csop_blended_model.define_model(model_type = 'rf')

In [None]:
# CSOP (blended), full data: regress 'target_std' on each remaining column of
# `.conv` and keep the 10 best single-feature fits by R^2.
r2_csop_simple_models = plot_single_linear_model(csop_blended_model.conv, csop_blended_model.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)

In [None]:
# Display the ranked single-feature metrics for the blended CSOP task (full data).
r2_csop_simple_models

### 75% Data

In [None]:
# Blended CSOP task using output_dir '../output/first_75/'
# (presumably features built from the first 75% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
csop_blended_model75 = ModelBuilder(dataset_names = ["csop", "csopII"], output_dir = '../output/first_75/')
csop_blended_model75.select_target(target=["zscore_efficiency", "efficiency"])
csop_blended_model75.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_csop_simple_models75 = plot_single_linear_model(csop_blended_model75.conv, csop_blended_model75.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_csop_simple_models75

### 50% Data

In [None]:
# Blended CSOP task using output_dir '../output/first_50/'
# (presumably features built from the first 50% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
csop_blended_model50 = ModelBuilder(dataset_names = ["csop", "csopII"], output_dir = '../output/first_50/')
csop_blended_model50.select_target(target=["zscore_efficiency", "efficiency"])
csop_blended_model50.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_csop_simple_models50 = plot_single_linear_model(csop_blended_model50.conv, csop_blended_model50.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_csop_simple_models50

### 25% Data

In [None]:
# Blended CSOP task using output_dir '../output/first_25/'
# (presumably features built from the first 25% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
csop_blended_model25 = ModelBuilder(dataset_names = ["csop", "csopII"], output_dir = '../output/first_25/')
csop_blended_model25.select_target(target=["zscore_efficiency", "efficiency"])
csop_blended_model25.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_csop_simple_models25 = plot_single_linear_model(csop_blended_model25.conv, csop_blended_model25.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_csop_simple_models25

# DAT

### Full data (100%)

In [None]:
# DAT task, full data, with target 'efficiency'.
# NOTE(review): model_type 'rf' is configured although this notebook fits
# linear models -- confirm it is intended.
dat_model = ModelBuilder(dataset_names = ["dat"])
dat_model.select_target(target=["efficiency"])
dat_model.define_model(model_type = 'rf')

In [None]:
# DAT, full data: regress 'target_std' on each remaining column of `.conv`
# and keep the 10 best single-feature fits by R^2.
r2_dat_simple_models = plot_single_linear_model(dat_model.conv, dat_model.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots = 10)

In [None]:
# Display the ranked single-feature metrics for the DAT task (full data).
r2_dat_simple_models

### 75% Data

In [None]:
# DAT task using output_dir '../output/first_75/'
# (presumably features built from the first 75% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
dat_model75 = ModelBuilder(dataset_names = ["dat"], output_dir = '../output/first_75/')
dat_model75.select_target(target=["efficiency"])
dat_model75.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_dat_simple_models75 = plot_single_linear_model(dat_model75.conv, dat_model75.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_dat_simple_models75

### 50% Data

In [None]:
# DAT task using output_dir '../output/first_50/'
# (presumably features built from the first 50% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
dat_model50 = ModelBuilder(dataset_names = ["dat"], output_dir = '../output/first_50/')
dat_model50.select_target(target=["efficiency"])
dat_model50.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_dat_simple_models50 = plot_single_linear_model(dat_model50.conv, dat_model50.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_dat_simple_models50

### 25% Data

In [None]:
# DAT task using output_dir '../output/first_25/'
# (presumably features built from the first 25% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
dat_model25 = ModelBuilder(dataset_names = ["dat"], output_dir = '../output/first_25/')
dat_model25.select_target(target=["efficiency"])
dat_model25.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_dat_simple_models25 = plot_single_linear_model(dat_model25.conv, dat_model25.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_dat_simple_models25

# Estimation

### Full Data (100%)

In [None]:
# Blend the 'gurcay' and 'becker' estimation datasets into one model.
# The same target name appears twice, presumably one entry per dataset in
# dataset_names -- confirm against ModelBuilder.select_target.
# NOTE(review): model_type 'rf' is configured although this notebook fits
# linear models -- confirm it is intended.
estimation_blended_model = ModelBuilder(dataset_names = ["gurcay", "becker"])
estimation_blended_model.select_target(target=["mean_post_discussion_error_pct", "mean_post_discussion_error_pct"])
estimation_blended_model.define_model(model_type = 'rf')

In [None]:
# Estimation (blended), full data: regress 'target_std' on each remaining
# column of `.conv` and keep the 10 best single-feature fits by R^2.
r2_estimation_simple_models = plot_single_linear_model(estimation_blended_model.conv, estimation_blended_model.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots = 10)

In [None]:
# Display the ranked single-feature metrics for the blended estimation task (full data).
r2_estimation_simple_models

### 75% Data

In [None]:
# Blended estimation task using output_dir '../output/first_75/'
# (presumably features built from the first 75% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
estimation_blended_model75 = ModelBuilder(dataset_names = ["gurcay", "becker"],  output_dir = '../output/first_75/')
estimation_blended_model75.select_target(target=["mean_post_discussion_error_pct", "mean_post_discussion_error_pct"])
estimation_blended_model75.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_estimation_simple_models75 = plot_single_linear_model(estimation_blended_model75.conv, estimation_blended_model75.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_estimation_simple_models75

### 50% Data

In [None]:
# Blended estimation task using output_dir '../output/first_50/'
# (presumably features built from the first 50% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
estimation_blended_model50 = ModelBuilder(dataset_names = ["gurcay", "becker"],  output_dir = '../output/first_50/')
estimation_blended_model50.select_target(target=["mean_post_discussion_error_pct", "mean_post_discussion_error_pct"])
estimation_blended_model50.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_estimation_simple_models50 = plot_single_linear_model(estimation_blended_model50.conv, estimation_blended_model50.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_estimation_simple_models50

### 25% Data

In [None]:
# Blended estimation task using output_dir '../output/first_25/'
# (presumably features built from the first 25% of the data -- confirm).
# NOTE(review): model_type 'rf' is configured but only `.conv` is read in this
# cell -- confirm define_model is needed here.
estimation_blended_model25 = ModelBuilder(dataset_names = ["gurcay", "becker"],  output_dir = '../output/first_25/')
estimation_blended_model25.select_target(target=["mean_post_discussion_error_pct", "mean_post_discussion_error_pct"])
estimation_blended_model25.define_model(model_type = 'rf')

# Rank single-feature linear fits against 'target_std'; keep and show the top 10.
r2_estimation_simple_models25 = plot_single_linear_model(estimation_blended_model25.conv, estimation_blended_model25.conv.drop(["target_raw", "target_std"], axis=1), ["target_std"], num_top_plots=10)
r2_estimation_simple_models25