# Imports

In [22]:
from model_builder import ModelBuilder
import random
import json
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [23]:
# Load the per-dataset configuration (file name and columns to ignore for each dataset).
# json.load accepts a binary file handle, so "rb" mode works here.
with open("config.json", "rb") as json_file:
    config = json.load(json_file)
# Bare expression so the notebook displays the parsed configuration.
config

{'becker': {'filename': 'beckerestimation_output_conversation_level.csv',
  'cols_to_ignore': ['conversation_num',
   'mean_pre_discussion_error',
   'mean_post_discussion_error',
   'mean_pre_discussion_error_pct',
   'mean_post_discussion_error_pct',
   'question',
   'chatrooms',
   'trial_indx']},
 'csop': {'filename': 'csop_output_conversation_level.csv',
  'cols_to_ignore': ['conversation_num',
   'batch_num',
   'round_num',
   'round_index',
   'task_index',
   'complexity',
   'type',
   'social_perceptiveness',
   'skill',
   'normalized_score',
   'zscore_score',
   'zscore_round_duration',
   'zscore_efficiency']},
 'csopII': {'filename': 'csopII_output_conversation_level.csv',
  'cols_to_ignore': ['conversation_num',
   'batch_num',
   'vis_img',
   'int_verb',
   'ort_img',
   'rep_man',
   'soc_pers',
   'team_size',
   'difficulty',
   'score',
   'duration',
   'efficiency',
   'timestamp']},
 'dat': {'filename': 'DAT_output_conversation_level.csv',
  'cols_to_ignore':

# Repeated k-Fold Cross Validation
For each model, fit it k (= 10) times, tracking all metrics as well as interpretability (Shapley value summaries) across the repeats.
This gives us a less noisy estimate of our models' performance.

In [43]:
def _metrics_frame(rows, metric_names):
    """Stack per-repeat metric records into a DataFrame with one row per repeat."""
    frame = pd.DataFrame(list(rows))
    # Keep the expected metric columns first; any extra keys follow (this mirrors
    # the column union that repeated DataFrame.append calls used to produce).
    extra_cols = [col for col in frame.columns if col not in metric_names]
    return frame.reindex(columns=list(metric_names) + extra_cols)


def _merge_repeat_column(accumulated, shap_summary, source_col, new_col):
    """Join one repeat's `source_col` onto `accumulated` by feature, named `new_col`."""
    merged = pd.merge(accumulated, shap_summary[['feature', source_col]], on='feature')
    return merged.rename(columns={source_col: new_col})


def repeated_kfold_cv(model, k=10, seed=19104):
    """
    Evaluate `model` k times, each with a different random state, and collect
    the metrics and Shapley summaries from every repeat.

    NOTE(review): this calls model.evaluate_model k times with val_size=0.2 and a
    different random_state each time, rather than partitioning the data into k
    folds. This assumes evaluate_model draws a fresh train/validation split per
    random_state -- confirm against ModelBuilder. The function name is kept so
    existing callers keep working.

    Parameters:
    - model: The model we are evaluating; must provide `baseline_model`,
      `evaluate_model(...)`, and (after evaluation) `X` and `shap_summary`
    - k: the number of repeats (defaults to 10); must be at least 1
    - seed: the random seed used to derive the k random states (defaults to 19104)

    @returns a tuple, in this order:
    - shap_df: indexed by feature, with one column `shap_<i>` per repeat
    - shap_correlation_df: indexed by feature, with one column `cor_<i>` per repeat
      (how the Shapley values correlate with the feature values)
    - train_metrics: a dataframe with one row of training metrics per repeat
    - test_metrics: a dataframe with one row of validation-set metrics per repeat

    @raises ValueError if k is smaller than 1
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Seed the module-level RNG so the derived random states are reproducible.
    random.seed(seed)
    random_states = [random.randint(100, 1000000) for _ in range(k)]

    metric_names = ['r2', 'mae', 'mse', 'rmse']
    train_rows = []
    val_rows = []
    shap_df = None
    shap_correlation_df = None

    for repeat, random_state in enumerate(random_states, start=1):
        # Store the model metrics for this repeat.
        repeat_metrics = model.evaluate_model(
            model.baseline_model,
            val_size=0.2,
            test_size=None,
            random_state=random_state,
            visualize_model=False,
        )
        train_rows.append(repeat_metrics['train'])
        val_rows.append(repeat_metrics['val'])

        if shap_df is None:
            # model.X is only read after the first evaluate_model call,
            # so the feature frames are created lazily here.
            shap_df = pd.DataFrame({'feature': model.X.columns})
            shap_correlation_df = pd.DataFrame({'feature': model.X.columns})

        # Store the shap summary for this repeat.
        shap_summary = model.shap_summary
        shap_df = _merge_repeat_column(
            shap_df, shap_summary, 'shap', f'shap_{repeat}'
        )
        shap_correlation_df = _merge_repeat_column(
            shap_correlation_df,
            shap_summary,
            'correlation_btw_shap_and_feature_value',
            f'cor_{repeat}',
        )

    shap_df = shap_df.set_index('feature')
    shap_correlation_df = shap_correlation_df.set_index('feature')

    # Build the metric frames once at the end; DataFrame.append was removed in pandas 2.0.
    train_metrics = _metrics_frame(train_rows, metric_names)
    test_metrics = _metrics_frame(val_rows, metric_names)

    return (shap_df, shap_correlation_df, train_metrics, test_metrics)

In [36]:
def get_repeated_kfold_cv_summary(shap_df, shap_correlation_df, train_metrics, test_metrics):
    """
    Average the outputs of repeated_kfold_cv over the repeats.

    Parameters:
    - shap_df, shap_correlation_df: frames that are averaged row-wise (axis=1)
    - train_metrics, test_metrics: frames that are averaged column-wise

    @returns a tuple, in this order:
    - shap_means: row-wise mean of shap_df, sorted from largest to smallest
    - shap_cor_means: row-wise mean of shap_correlation_df, put in the same
      index order as shap_means
    - train_means: column-wise mean of train_metrics
    - test_means: column-wise mean of test_metrics
    """
    per_row_shap = shap_df.mean(axis=1)
    shap_means = per_row_shap.sort_values(ascending=False)

    # Align the correlation means with the sorted SHAP means so the two
    # series can be read side by side.
    per_row_cor = shap_correlation_df.mean(axis=1)
    shap_cor_means = per_row_cor.reindex(index=shap_means.index)

    return (shap_means, shap_cor_means, train_metrics.mean(), test_metrics.mean())

# Per-Task Baseline Models
This notebook contains the _baseline models_ for each task. This allows us to get a quick understanding of the predictive features for each task (separately).

The lists of models are:

(1) Random Forests for each task:
- Jury
- CSOP Blended (across two datasets)
- CSOP Train -> CSOP II split
- DAT
- Estimation (Becker + Gurcay datasets, blended)

(2) Early Cut-off Models (Train model only on the first X% of the messages, so that we avoid issues where the final stages of the discussion reveal the outcome)
- 50% (all datasets)
- 80% (all datasets)


# Jury

### Full Data (100%)

In [24]:
# Jury baseline: a random-forest ('rf') model on the "juries" dataset,
# predicting the "majority_pct" target.
juries_model = ModelBuilder(dataset_names=["juries"])
juries_model.select_target(target=["majority_pct"])
juries_model.define_model(model_type='rf')

In [44]:
# Run the repeated evaluation for the jury model, then average over the repeats.
(
    jury_shap,
    jury_shap_cor,
    jury_train_metrics,
    jury_test_metrics,
) = repeated_kfold_cv(juries_model)
(
    jury_shap_means,
    jury_shap_cor_means,
    jury_train_means,
    jury_test_means,
) = get_repeated_kfold_cv_summary(
    jury_shap,
    jury_shap_cor,
    jury_train_metrics,
    jury_test_metrics,
)

Checking Holdout Sets...Creating Holdout Sets...
Done
Training Model...Done
MODEL METRICS
Train Set:	R2: 0.865	MAE: 0.3072	MSE: 0.132	RMSE: 0.3633
Validation Set:	R2: 0.1916	MAE: 0.8327	MSE: 0.8657	RMSE: 0.9304
Checking Holdout Sets...Creating Holdout Sets...
Done
Training Model...Done
MODEL METRICS
Train Set:	R2: 0.8706	MAE: 0.302	MSE: 0.1258	RMSE: 0.3547
Validation Set:	R2: 0.1661	MAE: 0.8567	MSE: 0.9133	RMSE: 0.9557
Checking Holdout Sets...Creating Holdout Sets...
Done
Training Model...Done
MODEL METRICS
Train Set:	R2: 0.8582	MAE: 0.3309	MSE: 0.1452	RMSE: 0.3811
Validation Set:	R2: 0.163	MAE: 0.7017	MSE: 0.7402	RMSE: 0.8603
Checking Holdout Sets...Creating Holdout Sets...
Done
Training Model...Done
MODEL METRICS
Train Set:	R2: 0.87	MAE: 0.3105	MSE: 0.1336	RMSE: 0.3655
Validation Set:	R2: 0.0235	MAE: 0.7849	MSE: 0.8558	RMSE: 0.9251
Checking Holdout Sets...Creating Holdout Sets...
Done
Training Model...Done
MODEL METRICS
Train Set:	R2: 0.8684	MAE: 0.3135	MSE: 0.1334	RMSE: 0.3652
Valid

In [46]:
# Show the first 10 features by mean `shap` value across the repeats
# (the series is already sorted in descending order).
jury_shap_means.head(10)

feature
average_cognitive_mech             0.092750
average_positivity_zscore_chats    0.070489
average_verbs                      0.045453
min_positivity_zscore_chats        0.032011
average_discrepancies              0.028408
average_1st_person_pl.             0.022639
average_auxiliary_verbs            0.016818
discursive_diversity               0.013122
average_certainty                  0.013093
average_hashedge                   0.012871
dtype: float64

In [47]:
# Mean correlation between Shapley values and feature values for the first 10
# features, listed in the same feature order as jury_shap_means.
jury_shap_cor_means.head(10)

feature
average_cognitive_mech            -0.756091
average_positivity_zscore_chats    0.733718
average_verbs                     -0.699445
min_positivity_zscore_chats        0.769061
average_discrepancies             -0.589950
average_1st_person_pl.             0.786318
average_auxiliary_verbs           -0.629803
discursive_diversity              -0.660073
average_certainty                  0.830832
average_hashedge                  -0.582437
dtype: float64

In [48]:
# Training-set metrics averaged over the repeats.
jury_train_means

r2      0.86916
mae     0.30918
mse     0.13152
rmse    0.36258
dtype: float64

In [49]:
# Validation-set metrics averaged over the repeats.
jury_test_means

r2      0.08600
mae     0.81164
mse     0.87678
rmse    0.93535
dtype: float64

### 80% Data

In [None]:
# Jury model for the "80% Data" early cut-off variant: same setup as the
# full-data model, but reading from the first_80 output directory.
juries_model80 = ModelBuilder(
    dataset_names=["juries"],
    output_dir='../output/first_80/',
)
juries_model80.select_target(target=["majority_pct"])
juries_model80.define_model(model_type='rf')

In [None]:
# Repeated evaluation of the 80% jury model, then averages over the repeats.
(
    jury_shap80,
    jury_shap_cor80,
    jury_train_metrics80,
    jury_test_metrics80,
) = repeated_kfold_cv(juries_model80)
(
    jury_shap_means80,
    jury_shap_cor_means80,
    jury_train_means80,
    jury_test_means80,
) = get_repeated_kfold_cv_summary(
    jury_shap80,
    jury_shap_cor80,
    jury_train_metrics80,
    jury_test_metrics80,
)

### 50% Data

In [None]:
# Jury model for the "50% Data" early cut-off variant, reading from the
# first_50 output directory; a single evaluation is run right away.
juries_model50 = ModelBuilder(
    dataset_names=["juries"],
    output_dir='../output/first_50/',
)
juries_model50.select_target(target=["majority_pct"])
juries_model50.define_model(model_type='rf')
juries_model50.evaluate_model(
    juries_model50.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the 50% jury model, then averages over the repeats.
(
    jury_shap50,
    jury_shap_cor50,
    jury_train_metrics50,
    jury_test_metrics50,
) = repeated_kfold_cv(juries_model50)
(
    jury_shap_means50,
    jury_shap_cor_means50,
    jury_train_means50,
    jury_test_means50,
) = get_repeated_kfold_cv_summary(
    jury_shap50,
    jury_shap_cor50,
    jury_train_metrics50,
    jury_test_metrics50,
)

# CSOP (blended)

### Full Data (100%)

In [None]:
# Blended CSOP model over the "csop" and "csopII" datasets.
# NOTE(review): the two targets are presumably matched to the datasets by
# position -- confirm against ModelBuilder.select_target.
csop_blended_model = ModelBuilder(dataset_names=["csop", "csopII"])
csop_blended_model.select_target(target=["zscore_efficiency", "efficiency"])
csop_blended_model.define_model(model_type='rf')
csop_blended_model.evaluate_model(
    csop_blended_model.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the blended CSOP model, then averages over the repeats.
(
    csop_shap,
    csop_shap_cor,
    csop_train_metrics,
    csop_test_metrics,
) = repeated_kfold_cv(csop_blended_model)
(
    csop_shap_means,
    csop_shap_cor_means,
    csop_train_means,
    csop_test_means,
) = get_repeated_kfold_cv_summary(
    csop_shap,
    csop_shap_cor,
    csop_train_metrics,
    csop_test_metrics,
)

### 80% Data

In [None]:
# Blended CSOP model for the "80% Data" early cut-off variant
# (reads from the first_80 output directory).
csop_blended_model80 = ModelBuilder(
    dataset_names=["csop", "csopII"],
    output_dir='../output/first_80/',
)
csop_blended_model80.select_target(target=["zscore_efficiency", "efficiency"])
csop_blended_model80.define_model(model_type='rf')
csop_blended_model80.evaluate_model(
    csop_blended_model80.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the 80% blended CSOP model, then averages over the repeats.
(
    csop_shap80,
    csop_shap_cor80,
    csop_train_metrics80,
    csop_test_metrics80,
) = repeated_kfold_cv(csop_blended_model80)
(
    csop_shap_means80,
    csop_shap_cor_means80,
    csop_train_means80,
    csop_test_means80,
) = get_repeated_kfold_cv_summary(
    csop_shap80,
    csop_shap_cor80,
    csop_train_metrics80,
    csop_test_metrics80,
)

### 50% Data

In [None]:
# Blended CSOP model for the "50% Data" early cut-off variant
# (reads from the first_50 output directory).
csop_blended_model50 = ModelBuilder(
    dataset_names=["csop", "csopII"],
    output_dir='../output/first_50/',
)
csop_blended_model50.select_target(target=["zscore_efficiency", "efficiency"])
csop_blended_model50.define_model(model_type='rf')
csop_blended_model50.evaluate_model(
    csop_blended_model50.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the 50% blended CSOP model, then averages over the repeats.
(
    csop_shap50,
    csop_shap_cor50,
    csop_train_metrics50,
    csop_test_metrics50,
) = repeated_kfold_cv(csop_blended_model50)
(
    csop_shap_means50,
    csop_shap_cor_means50,
    csop_train_means50,
    csop_test_means50,
) = get_repeated_kfold_cv_summary(
    csop_shap50,
    csop_shap_cor50,
    csop_train_metrics50,
    csop_test_metrics50,
)

# CSOP (Train on CSOP I -> Test on CSOP II)

### Full Data (100%)

In [None]:
# csop_model = ModelBuilder(dataset_names = ["csop"], test_dataset_names=["csopII"])
# csop_model.select_target(target = "zscore_efficiency")
# csop_model.select_test_target(target = "efficiency")
# csop_model.define_model(model_type = 'rf')
# csop_model.evaluate_model(csop_model.baseline_model)

### 80% Data

In [None]:
# csop_model80 = ModelBuilder(dataset_names = ["csop"], test_dataset_names=["csopII"], output_dir = '../output/first_80/')
# csop_model80.select_target(target = "zscore_efficiency")
# csop_model80.select_test_target(target = "efficiency")
# csop_model80.define_model(model_type = 'rf')
# csop_model80.evaluate_model(csop_model80.baseline_model)

### 50% Data

In [None]:
# csop_model50 = ModelBuilder(dataset_names = ["csop"], test_dataset_names=["csopII"], output_dir = '../output/first_50/')
# csop_model50.select_target(target = "zscore_efficiency")
# csop_model50.select_test_target(target = "efficiency")
# csop_model50.define_model(model_type = 'rf')
# csop_model50.evaluate_model(csop_model50.baseline_model)

# DAT

### Full data (100%)

In [None]:
# DAT baseline: a random-forest ('rf') model on the "dat" dataset,
# predicting the "efficiency" target.
dat_model = ModelBuilder(dataset_names=["dat"])
dat_model.select_target(target=["efficiency"])
dat_model.define_model(model_type='rf')
dat_model.evaluate_model(
    dat_model.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the DAT model, then averages over the repeats.
(
    dat_shap,
    dat_shap_cor,
    dat_train_metrics,
    dat_test_metrics,
) = repeated_kfold_cv(dat_model)
(
    dat_shap_means,
    dat_shap_cor_means,
    dat_train_means,
    dat_test_means,
) = get_repeated_kfold_cv_summary(
    dat_shap,
    dat_shap_cor,
    dat_train_metrics,
    dat_test_metrics,
)

### 80% Data

In [None]:
# DAT model for the "80% Data" early cut-off variant
# (reads from the first_80 output directory).
dat_model80 = ModelBuilder(
    dataset_names=["dat"],
    output_dir='../output/first_80/',
)
dat_model80.select_target(target=["efficiency"])
dat_model80.define_model(model_type='rf')
dat_model80.evaluate_model(
    dat_model80.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the 80% DAT model, then averages over the repeats.
(
    dat_shap80,
    dat_shap_cor80,
    dat_train_metrics80,
    dat_test_metrics80,
) = repeated_kfold_cv(dat_model80)
(
    dat_shap_means80,
    dat_shap_cor_means80,
    dat_train_means80,
    dat_test_means80,
) = get_repeated_kfold_cv_summary(
    dat_shap80,
    dat_shap_cor80,
    dat_train_metrics80,
    dat_test_metrics80,
)

### 50% Data

In [None]:
# DAT model for the "50% Data" early cut-off variant
# (reads from the first_50 output directory).
dat_model50 = ModelBuilder(
    dataset_names=["dat"],
    output_dir='../output/first_50/',
)
dat_model50.select_target(target=["efficiency"])
dat_model50.define_model(model_type='rf')
dat_model50.evaluate_model(
    dat_model50.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the 50% DAT model, then averages over the repeats.
(
    dat_shap50,
    dat_shap_cor50,
    dat_train_metrics50,
    dat_test_metrics50,
) = repeated_kfold_cv(dat_model50)
(
    dat_shap_means50,
    dat_shap_cor_means50,
    dat_train_means50,
    dat_test_means50,
) = get_repeated_kfold_cv_summary(
    dat_shap50,
    dat_shap_cor50,
    dat_train_metrics50,
    dat_test_metrics50,
)

# Estimation

### Full Data (100%)

In [None]:
# Blended estimation model over the "gurcay" and "becker" datasets.
# NOTE(review): the same target name is listed twice, presumably once per
# dataset by position -- confirm against ModelBuilder.select_target.
estimation_blended_model = ModelBuilder(dataset_names=["gurcay", "becker"])
estimation_blended_model.select_target(
    target=["mean_post_discussion_error_pct", "mean_post_discussion_error_pct"],
)
estimation_blended_model.define_model(model_type='rf')
estimation_blended_model.evaluate_model(
    estimation_blended_model.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the blended estimation model, then averages over the repeats.
(
    estimation_shap,
    estimation_shap_cor,
    estimation_train_metrics,
    estimation_test_metrics,
) = repeated_kfold_cv(estimation_blended_model)
(
    estimation_shap_means,
    estimation_shap_cor_means,
    estimation_train_means,
    estimation_test_means,
) = get_repeated_kfold_cv_summary(
    estimation_shap,
    estimation_shap_cor,
    estimation_train_metrics,
    estimation_test_metrics,
)

### 80% Data

In [None]:
# Blended estimation model for the "80% Data" early cut-off variant
# (reads from the first_80 output directory).
estimation_blended_model80 = ModelBuilder(
    dataset_names=["gurcay", "becker"],
    output_dir='../output/first_80/',
)
estimation_blended_model80.select_target(
    target=["mean_post_discussion_error_pct", "mean_post_discussion_error_pct"],
)
estimation_blended_model80.define_model(model_type='rf')
estimation_blended_model80.evaluate_model(
    estimation_blended_model80.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the 80% blended estimation model, then averages over the repeats.
(
    estimation_shap80,
    estimation_shap_cor80,
    estimation_train_metrics80,
    estimation_test_metrics80,
) = repeated_kfold_cv(estimation_blended_model80)
(
    estimation_shap_means80,
    estimation_shap_cor_means80,
    estimation_train_means80,
    estimation_test_means80,
) = get_repeated_kfold_cv_summary(
    estimation_shap80,
    estimation_shap_cor80,
    estimation_train_metrics80,
    estimation_test_metrics80,
)

### 50% Data

In [None]:
# Blended estimation model for the "50% Data" early cut-off variant
# (reads from the first_50 output directory).
estimation_blended_model50 = ModelBuilder(
    dataset_names=["gurcay", "becker"],
    output_dir='../output/first_50/',
)
estimation_blended_model50.select_target(
    target=["mean_post_discussion_error_pct", "mean_post_discussion_error_pct"],
)
estimation_blended_model50.define_model(model_type='rf')
estimation_blended_model50.evaluate_model(
    estimation_blended_model50.baseline_model,
    val_size=0.2,
    test_size=None,
)

In [None]:
# Repeated evaluation of the 50% blended estimation model, then averages over the repeats.
(
    estimation_shap50,
    estimation_shap_cor50,
    estimation_train_metrics50,
    estimation_test_metrics50,
) = repeated_kfold_cv(estimation_blended_model50)
(
    estimation_shap_means50,
    estimation_shap_cor_means50,
    estimation_train_means50,
    estimation_test_means50,
) = get_repeated_kfold_cv_summary(
    estimation_shap50,
    estimation_shap_cor50,
    estimation_train_metrics50,
    estimation_test_metrics50,
)