In [77]:
from model_builder import ModelBuilder
from sklearn.linear_model import Lasso, Ridge, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import json
import os
from model_utils import *
import scipy.stats as stats 

## Multi-Task Modeling Playground

The goal of this notebook is to prototype the modeling process for the team ingredient horse race project on the multi-task data.

For *each task* ...

We want to understand the effect of different "ingredients of a team" (Composition, Task, and Conversation) on its ultimate performance.

Team Composition:
- ['birth_year', 'CRT', 'income_max', 'income_min', 'IRCS_GS', 'IRCS_GV', 'IRCS_IB', 'IRCS_IR', 'IRCS_IV', 'IRCS_RS', 'political_fiscal', 'political_social', 'RME', 'country', 'education_level', 'gender', 'marital_status', 'political_party', 'race']
- Number of players: 'playerCount'

Task Features:
- Need to append from the Task Map
- ['complexity', 'task']

Conversation Features (All)
- Everything else that is NOT an ID or a dependent variable

# Read and Preprocess Data
The function below reads in the dataframe and preprocesses each group of features:

- Composition
- Task
- Conversation

And also parses out the possible dependent variables.

In [78]:
def drop_invariant_columns(df):
    """
    Certain features are invariant throughout the training data (e.g., the entire column is 0 throughout).

    These feature obviously won't be very useful predictors, so we drop them.
    
    This function works by identifying columns that only have 1 unique value throughout the entire column,
    and then dropping them.

    @df: the dataframe containing the features (this should be X).
    """
    nunique = df.nunique()
    cols_to_drop = nunique[nunique == 1].index
    return(df.drop(cols_to_drop, axis=1))


In [79]:
def read_and_preprocess_data(path, min_num_chats):
    conv_data  = pd.read_csv(path)

    # Filter this down to teams that have at least min_num of chats
    # Can also comment this out to re-run results on *all* conversations!
    conv_data = conv_data[conv_data["sum_num_messages"] >= min_num_chats]


    # Save the important information

    # DV
    dvs = conv_data[["score","speed","efficiency","raw_duration_min","default_duration_min"]]

    # Team Composition
    composition_colnames = ['birth_year', 'CRT', 'income_max', 'income_min', 'IRCS_GS', 'IRCS_GV', 'IRCS_IB', 'IRCS_IR',
                'IRCS_IV', 'IRCS_RS', 'political_fiscal', 'political_social', 'RME', 'country', 'education_level',
                'gender', 'marital_status', 'political_party', 'race', 'playerCount']
    
    # Select columns that contain the specified keywords
    composition = conv_data[[col for col in conv_data.columns if any(keyword in col for keyword in composition_colnames)]]

    # Task
    task = conv_data[['task', 'complexity']].copy()

    task_map_path = '../utils/task_map.csv' # get task map
    task_map = pd.read_csv(task_map_path)

    task_name_mapping = {
        "Moral Reasoning": "Moral Reasoning (Disciplinary Action Case)",
        "Wolf Goat Cabbage": "Wolf, goat and cabbage transfer",
        "Guess the Correlation": "Guessing the correlation",
        "Writing Story": "Writing story",
        "Room Assignment": "Room assignment task",
        "Allocating Resources": "Allocating resources to programs",
        "Divergent Association": "Divergent Association Task",
        "Word Construction": "Word construction from a subset of letters",
        "Whac a Mole": "Whac-A-Mole"
    }
    task.loc[:, 'task'] = task['task'].replace(task_name_mapping)
    task = pd.merge(left=task, right=task_map, on = "task", how='left')
    
    # Create dummy columns for 'complexity'
    complexity_dummies = pd.get_dummies(task['complexity'])
    task = pd.concat([task, complexity_dummies], axis=1)   
    task.drop(['complexity', 'task'], axis=1, inplace=True)

    # Conversation
    conversation = conv_data.drop(columns=list(dvs.columns) + list(composition.columns) + ['task', 'complexity', 'stageId', 'roundId', 'cumulative_stageId', 'gameId', 'message', 'message_lower_with_punc', 'speaker_nickname', 'conversation_num', 'timestamp'])
    conversation = drop_invariant_columns(conversation) # drop invariant conv features

    return composition, task, conversation, dvs

In [80]:
tiny_multitask = 'conv/multi_task_TINY_output_conversation_level_stageId_cumulative.csv'
multitask_cumulative_by_stage = 'conv/multi_task_output_conversation_level_stageId_cumulative.csv'
multitask_cumulative_by_stage_and_task = 'conv/multi_task_output_conversation_level_stageId_cumulative_within_task.csv'

In [81]:
# PARAMETERS
min_num_chats = 0
desired_target = "score"
data_path = "../output/"
output_path = "./results/multi_task_cumulative_stage/" + "min=" + str(min_num_chats) + "/" + desired_target + "/"
validation_results_output_name = output_path + "multi_task_lasso_ridge_experiments" + "_" + desired_target + "_min_chat_num_" + str(min_num_chats) + ".csv"

In [82]:
team_composition_features, task_features, conv_features, targets = read_and_preprocess_data(data_path + multitask_cumulative_by_stage, min_num_chats=min_num_chats)

# Number of points in dataset
len(conv_features)

1018

# Set up X's and y's

In [83]:
X_train = pd.concat([team_composition_features, task_features, conv_features], axis = 1)
# X_train = X_train.fillna(-1) # TODO --- need a better way to handle NA's!
y_train = targets

In [87]:
def columns_with_na(df):
    """
    Check and return columns that contain NaN (NA) values in a DataFrame.

    Parameters:
    - df: pandas DataFrame

    Returns:
    - List of column names with NaN values
    """
    # Check for NaN values in each column
    na_columns = df.columns[df.isna().any()].tolist()

    return na_columns

# Check columns that have NA
result = columns_with_na(X_train)
for colname in result:
    print(colname)

CRT_nanmean
CRT_nanstd
IRCS_GS_nanmean
IRCS_GS_nanstd
IRCS_GV_nanmean
IRCS_GV_nanstd
IRCS_IB_nanmean
IRCS_IB_nanstd
IRCS_IR_nanmean
IRCS_IR_nanstd
IRCS_IV_nanmean
IRCS_IV_nanstd
IRCS_RS_nanmean
IRCS_RS_nanstd
RME_nanmean
RME_nanstd
birth_year_nanmean
birth_year_nanstd
gender_nanmean
gender_nanstd
income_max_nanmean
income_max_nanstd
income_min_nanmean
income_min_nanstd
marital_status_nanmean
marital_status_nanstd
political_fiscal_nanmean
political_fiscal_nanstd
political_party_nanmean
political_party_nanstd
political_social_nanmean
political_social_nanstd
average_info_exchange_zscore_conversation
stdev_info_exchange_zscore_conversation
min_info_exchange_zscore_conversation
max_info_exchange_zscore_conversation
average_positivity_zscore_conversation
stdev_positivity_zscore_conversation
min_positivity_zscore_conversation
max_positivity_zscore_conversation
average_user_avg_info_exchange_zscore_conversation
stdev_user_avg_info_exchange_zscore_conversation
min_user_avg_info_exchange_zscore_

In [84]:
X_train

Unnamed: 0,playerCount,CRT_nanmean,CRT_nanstd,IRCS_GS_nanmean,IRCS_GS_nanstd,IRCS_GV_nanmean,IRCS_GV_nanstd,IRCS_IB_nanmean,IRCS_IB_nanstd,IRCS_IR_nanmean,...,max_user_avg_indicative,average_user_avg_forward_flow,stdev_user_avg_forward_flow,min_user_avg_forward_flow,max_user_avg_forward_flow,average_user_avg_certainty_rocklage,stdev_user_avg_certainty_rocklage,min_user_avg_certainty_rocklage,max_user_avg_certainty_rocklage,discursive_diversity
0,3,0.801587,0.216042,4.0000,0.250000,5.466667,0.305505,4.416667,1.127312,4.555556,...,0.0,0.233939,0.012916,0.221023,0.246855,4.500000,0.000000,4.500000,4.500000,0.765981
1,3,0.801587,0.216042,4.0000,0.250000,5.466667,0.305505,4.416667,1.127312,4.555556,...,0.0,0.256893,0.030046,0.226848,0.286939,4.967857,0.172143,4.795714,5.140000,0.545211
2,3,0.801587,0.216042,4.0000,0.250000,5.466667,0.305505,4.416667,1.127312,4.555556,...,0.0,0.269441,0.017498,0.251942,0.286939,4.887024,0.091310,4.795714,4.978333,0.427894
3,3,0.801587,0.216042,4.0000,0.250000,5.466667,0.305505,4.416667,1.127312,4.555556,...,0.0,0.301444,0.033825,0.267619,0.335269,4.912727,0.002727,4.910000,4.915455,0.309471
4,3,0.801587,0.216042,4.0000,0.250000,5.466667,0.305505,4.416667,1.127312,4.555556,...,0.0,0.380066,0.075680,0.320679,0.486871,4.936859,0.218620,4.760909,5.245000,0.354114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013,6,0.714286,0.164957,3.9375,0.314576,4.450000,0.822598,4.875000,0.777282,5.000000,...,0.0,0.337045,0.019224,0.312320,0.359199,5.631656,0.515857,4.907133,6.067833,0.307894
1014,6,0.714286,0.164957,3.9375,0.314576,4.450000,0.822598,4.875000,0.777282,5.000000,...,0.0,0.341225,0.010103,0.327640,0.351849,5.396869,0.332159,5.080736,5.855833,0.215606
1015,6,0.714286,0.164957,3.9375,0.314576,4.450000,0.822598,4.875000,0.777282,5.000000,...,0.0,0.339925,0.020714,0.319919,0.368460,5.469278,0.335790,5.094274,5.909091,0.258001
1016,6,0.714286,0.164957,3.9375,0.314576,4.450000,0.822598,4.875000,0.777282,5.000000,...,0.0,0.330536,0.043657,0.280823,0.387100,5.510152,0.556652,5.027121,6.290000,0.540206


## Try LASSO/Ridge Regression, one Set of Features at a Time

Here, we want to implement *leave-one-out cross-validation*, and use Q^2 as our metric.



Two updates to make here:

1. For nested LASSO/Ridge models, add the ability to initialize the model using the previous weights
2. Visualize importance using another library, like SHAP

In [33]:
# Note --- this uses k-fold cross-validation with k = 5 (the default)
# We are testing 10,000 different alphas, so I feel like this is an OK heuristic
def get_optimal_alpha(y_target, feature_columns_list, lasso):

    if(lasso == True):
        model = LassoCV(n_alphas = 10000)
        model.fit(X_train[feature_columns_list], y_train[y_target])
    else:
        model = RidgeCV(n_alphas = 10000)
        model.fit(X_train[feature_columns_list], y_train[y_target])
        
    return model.alpha_ # optimal alpha

In [34]:
def fit_regularized_linear_model(y_target, feature_columns_list, lasso=True, tune_alpha=False, prev_coefs = None, prev_alpha = None):

    if not tune_alpha:
        alpha = 1.0
    if (prev_alpha is not None):
        alpha = prev_alpha # use previous alpha
        print("Setting alpha to previous...")
        print(alpha)
    else:
        # Hyperparameter tune the alpha
        alpha = get_optimal_alpha(y_target, feature_columns_list, lasso=True)

    if lasso:
        model = Lasso(alpha=alpha)
    else:
        model = Ridge(alpha=alpha)

    if(prev_coefs is not None): # set weights to previous coefficients
        print("Setting coefficients ....")
        model.coef_ = prev_coefs

        print(model.coef_)

    # Calculation of Q^2 metric
    squared_model_prediction_errors = []
    squared_average_prediction_errors = []

    # Initialize a list to store coefficients
    coefficients_list = []

    # Leave one out -- iterate through the entire length of the dataset
    for i in range(len(y_train)):
        # Store the evaluation datapoint
        evaluation_X = X_train.iloc[[i]]
        evaluation_y = y_train.iloc[[i]][y_target]

        # Drop the ith datapoint (leave this one out)
        X_train_fold = X_train.drop(X_train.index[i])
        y_train_fold = y_train.drop(y_train.index[i])[y_target]

        # Fit the model
        model.fit(X_train_fold[feature_columns_list], y_train_fold)

        # Save the Prediction Error
        prediction = model.predict(evaluation_X[feature_columns_list])[0]
        squared_model_prediction_errors.append((evaluation_y - prediction) ** 2)

        # Save the Total Error for this fold
        squared_average_prediction_errors.append((evaluation_y - np.mean(y_train_fold)) ** 2)

        # Append the coefficients to the list
        coefficients_list.append(model.coef_)

    # Create a DataFrame with feature names as rows and iteration results as columns
    feature_coefficients = pd.DataFrame(coefficients_list, columns=feature_columns_list).T

    q_squared = 1 - (np.sum(squared_model_prediction_errors) / np.sum(squared_average_prediction_errors))
    print("Q^2: " + str(q_squared))

    return model, q_squared, feature_coefficients


In [35]:
def display_feature_coefficients(feature_coef_df):
    # Initialize a list to store DataFrames for each feature
    dfs = []

    # Iterate through the rows of the input DataFrame
    for feature_name, coefficients in feature_coef_df.iterrows():
        # Calculate the confidence interval without NaN values
        non_nan_coefficients = coefficients[~np.isnan(coefficients)]
        if len(non_nan_coefficients) == 0:
            # Handle the case where there are no valid coefficients
            continue

        mean_coef = non_nan_coefficients.mean()

        # Check if all coefficients in the row are the same
        if len(coefficients.unique()) == 1:
            # If all coefficients are the same, set the lower and upper CI to the mean
            confidence_interval = (mean_coef, mean_coef)
        else:
            std_error = non_nan_coefficients.sem()
            confidence_interval = stats.t.interval(0.95, len(non_nan_coefficients) - 1, loc=mean_coef, scale=std_error)

        # Create a DataFrame for the summary data
        temp_df = pd.DataFrame({
            "Feature": [feature_name],
            "Mean": [mean_coef],
            "Lower_CI": [confidence_interval[0]],
            "Upper_CI": [confidence_interval[1]]
        })

        # Append the temporary DataFrame to the list
        dfs.append(temp_df)

    # Concatenate all the DataFrames in the list into the final summary DataFrame
    summary_df = pd.concat(dfs, ignore_index=True)

    return summary_df

In [36]:
def sort_by_mean_abs(df):
    return df.reindex(df["Mean"].abs().sort_values(ascending=False).index)

In [37]:
# Go through the different types of features and fit models

# First, create a data structure that saves the result
result = {
    "model": [],
    "model_type": [],
    "features_included": [],
    "alpha": [],
    "q_squared": []
}

result_df = pd.DataFrame(result)

## Team composition features

In [74]:
len(team_composition_features.columns)

33

In [38]:
model_ridge_composition, mrc_q2, mrc_feature_coefficients = fit_regularized_linear_model(desired_target, team_composition_features.columns, lasso = False, tune_alpha = True)

result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_composition], "model_type": ["Ridge"], "features_included": ["Team Composition"], "alpha": [model_ridge_composition.alpha.round(4)], "q_squared": [mrc_q2]})], ignore_index=True)

Q^2: 0.02712302800183075


In [39]:
directory = os.path.dirname(output_path + 'mrc_feature_coefficients' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mrc_feature_coefficients.to_csv(output_path + 'mrc_feature_coefficients' + '.csv')

In [40]:
sort_by_mean_abs(display_feature_coefficients(mrc_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
15,RME_nanmean,0.115979,0.115912,0.116046
0,playerCount,0.051623,0.051576,0.05167
7,IRCS_IB_nanmean,-0.046278,-0.046289,-0.046267
16,RME_nanstd,-0.04568,-0.045745,-0.045615
3,IRCS_GS_nanmean,-0.025561,-0.025573,-0.025549
9,IRCS_IR_nanmean,-0.025179,-0.025198,-0.025159
12,IRCS_IV_nanstd,-0.024909,-0.024924,-0.024894
14,IRCS_RS_nanstd,0.018627,0.018615,0.018639
10,IRCS_IR_nanstd,-0.017992,-0.018009,-0.017975
1,CRT_nanmean,0.017992,0.017986,0.017997


In [41]:
model_lasso_composition, mlc_q2, mlc_feature_coefficients = fit_regularized_linear_model(desired_target, team_composition_features.columns, lasso = True, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_composition], "model_type": ["Lasso"], "features_included": ["Team Composition"], "alpha": [model_lasso_composition.alpha.round(4)], "q_squared": [mlc_q2]})], ignore_index=True)

Q^2: 0.020528274452905393


In [42]:
directory = os.path.dirname(output_path + 'mlc_feature_coefficients' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mlc_feature_coefficients.to_csv(output_path + 'mlc_feature_coefficients' + '.csv')

## Task Features

In [75]:
len(task_features.columns)

27

In [43]:
model_ridge_task, mrt_q2, mrt_feature_coefficients = fit_regularized_linear_model(desired_target, task_features.columns, lasso = False, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_task], "model_type": ["Ridge"], "features_included": ["Task Complexity"], "alpha": [model_ridge_task.alpha.round(4)], "q_squared": [mrt_q2]})], ignore_index=True)

Q^2: 0.2974385403380464


In [44]:
directory = os.path.dirname(output_path + 'mrt_feature_coefficients' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mrt_feature_coefficients.to_csv(output_path + 'mrt_feature_coefficients' + '.csv')

In [45]:
sort_by_mean_abs(display_feature_coefficients(mrt_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,46.729255,46.722419,46.73609
8,Q11optimizing,37.890786,37.878362,37.90321
19,Q24eureka_question,-37.313419,-37.327911,-37.298928
13,Q17within_sys_sol,-33.033407,-33.049286,-33.017528
6,Q9divisible_unitary,-28.727838,-28.731864,-28.723812
1,Q3type_1_planning,26.523305,26.509832,26.536778
9,Q13outcome_multip,23.389089,23.384675,23.393503
3,Q6type_5_cc,-18.509307,-18.516226,-18.502388
12,Q16shared_knowledge,-16.902092,-16.903486,-16.900699
7,Q10maximizing,-15.652993,-15.65946,-15.646525


In [46]:
model_lasso_task, mlt_q2, mlt_feature_coefficients = fit_regularized_linear_model(desired_target, task_features.columns, lasso = True, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_task], "model_type": ["Lasso"], "features_included": ["Task Complexity"], "alpha": [model_lasso_task.alpha.round(4)], "q_squared": [mlt_q2]})], ignore_index=True)

Q^2: 0.28539737432670276


In [47]:
directory = os.path.dirname(output_path + 'mlt_feature_coefficients' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mlt_feature_coefficients.to_csv(output_path + 'mlt_feature_coefficients' + '.csv')

In [48]:
sort_by_mean_abs(display_feature_coefficients(mlt_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,61.444441,61.434393,61.45449
8,Q11optimizing,37.784589,37.766836,37.802343
6,Q9divisible_unitary,-24.949892,-24.959929,-24.939855
19,Q24eureka_question,-18.978516,-18.987495,-18.969536
7,Q10maximizing,-15.402954,-15.409408,-15.396501
9,Q13outcome_multip,7.139211,7.121995,7.156426
26,Medium,-2.966943,-2.970189,-2.963698
25,Low,2.78152,2.778145,2.784894
16,Q20type_3_type_4,0.0,0.0,0.0
24,High,0.0,0.0,0.0


## Task + Composition Together

In [49]:
# add together weights from previous models
previous_best_weights_ridge = np.array(list(model_ridge_composition.coef_) + list(model_ridge_task.coef_))

In [50]:
task_comp_features = list(task_features.columns) + list(team_composition_features.columns)

model_ridge_taskcomp, mrtc_q2, mrtc_feature_coefficients = fit_regularized_linear_model(desired_target, task_comp_features, lasso = False, tune_alpha = False, prev_coefs = previous_best_weights_ridge, prev_alpha = model_ridge_task.alpha)

Setting alpha to previous...
0.172771243077335
Setting coefficients ....
[ 4.94889886e-02  1.74932468e-02  7.30639777e-05 -2.58515980e-02
 -1.10280433e-03  9.86019289e-03  6.52010646e-03 -4.65465416e-02
  1.39987186e-03 -2.53883912e-02 -1.79252755e-02 -1.82007038e-03
 -2.44217719e-02  4.64612932e-03  1.82863028e-02  1.21299363e-01
 -4.74553218e-02 -3.96813019e-03 -1.49803462e-02 -1.69499958e-02
  5.91960488e-03 -6.55525523e-06 -5.87763462e-05  1.64246682e-04
  9.75652298e-05 -7.84858537e-03  5.21632171e-03 -8.01584975e-03
  2.88798058e-03 -1.63465481e-02  3.31664021e-03 -1.09788759e-02
 -6.02455948e-04 -1.25444341e+01  2.66703279e+01 -3.68144696e+00
 -1.84400232e+01  1.36364960e+01 -1.69116377e+00 -2.88280623e+01
 -1.56993887e+01  3.77177473e+01  2.30540208e+01 -1.41721899e+01
  5.60714681e+00 -1.68646086e+01 -3.26610694e+01  2.42659248e+00
  6.52926703e+00 -4.76865173e+00 -5.99017702e+00 -9.95671941e+00
 -3.62898258e+01  7.82541963e-01 -8.89174842e+00  4.66845741e+01
  0.00000000e+00 

In [51]:
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_taskcomp], "model_type": ["Ridge"], "features_included": ["Team Composition + Task Complexity"], "alpha": [model_ridge_taskcomp.alpha.round(4)], "q_squared": [mrtc_q2]})], ignore_index=True)

In [52]:
directory = os.path.dirname(output_path + 'mrtc_feature_coefficients_initialized' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mrtc_feature_coefficients.to_csv(output_path + 'mrtc_feature_coefficients_initialized' + '.csv')

In [53]:
sort_by_mean_abs(display_feature_coefficients(mrtc_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,44.181523,44.174427,44.188618
29,CRT_nanstd,36.55223,36.535752,36.568707
55,political_fiscal_nanstd,35.516793,35.499216,35.534369
28,CRT_nanmean,35.028204,35.013401,35.043006
59,political_social_nanstd,-26.939884,-26.960407,-26.919361
19,Q24eureka_question,-25.386726,-25.401082,-25.37237
9,Q13outcome_multip,23.359928,23.355631,23.364224
6,Q9divisible_unitary,-22.053449,-22.058207,-22.048692
7,Q10maximizing,-21.971188,-21.977604,-21.964772
3,Q6type_5_cc,-20.85966,-20.866512,-20.852807


In [54]:
# add together weights from previous models
previous_best_weights_lasso = np.array(list(model_lasso_composition.coef_) + list(model_lasso_task.coef_))

In [55]:
model_lasso_taskcomp, mltc_q2, mltc_feature_coefficients = fit_regularized_linear_model(desired_target, task_comp_features, lasso = True, tune_alpha = False, prev_coefs = previous_best_weights_lasso, prev_alpha = model_lasso_task.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_taskcomp], "model_type": ["Lasso"], "features_included": ["Team Composition + Task Complexity"], "alpha": [model_lasso_taskcomp.alpha.round(4)], "q_squared": [mltc_q2]})], ignore_index=True)

Setting alpha to previous...
0.172771243077335
Setting coefficients ....
[ 0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  1.00278384e-04
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -2.50446378e+01
 -1.52590765e+01  3.77374264e+01  6.95269288e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -1.78663141e+01 -0.00000000e+00 -0.00000000e+00  6.15012041e+01
  0.00000000e+00 

In [56]:
directory = os.path.dirname(output_path + 'mltc_feature_coefficients_initialized' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mltc_feature_coefficients.to_csv(output_path + 'mltc_feature_coefficients_initialized' + '.csv')

In [57]:
sort_by_mean_abs(display_feature_coefficients(mltc_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,61.53937,61.528188,61.55055
28,CRT_nanmean,23.2637,23.249703,23.2777
6,Q9divisible_unitary,-22.25669,-22.273237,-22.24015
8,Q11optimizing,21.14506,21.125678,21.16444
9,Q13outcome_multip,20.35403,20.336279,20.37177
7,Q10maximizing,-18.0819,-18.095676,-18.06812
19,Q24eureka_question,-15.81018,-15.821805,-15.79855
34,IRCS_IB_nanmean,-11.52541,-11.529625,-11.52119
30,IRCS_GS_nanmean,-5.036105,-5.041051,-5.03116
29,CRT_nanstd,4.812557,4.800714,4.824399


## Conversation Alone

In [None]:
model_lasso_comms, mlcom_q2, mlcom_feature_coefficients = fit_regularized_linear_model(desired_target, conv_features.columns, lasso = True, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_comms], "model_type": ["Lasso"], "features_included": ["Communication"], "alpha": [model_lasso_comms.alpha.round(4)], "q_squared": [mlcom_q2]})], ignore_index=True)

In [None]:
directory = os.path.dirname(output_path + 'mlcom_feature_coefficients' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mlcom_feature_coefficients.to_csv(output_path + 'mlcom_feature_coefficients' + '.csv')

In [None]:
sort_by_mean_abs(display_feature_coefficients(mlcom_feature_coefficients))

In [None]:
model_ridge_comms, mrcom_q2, mrcom_feature_coefficients = fit_regularized_linear_model(desired_target, conv_features.columns, lasso = False, tune_alpha = True)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_comms], "model_type": ["Ridge"], "features_included": ["Communication"], "alpha": [model_ridge_comms.alpha.round(4)], "q_squared": [mrcom_q2]})], ignore_index=True)

In [None]:
directory = os.path.dirname(output_path + 'mrcom_feature_coefficients' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mrcom_feature_coefficients.to_csv(output_path + 'mrcom_feature_coefficients' + '.csv')

In [None]:
sort_by_mean_abs(display_feature_coefficients(mrcom_feature_coefficients))

## Conversation Features + Task Features

In [58]:
task_lasso_weights = np.array(list(model_lasso_task.coef_) + list(np.zeros(len((conv_features.columns)))))

In [59]:
convtask_features = list(task_features.columns) + list(conv_features.columns)
model_lasso_tconv, mltconv_q2, mltconv_feature_coefficients = fit_regularized_linear_model(desired_target, convtask_features, lasso = True, tune_alpha = False, prev_coefs=task_lasso_weights, prev_alpha = model_lasso_task.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_tconv], "model_type": ["Lasso"], "features_included": ["Task Complexity + Communication"], "alpha": [model_lasso_tconv.alpha.round(4)], "q_squared": [mltconv_q2]})], ignore_index=True)

Setting alpha to previous...
0.172771243077335
Setting coefficients ....
[-0. -0. -0. ...  0.  0.  0.]


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.315e+05, tolerance: 8.059e+01
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.315e+05, tolerance: 8.059e+01
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.315e+05, tolerance: 8.059e+01
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.314e+05, tolerance: 8.059e+01
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.315e+05, tolerance: 8.059e+01
Objective did n

Q^2: -81.60904958541602


In [60]:
directory = os.path.dirname(output_path + 'mltconv_feature_coefficients_initialized' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mltconv_feature_coefficients.to_csv(output_path + 'mltconv_feature_coefficients_initialized' + '.csv')

In [61]:
sort_by_mean_abs(display_feature_coefficients(mltconv_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,47.556530,47.545591,47.567468
7,Q10maximizing,-37.260172,-37.274975,-37.245369
9,Q13outcome_multip,19.568079,19.548896,19.587263
8,Q11optimizing,17.365403,17.346477,17.384330
18,Q23ss_out_uncert,12.836232,12.825732,12.846731
...,...,...,...,...
442,stdev_user_sum_conjunction_lexical_per_100,0.000000,0.000000,0.000000
443,min_user_sum_conjunction_lexical_per_100,0.000000,0.000000,0.000000
444,max_user_sum_conjunction_lexical_per_100,0.000000,0.000000,0.000000
445,average_user_sum_certainty_lexical_per_100,0.000000,0.000000,0.000000


In [62]:
task_ridge_weights = np.array(list(model_ridge_task.coef_) + list(np.zeros(len((conv_features.columns)))))

In [63]:
model_ridge_tconv, mrtconv_q2, mrtconv_feature_coefficients = fit_regularized_linear_model(desired_target, convtask_features, lasso = False, tune_alpha = False, prev_coefs=task_ridge_weights, prev_alpha = model_ridge_task.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_tconv], "model_type": ["Ridge"], "features_included": ["Task Complexity + Communication"], "alpha": [model_ridge_tconv.alpha.round(4)], "q_squared": [mrtconv_q2]})], ignore_index=True)

Setting alpha to previous...
0.172771243077335
Setting coefficients ....
[-12.54443413  26.67032786  -3.68144696 ...   0.           0.
   0.        ]


Ill-conditioned matrix (rcond=2.15099e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.24854e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.22749e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.45825e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.0961e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.03634e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.06617e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.64686e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.38335e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.24224e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.23444e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.1031e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.10525e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.10487e-17): result may not be accu

Q^2: -34.418064790341674


In [64]:
directory = os.path.dirname(output_path + 'mrtconv_feature_coefficients_initialized' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mrtconv_feature_coefficients.to_csv(output_path + 'mrtconv_feature_coefficients_initialized' + '.csv')

In [65]:
sort_by_mean_abs(display_feature_coefficients(mrtconv_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
351,stdev_1st_person_start,7.803761e+01,7.799753e+01,7.807768e+01
608,max_user_sum_auxiliary_verbs_lexical_per_100,5.315079e+01,5.311482e+01,5.318676e+01
612,max_user_sum_cognitive_mech_lexical_per_100,5.203237e+01,5.199413e+01,5.207061e+01
1072,average_user_avg_textblob_polarity,4.919355e+01,4.915752e+01,4.922957e+01
585,max_user_sum_tentativeness_lexical_per_100,4.833581e+01,4.830416e+01,4.836746e+01
...,...,...,...,...
702,min_user_sum_time_diff,-6.335436e-04,-6.403570e-04,-6.267302e-04
855,min_user_avg_anxiety_lexical_per_100,-1.392219e-04,-2.332835e-04,-4.516023e-05
692,average_user_sum_positivity_zscore_conversation,-1.559604e-12,-1.569292e-12,-1.549916e-12
425,average_user_sum_info_exchange_zscore_conversa...,-2.426607e-13,-2.472611e-13,-2.380603e-13


## Model with All Features

In [66]:
task_composition_lasso_weights = np.array(list(model_lasso_taskcomp.coef_) + list(np.zeros(len((conv_features.columns)))))

In [67]:
all_features = list(task_features.columns) + list(team_composition_features.columns) + list(conv_features.columns)
model_lasso_all, mlall_q2, mlall_feature_coefficients = fit_regularized_linear_model(desired_target, all_features, lasso = True, tune_alpha = False, prev_coefs=task_composition_lasso_weights, prev_alpha = model_lasso_taskcomp.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_lasso_all], "model_type": ["Lasso"], "features_included": ["All Features"], "alpha": [model_lasso_all.alpha.round(4)], "q_squared": [mlall_q2]})], ignore_index=True)


Setting alpha to previous...
0.172771243077335
Setting coefficients ....
[-0. -0. -0. ...  0.  0.  0.]


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.234e+05, tolerance: 8.059e+01
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.233e+05, tolerance: 8.059e+01
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.233e+05, tolerance: 8.059e+01
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.233e+05, tolerance: 8.059e+01
Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.234e+05, tolerance: 8.059e+01
Objective did n

Q^2: -105.67981805953013


In [68]:
directory = os.path.dirname(output_path + 'mlall_feature_coefficients_initialized' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mlall_feature_coefficients.to_csv(output_path + 'mlall_feature_coefficients_initialized' + '.csv')

In [69]:
sort_by_mean_abs(display_feature_coefficients(mlall_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
22,Q5creativity_input_1,46.731834,46.720658,46.743011
7,Q10maximizing,-34.300366,-34.315915,-34.284818
8,Q11optimizing,21.068294,21.049006,21.087583
9,Q13outcome_multip,16.246499,16.227086,16.265912
18,Q23ss_out_uncert,12.680799,12.670168,12.691429
...,...,...,...,...
472,min_user_sum_home_lexical_per_100,0.000000,0.000000,0.000000
473,max_user_sum_home_lexical_per_100,0.000000,0.000000,0.000000
474,average_user_sum_conjunction_lexical_per_100,0.000000,0.000000,0.000000
475,stdev_user_sum_conjunction_lexical_per_100,0.000000,0.000000,0.000000


In [70]:
task_composition_ridge_weights = np.array(list(model_ridge_taskcomp.coef_) + list(np.zeros(len((conv_features.columns)))))

model_ridge_all, mrall_q2, mrall_feature_coefficients = fit_regularized_linear_model(desired_target, all_features, lasso = False, tune_alpha = True, prev_coefs=task_composition_ridge_weights, prev_alpha = model_ridge_taskcomp.alpha)
result_df = pd.concat([result_df, pd.DataFrame({"model": [model_ridge_all], "model_type": ["Ridge"], "features_included": ["All Features"], "alpha": [model_ridge_all.alpha.round(4)], "q_squared": [mrall_q2]})], ignore_index=True)

Setting alpha to previous...
0.172771243077335
Setting coefficients ....
[-17.24041706  12.54203156  -8.71965578 ...   0.           0.
   0.        ]


Ill-conditioned matrix (rcond=1.95361e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.45122e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.70016e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.35697e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.02189e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.35114e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.23232e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.21707e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.21067e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.8204e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.31761e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.49432e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.34579e-17): result may not be accurate.
Ill-conditioned matrix (rcond=2.19687e-17): result may not be acc

Q^2: -103.91548347637963


In [71]:
directory = os.path.dirname(output_path + 'mrall_feature_coefficients_initialized' + '.csv')
if not os.path.exists(directory):
    os.makedirs(directory)
mrall_feature_coefficients.to_csv(output_path + 'mrall_feature_coefficients_initialized' + '.csv')

In [72]:
sort_by_mean_abs(display_feature_coefficients(mrall_feature_coefficients))

Unnamed: 0,Feature,Mean,Lower_CI,Upper_CI
384,stdev_1st_person_start,7.305411e+01,7.301646e+01,7.309176e+01
682,average_user_sum_NTRI,5.724976e+01,5.721262e+01,5.728691e+01
645,max_user_sum_cognitive_mech_lexical_per_100,5.427434e+01,5.423767e+01,5.431101e+01
411,min_hasnegative,4.839323e+01,4.833824e+01,4.844821e+01
497,max_user_sum_adverbs_lexical_per_100,4.785048e+01,4.781561e+01,4.788536e+01
...,...,...,...,...
48,income_max_nanmean,2.498999e-05,2.477636e-05,2.520362e-05
51,income_min_nanstd,2.174352e-05,2.142582e-05,2.206123e-05
725,average_user_sum_positivity_zscore_conversation,4.002659e-13,3.906925e-13,4.098393e-13
458,average_user_sum_info_exchange_zscore_conversa...,-2.873915e-13,-2.917184e-13,-2.830646e-13


# Dataframe that summarizes all these experiments!

In [None]:
result_df.sort_values(by = "q_squared", ascending = False)

In [None]:
# Save for comparing results
directory = os.path.dirname(validation_results_output_name)
if not os.path.exists(directory):
    os.makedirs(directory)
result_df.sort_values(by = "q_squared", ascending = False).to_csv(validation_results_output_name)

# Feature Importance

In [None]:
sort_by_mean_abs(display_feature_coefficients(mlall_feature_coefficients))

In [None]:
def plot_top_n_features(data, n, filepath):
    # Calculate the absolute mean value and sort the DataFrame in descending order
    data['Absolute_Mean'] = data['Mean'].abs()
    top_n_features = data.sort_values(by='Absolute_Mean', ascending=False).head(n)

    # Define color mapping for the features
    color_map = {}
    name_map = {}
    for feature in task_features.columns:
        color_map[feature] = 'yellowgreen'
        name_map[feature] = "Task Feature"
    for feature in conv_features.columns:
        color_map[feature] = 'powderblue'
        name_map[feature] = "Conversation Feature"
    for feature in team_composition_features.columns:
        color_map[feature] = 'lightpink'
        name_map[feature] = "Team Composition Feature"

    # Create a horizontal bar graph
    plt.figure(figsize=(10, 6))

    handles = []

    for feature in top_n_features['Feature']:
        color = color_map.get(feature, 'k')  # Default to black if not in any list
        bars = plt.barh(feature, top_n_features[top_n_features['Feature'] == feature]['Mean'], color=color)
        handles.append(bars[0])

    # Customize the plot
    plt.xlabel('Mean Coefficient (Across LOO Cross Validation)', fontsize = 14)
    plt.title(f'Top {n} features for {desired_target} (min chats = {min_num_chats})', fontsize=20)
    plt.gca().invert_yaxis()  # Invert the y-axis to display the highest value at the top

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    # Create a legend outside the plot area with unique labels
    unique_features = []
    unique_labels = []
    for feature in top_n_features['Feature']:
        if name_map.get(feature, feature) not in unique_labels:
            unique_labels.append(name_map.get(feature, feature))
            unique_features.append(feature)

    legend_handles = [plt.Line2D([0], [0], color=color_map.get(feature, 'k'), lw=4, label=name_map.get(feature, feature)) for feature in unique_features]
    plt.legend(handles=legend_handles, loc='center left', fontsize = 14, bbox_to_anchor=(1, 0.5))

    # Add labels to the bars with increased text size and Mean rounded to 2 decimals, consistently inside the bar
    label_offset = 0.4  # Adjust this value for proper spacing
    for bar, value, feature in zip(handles, top_n_features['Mean'], top_n_features['Feature']):
        label_x = (max(value, 0) if value >= 0 else min(value, 0))
        bbox = bar.get_bbox()
        label_y = bbox.bounds[1] + label_offset
        if value >= 0:
            plt.text(label_x, label_y, f'{value:.2f}', va='center', fontsize=12)
        else:
            plt.text(label_x, label_y, f'{value:.2f}', ha='right', va='center', fontsize=12)

    # Show the plot
    plt.savefig(filepath + ".svg")
    plt.savefig(filepath + ".png")
    plt.show()

In [None]:
plot_top_n_features(display_feature_coefficients(mlall_feature_coefficients), 10, filepath = "./figures/multi_task_cumulative_stage" + "_" + desired_target + "_min_chat_num_" + str(min_num_chats))

Questions:
- More deeply understand difference between LASSO and Ridge
- Better understand `alpha` hyperparameter
- Why doesn't more features mean a better R^2? (Wouldn't the model 'throw out' features that don't work?)