In [254]:
import os
import sys
import pandas as pd

# Specify the directory containing the module you want to import
tpm_directory = '/Users/priyadcosta/Documents/GitHub/coefficientofconflict/team-process-map/feature_engine'

# Add the directory to sys.path
sys.path.append(tpm_directory)


### Step 1 : Basic Pre-processing

Converting the labels to numbers and averaging them

In [255]:
data = pd.read_csv('/Users/priyadcosta/Documents/GitHub/coefficientofconflict/tpm-data-anotation/CONFLICT_CONVO_LABELING_LOG.csv')

In [256]:
"""
Convert the labels into numeric scores
"""

def get_numeric_labels(text):

    # Convert the text to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Initialize the result variable
    result = 0
    
    # Check if "yes" is present in the text
    if 'yes' in text_lower:
        result = 1
    elif 'no' in text_lower:
        result = 0
    
    return result

"""
Convert all the columns to numeric labels
"""
def convert_labels(df):
    
    df['d_content'] = df['rating_directness_content'].apply(get_numeric_labels)
    df['d_expression'] = df['rating_directness_expression'].apply(get_numeric_labels)
    df['oi_content'] = df['rating_OI_content'].apply(get_numeric_labels)
    df['oi_expression'] = df['rating_OI_expression'].apply(get_numeric_labels)


"""
Get the average of the ratings for a single column
"""
def get_averages(df,on_column):

    # Calculate average ratings
    average_ratings = df.groupby(['CONV_ID', 'id'])[on_column].mean().reset_index()

    # Merge average ratings with original DataFrame
    df = df.merge(average_ratings, on=['CONV_ID', 'id'], how='left', suffixes=('', '_average'))

    return df


"""
Get the average ratings for all the columns
"""
def average_labels(df, columns):
    for column in columns:
        df = get_averages(df, column)
    return df


In [257]:
"""
Determine the labels for the dataset
"""
def get_label(conv_id):
    if conv_id.endswith('_A') or conv_id.endswith('_B'):
        return 'winning'
    else:
        return 'awry'

""" 
Get the dataset which the conversation belongs to awry or winning
"""
def dataset_labels(df):
    df['dataset'] = df['CONV_ID'].apply(get_label)
    

In [258]:
"""
Drop unncessary columns 
"""
def drop_cols(df,type):
    if type == 'average':
        return df[['d_content_average', 'd_expression_average', 'oi_content_average','oi_expression_average', 'dataset']]
    else:
        return df[['d_content', 'd_expression', 'oi_content','oi_expression','dataset']]

In [259]:
# get the dataset to which the chat belongs
dataset_labels(data)

#convert the text labels to numeric labels
convert_labels(data)

#get the average rating for each chat
numeric_cols = ['d_content', 'd_expression', 'oi_content', 'oi_expression']
data = average_labels(data,numeric_cols)

In [260]:
print('awry convos ' + str(data[data['dataset'] == 'awry']['CONV_ID'].nunique()))
print('winning convos ' + str(data[data['dataset'] == 'winning']['CONV_ID'].nunique()))

awry convos 32
winning convos 26


In [261]:
avg_data = drop_cols(data,'average')
original_data = drop_cols(data,'original')

### Step2 : Logistic Regression

In [262]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [263]:

def run_logistic_regression(df,target_column):

    # Split features and target
    X = df.drop(target_column, axis=1)  # Features
    y = df[target_column]   

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19104)
    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # Assuming you have already trained a logistic regression model named 'model'
    # and 'X_train' is your feature matrix

    # Get the coefficients (weights) of the logistic regression model
    coefficients = model.coef_[0]

    # Get the names of the features
    feature_names = X_train.columns

    # Create a DataFrame to store the coefficients and feature names
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Weights': coefficients})

    # Sort the DataFrame by coefficient magnitude (absolute value) to identify the most predictive features
    coefficients_df = coefficients_df.sort_values(by='Weights', ascending=False)

    # Display the DataFrame
    print(coefficients_df)


In [264]:
run_logistic_regression(avg_data,'dataset')

Accuracy: 0.6490066225165563
              precision    recall  f1-score   support

        awry       0.66      0.97      0.79       201
     winning       0.14      0.01      0.02       101

    accuracy                           0.65       302
   macro avg       0.40      0.49      0.40       302
weighted avg       0.49      0.65      0.53       302

                 Feature   Weights
1   d_expression_average  2.155936
0      d_content_average -1.463885
2     oi_content_average -1.547637
3  oi_expression_average -1.766148


In [265]:
run_logistic_regression(original_data,'dataset')

Accuracy: 0.652317880794702
              precision    recall  f1-score   support

        awry       0.66      0.98      0.79       201
     winning       0.00      0.00      0.00       101

    accuracy                           0.65       302
   macro avg       0.33      0.49      0.39       302
weighted avg       0.44      0.65      0.53       302

         Feature   Weights
1   d_expression  1.277079
2     oi_content -1.014712
3  oi_expression -1.149079
0      d_content -1.226290


### Step 3 : Calculating TPM Features on the conflict dataset

In [272]:
from importlib import import_module
clean_multi_task_data = import_module('team-process-map')

In [271]:
"""
A class to extract TPM features to compute directness and oppositional intensity
"""

class TpmFeatures:
    def __init__(self):
            self.odometer_reading = 0
    
    
    """ 
    """
    def get_tpm_features():
          
    