In [184]:
import os
import sys
import pandas as pd

# Directory containing the module you want to import. The location is
# machine-specific, so it can be overridden with the TPM_FEATURE_ENGINE_DIR
# environment variable; the original path remains the default.
tpm_directory = os.environ.get(
    'TPM_FEATURE_ENGINE_DIR',
    '/Users/priyadcosta/Documents/GitHub/coefficientofconflict/team-process-map/feature_engine',
)

# Add the directory to sys.path only once, so re-running this cell does not
# pile up duplicate entries.
if tpm_directory not in sys.path:
    sys.path.append(tpm_directory)


### Step 1 : Basic Pre-processing

Convert the text labels to numeric scores, then average the scores for each chat.

In [185]:
# The labeling log lives at a machine-specific location, so the path can be
# overridden with the CONFLICT_LABELING_LOG_CSV environment variable; the
# original path remains the default.
data = pd.read_csv(os.environ.get(
    'CONFLICT_LABELING_LOG_CSV',
    '/Users/priyadcosta/Documents/GitHub/coefficientofconflict/tpm-data-anotation/CONFLICT_CONVO_LABELING_LOG.csv',
))

In [186]:
"""
Convert the labels into numeric scores
"""

def get_numeric_labels(text):

    # Convert the text to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Initialize the result variable
    result = 0
    
    # Check if "yes" is present in the text
    if 'yes' in text_lower:
        result = 1
    elif 'no' in text_lower:
        result = 0
    
    return result

"""
Convert all the columns to numeric labels
"""
def convert_labels(df):
    
    df['d_content'] = df['rating_directness_content'].apply(get_numeric_labels)
    df['d_expression'] = df['rating_directness_expression'].apply(get_numeric_labels)
    df['oi_content'] = df['rating_OI_content'].apply(get_numeric_labels)
    df['oi_expression'] = df['rating_OI_expression'].apply(get_numeric_labels)


"""
Get the average of the ratings for a single column
"""
def get_averages(df,on_column):

    # Calculate average ratings
    average_ratings = df.groupby(['CONV_ID', 'id'])[on_column].mean().reset_index()

    # Merge average ratings with original DataFrame
    df = df.merge(average_ratings, on=['CONV_ID', 'id'], how='left', suffixes=('', '_average'))

    return df


"""
Get the average ratings for all the columns
"""
def average_labels(df, columns):
    for column in columns:
        df = get_averages(df, column)
    return df


In [187]:
# Preview the first rows of the labeling log.
data.head()

Unnamed: 0,CONV_ID,id,rating_directness_content,rating_directness_expression,rating_OI_content,rating_OI_expression,rater_id,status,last_updated_time
0,d3j4tn9,d3j4tn9_1,Yes - Direct Content,Yes - Direct Expression,Yes - Content opposes someone else,Yes - Expression is emotional/forceful,amy,done,2024-02-23 10:40:55.334794
1,d3j4tn9,d3j4tn9_2,Yes - Direct Content,Yes - Direct Expression,Yes - Content opposes someone else,No - Expression is not emotional/forceful,amy,done,2024-02-23 10:40:56.240762
2,d3j4tn9,d3jqp6d,Yes - Direct Content,Yes - Direct Expression,Yes - Content opposes someone else,Yes - Expression is emotional/forceful,amy,done,2024-02-23 10:40:57.068755
3,d3j4tn9,d3jtj7i,Yes - Direct Content,Yes - Direct Expression,Yes - Content opposes someone else,No - Expression is not emotional/forceful,amy,done,2024-02-23 10:40:57.947723
4,d3j4tn9,d3ldu4f,Yes - Direct Content,Yes - Direct Expression,Yes - Content opposes someone else,Yes - Expression is emotional/forceful,amy,done,2024-02-23 10:40:58.982175


In [188]:
"""
Determine the labels for the dataset
"""
def get_label(conv_id):
    if conv_id.endswith('_A') or conv_id.endswith('_B'):
        return 'winning'
    else:
        return 'awry'

""" 
Get the dataset which the conversation belongs to awry or winning
"""
def dataset_labels(df):
    df['dataset'] = df['CONV_ID'].apply(get_label)
    

In [189]:
"""
Drop unncessary columns 
"""
def drop_cols(df):
    return df[['d_content_average', 'd_expression_average', 'oi_content_average',
       'oi_expression_average', 'dataset']]

In [190]:
# Get the dataset to which each chat belongs ('awry' or 'winning').
# This adds the 'dataset' column to `data` in place.
dataset_labels(data)

# Convert the text labels to numeric labels.
# This adds the d_* / oi_* columns to `data` in place.
convert_labels(data)

# Get the average rating for each chat. `data` is rebound to a new frame that
# additionally carries one `<column>_average` column per entry of numeric_cols.
numeric_cols = ['d_content', 'd_expression', 'oi_content', 'oi_expression']
data = average_labels(data,numeric_cols)

In [191]:
# Number of distinct conversations in each dataset
print(f"awry convos {data.loc[data['dataset'] == 'awry', 'CONV_ID'].nunique()}")
print(f"winning convos {data.loc[data['dataset'] == 'winning', 'CONV_ID'].nunique()}")

awry convos 32
winning convos 26


In [192]:
# Keep only the averaged rating columns and the dataset label for modelling.
# NOTE(review): the averages were merged back onto every original row, so a
# chat with several rows in the log (presumably one per rater) appears here as
# several identical rows -- confirm whether rows should be de-duplicated on
# ('CONV_ID', 'id') before the train/test split.
new_data = drop_cols(data)

In [193]:
# Check which columns remain after the selection.
new_data.columns

Index(['d_content_average', 'd_expression_average', 'oi_content_average',
       'oi_expression_average', 'dataset'],
      dtype='object')

### Step 2 : Logistic Regression

In [194]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [195]:

def run_logistic_regression(df, target_column, test_size=0.2, random_state=19104):
    """
    Fit a logistic regression on a train/test split of df and print test metrics.

    Args:
        df: DataFrame holding the feature columns plus the target column.
        target_column: Name of the column to predict; every other column of
            df is used as a feature.
        test_size: Passed to train_test_split as the size of the held-out
            test set (default 0.2).
        random_state: Seed passed to train_test_split so the split is
            reproducible (default 19104).

    Returns:
        None. The accuracy and the classification report for the test set are
        printed.
    """
    # Split features and target
    features = df.drop(target_column, axis=1)
    target = df[target_column]

    # NOTE(review): the split is not stratified on the target, so the class
    # balance of the train and test sets can differ.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [196]:
# Predict the dataset ('awry' vs 'winning') from the four averaged rating columns.
run_logistic_regression(new_data,'dataset')

Accuracy: 0.6490066225165563
              precision    recall  f1-score   support

        awry       0.66      0.97      0.79       201
     winning       0.14      0.01      0.02       101

    accuracy                           0.65       302
   macro avg       0.40      0.49      0.40       302
weighted avg       0.49      0.65      0.53       302



### Step 3 : Calculating TPM Features on the conflict dataset