In [181]:
import os
import sys
import pandas as pd

# Specify the directory containing the module you want to import
tpm_directory = '/Users/priyadcosta/Documents/GitHub/coefficientofconflict/team-process-map/feature_engine'

# Add the directory to sys.path
sys.path.append(tpm_directory)


### Step 1 : Basic Pre-processing

Converting the labels to numbers and averaging them

In [182]:
data = pd.read_csv('/Users/priyadcosta/Documents/GitHub/coefficientofconflict/tpm-data-anotation/CONFLICT_CONVO_LABELING_LOG.csv')

In [183]:
"""
Convert the labels into numeric scores
"""

def get_numeric_labels(text):

    # Convert the text to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Initialize the result variable
    result = 0
    
    # Check if "yes" is present in the text
    if 'yes' in text_lower:
        result = 1
    elif 'no' in text_lower:
        result = 0
    
    return result

"""
Convert all the columns to numeric labels
"""
def convert_labels(df):
    
    df['d_content'] = df['rating_directness_content'].apply(get_numeric_labels)
    df['d_expression'] = df['rating_directness_expression'].apply(get_numeric_labels)
    df['oi_content'] = df['rating_OI_content'].apply(get_numeric_labels)
    df['oi_expression'] = df['rating_OI_expression'].apply(get_numeric_labels)


"""
Get the average of the ratings for a single column
"""
def get_averages(df,on_column):

    # Calculate average ratings
    average_ratings = df.groupby(['CONV_ID', 'id'])[on_column].mean().reset_index()

    # Merge average ratings with original DataFrame
    df = df.merge(average_ratings, on=['CONV_ID', 'id'], how='left', suffixes=('', '_average'))

    return df


"""
Get the average ratings for all the columns
"""
def average_labels(df, columns):
    for column in columns:
        df = get_averages(df, column)
    return df


In [184]:
"""
Determine the labels for the dataset
"""
def get_label(conv_id):
    if conv_id.endswith('_A') or conv_id.endswith('_B'):
        return 'winning'
    else:
        return 'awry'

""" 
Get the dataset which the conversation belongs to awry or winning
"""
def dataset_labels(df):
    df['dataset'] = df['CONV_ID'].apply(get_label)
    

In [185]:
"""
Drop unncessary columns 
"""
def drop_cols(df,type):
    if type == 'average':
        return df[['d_content_average', 'd_expression_average', 'oi_content_average','oi_expression_average', 'dataset']]
    else:
        return df[['d_content', 'd_expression', 'oi_content','oi_expression','dataset']]

In [186]:
# get the dataset to which the chat belongs
dataset_labels(data)

#convert the text labels to numeric labels
convert_labels(data)

#get the average rating for each chat
numeric_cols = ['d_content', 'd_expression', 'oi_content', 'oi_expression']
data = average_labels(data,numeric_cols)

In [187]:
print('awry convos ' + str(data[data['dataset'] == 'awry']['CONV_ID'].nunique()))
print('winning convos ' + str(data[data['dataset'] == 'winning']['CONV_ID'].nunique()))

awry convos 32
winning convos 26


In [188]:
avg_data = drop_cols(data,'average')
original_data = drop_cols(data,'original')

### Step2 : Logistic Regression

In [189]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [190]:

def run_logistic_regression(df,target_column):

    # Split features and target
    X = df.drop(target_column, axis=1)  # Features
    y = df[target_column]   

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19104)
    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # Assuming you have already trained a logistic regression model named 'model'
    # and 'X_train' is your feature matrix

    # Get the coefficients (weights) of the logistic regression model
    coefficients = model.coef_[0]

    # Get the names of the features
    feature_names = X_train.columns

    # Create a DataFrame to store the coefficients and feature names
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Weights': coefficients})

    # Sort the DataFrame by coefficient magnitude (absolute value) to identify the most predictive features
    coefficients_df = coefficients_df.sort_values(by='Weights', ascending=False)

    # Display the DataFrame
    print(coefficients_df)


In [191]:
run_logistic_regression(avg_data,'dataset')

Accuracy: 0.6490066225165563
              precision    recall  f1-score   support

        awry       0.66      0.97      0.79       201
     winning       0.14      0.01      0.02       101

    accuracy                           0.65       302
   macro avg       0.40      0.49      0.40       302
weighted avg       0.49      0.65      0.53       302

                 Feature   Weights
1   d_expression_average  2.155936
0      d_content_average -1.463885
2     oi_content_average -1.547637
3  oi_expression_average -1.766148


In [192]:
run_logistic_regression(original_data,'dataset')

Accuracy: 0.652317880794702
              precision    recall  f1-score   support

        awry       0.66      0.98      0.79       201
     winning       0.00      0.00      0.00       101

    accuracy                           0.65       302
   macro avg       0.33      0.49      0.39       302
weighted avg       0.44      0.65      0.53       302

         Feature   Weights
1   d_expression  1.277079
2     oi_content -1.014712
3  oi_expression -1.149079
0      d_content -1.226290


### Step 4 : Neural Network

In [193]:
def get_dataset_numeric_labels(text):

    # Convert the text to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Initialize the result variable
    result = 0
    
    # Check if "yes" is present in the text
    if 'winning' in text_lower:
        result = 1
    elif 'awry' in text_lower:
        result = 0
    
    return result

In [194]:
#convert the dataset labels to numbers. winning = 1, awry = 0
avg_data['dataset_numeric'] = avg_data['dataset'].apply(get_dataset_numeric_labels)
data['dataset_numeric'] = data['dataset'].apply(get_dataset_numeric_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  avg_data['dataset_numeric'] = avg_data['dataset'].apply(get_dataset_numeric_labels)


In [195]:
data.columns

Index(['CONV_ID', 'id', 'rating_directness_content',
       'rating_directness_expression', 'rating_OI_content',
       'rating_OI_expression', 'rater_id', 'status', 'last_updated_time',
       'dataset', 'd_content', 'd_expression', 'oi_content', 'oi_expression',
       'd_content_average', 'd_expression_average', 'oi_content_average',
       'oi_expression_average', 'dataset_numeric'],
      dtype='object')

In [196]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the neural network model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(4, 64)   # Input size: 4, Output size: 64
        self.fc2 = nn.Linear(64, 32)  # Input size: 64, Output size: 32
        self.fc3 = nn.Linear(32, 1)   # Input size: 32, Output size: 1
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        return x

def neural_net(df):

    # Select features and target variable
    X = df[['d_content_average', 'd_expression_average', 'oi_content_average', 'oi_expression_average']]
    y = df['dataset_numeric']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19104)

    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)  # Reshape to (batch_size, 1)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

    # Create an instance of the model
    model = NeuralNetwork()

    # Define the loss function and optimizer - Most popularly used
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
    optimizer = optim.AdamW(model.parameters(), lr=0.001)  # AdamW optimizer with weight decay

    #TensorDataset: This class is used to wrap tensors representing the input features and target labels into a single dataset object. Each sample in the dataset corresponds to a pair of input features and target labels.
    #DataLoader: This class is used to create an iterable over the dataset, enabling you to iterate through batches of data during training. It allows you to specify parameters such as batch size and whether to shuffle the data between epochs.
    
    # Convert data to DataLoader
    train_data = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

    # Training the model
    epochs = 50
    for epoch in range(epochs):
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    with torch.no_grad():
        model.eval()
        outputs = model(X_test_tensor)
        predictions = (outputs >= 0.5).float()  # Thresholding at 0.5
        
        # Convert PyTorch tensors to numpy arrays with float32 data type
        predictions_np = predictions.numpy().astype('float32')
        y_test_np = y_test_tensor.numpy().astype('float32')
        
        # Calculate precision, recall, and F1 score for each label
        precision_per_label = precision_score(y_test_np, predictions_np, average=None)
        recall_per_label = recall_score(y_test_np, predictions_np, average=None)
        f1_per_label = f1_score(y_test_np, predictions_np, average=None)
        
        # Print precision, recall, and F1 score for each label
        for i in range(len(precision_per_label)):
            print(f'Label {i}: Precision: {precision_per_label[i]:.4f}, Recall: {recall_per_label[i]:.4f}, F1 Score: {f1_per_label[i]:.4f}')


In [197]:
neural_net(avg_data)

Label 0: Precision: 0.6656, Recall: 1.0000, F1 Score: 0.7992
Label 1: Precision: 0.0000, Recall: 0.0000, F1 Score: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))


### Step 5 : Attention

In [198]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.nn.utils.rnn import pad_sequence
import torch

def preprocess_for_attention(df):
    # Example DataFrame creation (replace this with your actual DataFrame loading)
    np.random.seed(19104)  # For reproducible random results

    # Normalize features
    scaler = StandardScaler()
    features = ['d_content_average', 'd_expression_average', 'oi_content_average', 'oi_expression_average']
    df[features] = scaler.fit_transform(df[features])

    # Group by 'CONV_ID' and prepare sequences and targets
    grouped = df.groupby('CONV_ID')
    sequences = []
    targets = []

    for _, group in grouped:
        seq = group[features].values  # Extract features as sequence
        target = group['dataset_numeric'].values[-1]  
        sequences.append(torch.tensor(seq, dtype=torch.float))
        targets.append(torch.tensor(target, dtype=torch.float))

    # Padding sequences to have the same length
    padded_sequences = pad_sequence(sequences, batch_first=True)

    # Splitting the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(padded_sequences, targets, test_size=0.2, random_state=42)

    # Convert lists to tensor for targets if necessary
    y_train = torch.stack(y_train)
    y_test = torch.stack(y_test)
    
    return X_train, X_test, y_train, y_test


In [199]:
X_train, X_test, y_train, y_test = preprocess_for_attention(data)

In [208]:
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score

class LSTMWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim=1, num_layers=1):
        super(LSTMWithAttention, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.attention = nn.Linear(hidden_dim, 1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        attn_weights = F.softmax(self.attention(lstm_out), dim=1)
        attn_applied = torch.bmm(attn_weights.transpose(1, 2), lstm_out)
        output = self.fc(attn_applied.squeeze(1))
        return output


def train_attention_model(X_train, X_test, y_train, y_test):

    # Model instantiation
    input_dim = 4  # Number of input features
    hidden_dim = 64
    model = LSTMWithAttention(input_dim, hidden_dim)

    # Define loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Training loop
    num_epochs = 1000
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs.squeeze(), y_train)
        loss.backward()
        optimizer.step()

        # print(f'Epoch {epoch+1}, Loss: {loss.item()}')
        
    # Evaluation phase
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        test_outputs = model(X_test)

        # Convert model outputs to binary predictions
        preds = torch.sigmoid(test_outputs.squeeze()) >= 0.5
        
        # Convert tensors to NumPy arrays for sklearn metrics
        predictions_np = preds.numpy().astype('float32')
        y_test_np = y_test.numpy().astype('float32')

    # Calculate precision, recall, and F1 score for each label
        precision_per_label = precision_score(y_test_np, predictions_np, average=None)
        recall_per_label = recall_score(y_test_np, predictions_np, average=None)
        f1_per_label = f1_score(y_test_np, predictions_np, average=None)
        
        # Print precision, recall, and F1 score for each label
        for i in range(len(precision_per_label)):
            print(f'Label {i}: Precision: {precision_per_label[i]:.4f}, Recall: {recall_per_label[i]:.4f}, F1 Score: {f1_per_label[i]:.4f}')

In [209]:
train_attention_model(X_train, X_test, y_train, y_test)

Label 0: Precision: 0.5556, Recall: 0.8333, F1 Score: 0.6667
Label 1: Precision: 0.6667, Recall: 0.3333, F1 Score: 0.4444
