In [138]:
import os
import sys
import pandas as pd

# Specify the directory containing the module you want to import
tpm_directory = '/Users/priyadcosta/Documents/GitHub/coefficientofconflict/team-process-map/feature_engine'

# Add the directory to sys.path
sys.path.append(tpm_directory)


### Step 1 : Basic Pre-processing

Converting the labels to numbers and averaging them

In [139]:
data = pd.read_csv('/Users/priyadcosta/Documents/GitHub/coefficientofconflict/tpm-data-anotation/CONFLICT_CONVO_LABELING_LOG.csv')

In [140]:
"""
Convert the labels into numeric scores
"""

def get_numeric_labels(text):

    # Convert the text to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Initialize the result variable
    result = 0
    
    # Check if "yes" is present in the text
    if 'yes' in text_lower:
        result = 1
    elif 'no' in text_lower:
        result = 0
    
    return result

"""
Convert all the columns to numeric labels
"""
def convert_labels(df):
    
    df['d_content'] = df['rating_directness_content'].apply(get_numeric_labels)
    df['d_expression'] = df['rating_directness_expression'].apply(get_numeric_labels)
    df['oi_content'] = df['rating_OI_content'].apply(get_numeric_labels)
    df['oi_expression'] = df['rating_OI_expression'].apply(get_numeric_labels)


"""
Get the average of the ratings for a single column
"""
def get_averages(df,on_column):

    # Calculate average ratings
    average_ratings = df.groupby(['CONV_ID', 'id'])[on_column].mean().reset_index()

    # Merge average ratings with original DataFrame
    df = df.merge(average_ratings, on=['CONV_ID', 'id'], how='left', suffixes=('', '_average'))

    return df


"""
Get the average ratings for all the columns
"""
def average_labels(df, columns):
    for column in columns:
        df = get_averages(df, column)
    return df


In [141]:
"""
Determine the labels for the dataset
"""
def get_label(conv_id):
    if conv_id.endswith('_A') or conv_id.endswith('_B'):
        return 'winning'
    else:
        return 'awry'

""" 
Get the dataset which the conversation belongs to awry or winning
"""
def dataset_labels(df):
    df['dataset'] = df['CONV_ID'].apply(get_label)
    

In [142]:
"""
Drop unncessary columns 
"""
def drop_cols(df,type):
    if type == 'average':
        return df[['d_content_average', 'd_expression_average', 'oi_content_average','oi_expression_average', 'dataset']]
    else:
        return df[['d_content', 'd_expression', 'oi_content','oi_expression','dataset']]

In [143]:
"""
Drop the OP's message for winning conversations
"""
def drop_op(df):

    # Check if there is any row with 'dataset' column value as 'winning'
    if (df['dataset'] == 'winning').any():
        # Find the first 'CONV_ID' for which 'dataset' column value is 'winning'
        first_winning_conv_id = df[df['dataset'] == 'winning']['CONV_ID'].iloc[0]
        
        # Drop the row(s) with this 'CONV_ID'
        df = df[df['CONV_ID'] != first_winning_conv_id]

    return df

In [144]:
# get the dataset to which the chat belongs
dataset_labels(data)

#convert the text labels to numeric labels
convert_labels(data)

#get the average rating for each chat
numeric_cols = ['d_content', 'd_expression', 'oi_content', 'oi_expression']
data = average_labels(data,numeric_cols)

In [145]:
print('awry convos ' + str(data[data['dataset'] == 'awry']['CONV_ID'].nunique()))
print('winning convos ' + str(data[data['dataset'] == 'winning']['CONV_ID'].nunique()))

awry convos 41
winning convos 37


In [146]:
avg_data = drop_cols(data,'average')
original_data = drop_cols(data,'original')

### Step2 : Logistic Regression

In [147]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [148]:

def run_logistic_regression(df,target_column):

    # Split features and target
    X = df.drop(target_column, axis=1)  # Features
    y = df[target_column]   

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19104)
    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # Assuming you have already trained a logistic regression model named 'model'
    # and 'X_train' is your feature matrix

    # Get the coefficients (weights) of the logistic regression model
    coefficients = model.coef_[0]

    # Get the names of the features
    feature_names = X_train.columns

    # Create a DataFrame to store the coefficients and feature names
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Weights': coefficients})

    # Sort the DataFrame by coefficient magnitude (absolute value) to identify the most predictive features
    coefficients_df = coefficients_df.sort_values(by='Weights', ascending=False)

    # Display the DataFrame
    print(coefficients_df)


In [149]:
run_logistic_regression(avg_data,'dataset')

Accuracy: 0.6861313868613139
              precision    recall  f1-score   support

        awry       0.72      0.79      0.75       250
     winning       0.62      0.53      0.57       161

    accuracy                           0.69       411
   macro avg       0.67      0.66      0.66       411
weighted avg       0.68      0.69      0.68       411

                 Feature   Weights
1   d_expression_average  3.140205
2     oi_content_average -1.201586
3  oi_expression_average -1.740752
0      d_content_average -2.380186


In [150]:
run_logistic_regression(original_data,'dataset')

Accuracy: 0.6520681265206812
              precision    recall  f1-score   support

        awry       0.69      0.79      0.73       250
     winning       0.57      0.44      0.50       161

    accuracy                           0.65       411
   macro avg       0.63      0.61      0.62       411
weighted avg       0.64      0.65      0.64       411

         Feature   Weights
1   d_expression  1.472823
2     oi_content -0.750851
3  oi_expression -1.117233
0      d_content -1.303291


### Step 3 : Neural Network - Without Attention

In [151]:
def get_dataset_numeric_labels(text):

    # Convert the text to lowercase for case-insensitive matching
    text_lower = text.lower()
    
    # Initialize the result variable
    result = 0
    
    # Check if "yes" is present in the text
    if 'winning' in text_lower:
        result = 1
    elif 'awry' in text_lower:
        result = 0
    
    return result

In [152]:
#convert the dataset labels to numbers. winning = 1, awry = 0
avg_data['dataset_numeric'] = avg_data['dataset'].apply(get_dataset_numeric_labels)
data['dataset_numeric'] = data['dataset'].apply(get_dataset_numeric_labels)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  avg_data['dataset_numeric'] = avg_data['dataset'].apply(get_dataset_numeric_labels)


In [153]:
data.columns

Index(['CONV_ID', 'id', 'rating_directness_content',
       'rating_directness_expression', 'rating_OI_content',
       'rating_OI_expression', 'rater_id', 'status', 'last_updated_time',
       'dataset', 'd_content', 'd_expression', 'oi_content', 'oi_expression',
       'd_content_average', 'd_expression_average', 'oi_content_average',
       'oi_expression_average', 'dataset_numeric'],
      dtype='object')

In [154]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score

# Define the neural network model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(4, 64)   # Input size: 4, Output size: 64
        self.fc2 = nn.Linear(64, 32)  # Input size: 64, Output size: 32
        self.fc3 = nn.Linear(32, 1)   # Input size: 32, Output size: 1
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        return x

def neural_net(df):

    # Select features and target variable
    X = df[['d_content_average', 'd_expression_average', 'oi_content_average', 'oi_expression_average']]
    y = df['dataset_numeric']

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19104)

    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)  # Reshape to (batch_size, 1)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

    # Create an instance of the model
    model = NeuralNetwork()

    # Define the loss function and optimizer - Most popularly used
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
    optimizer = optim.AdamW(model.parameters(), lr=0.001)  # AdamW optimizer with weight decay

    """ 
    TensorDataset: 
    
    This class is used to wrap tensors representing the input features and target labels into a single dataset object. 
    Each sample in the dataset corresponds to a pair of input features and target labels.
    
    DataLoader: 

    This class is used to create an iterable over the dataset, enabling you to iterate through batches of data during training. 
    It allows you to specify parameters such as batch size and whether to shuffle the data between epochs.

    """
    
    # Convert data to DataLoader
    train_data = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

    # Training the model
    epochs = 50
    for epoch in range(epochs):
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

    # Evaluate the model
    with torch.no_grad():
        model.eval()
        outputs = model(X_test_tensor)
        predictions = (outputs >= 0.5).float()  # Thresholding at 0.5
        
        # Convert PyTorch tensors to numpy arrays with float32 data type
        predictions_np = predictions.numpy().astype('float32')
        y_test_np = y_test_tensor.numpy().astype('float32')
        
        # Calculate precision, recall, and F1 score for each label
        precision_per_label = precision_score(y_test_np, predictions_np, average=None)
        recall_per_label = recall_score(y_test_np, predictions_np, average=None)
        f1_per_label = f1_score(y_test_np, predictions_np, average=None)
        
        # Print precision, recall, and F1 score for each label
        for i in range(len(precision_per_label)):
            print(f'Label {i}: Precision: {precision_per_label[i]:.4f}, Recall: {recall_per_label[i]:.4f}, F1 Score: {f1_per_label[i]:.4f}')


In [157]:
# neural_net(avg_data)

RuntimeError: all elements of input should be between 0 and 1

### Step 4 : Neural Network - With Attention

In [158]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.nn.utils.rnn import pad_sequence
import torch

def preprocess_for_attention(df):
    # Example DataFrame creation (replace this with your actual DataFrame loading)
    np.random.seed(19104)  # For reproducible random results

    # Normalize features
    scaler = StandardScaler()
    features = ['d_content_average', 'd_expression_average', 'oi_content_average', 'oi_expression_average']
    df[features] = scaler.fit_transform(df[features])

    """
    Grouping by CONV_ID:

    This line groups the DataFrame df by the column CONV_ID. 
    Each group corresponds to a unique conversation identified by CONV_ID. 
    The purpose is to treat each conversation as a sequence, which is particularly useful for sequence modeling tasks where the context of the conversation is important. 
    """
    grouped = df.groupby('CONV_ID')
    sequences = []
    targets = []

    """
    Prepare Sequences and Targets:

    Iterates over each group created by the groupby operation.

    seq = group[features].values extracts just the values of the specified features 
    (features is a list of column names) from each group as a NumPy array. 
    This array represents the sequence of observations for a single conversation.

    target = group['dataset_numeric'].values[0] extracts the target variable for the sequence. 
    This example takes the last value of the dataset_numeric column from the group as the target. 
    The assumption here might be that the target of the entire sequence (conversation) is determined by its final state or message. 
    """

    for _, group in grouped:
        seq = group[features].values  # Extract features as sequence
        target = group['dataset_numeric'].values[0]  # Extract the target variable for the sequence. All the values are same for a given CONV_ID i.e the 0 for awry or 1 for winning
        sequences.append(torch.tensor(seq, dtype=torch.float))
        targets.append(torch.tensor(target, dtype=torch.float))

    """
    Padding Sequences:

    Since the sequences (conversations) can have varying lengths (i.e., different numbers of messages or observations),
    they need to be padded to have the same length to be processed in batches by the model. 
    
    The pad_sequence function from PyTorch's torch.nn.utils.rnn module achieves this by adding zeros to shorter sequences until all sequences in the batch have the same length.

    The parameter batch_first=True indicates that the output tensor should have a batch size as its first dimension, i.e.,
    the tensor shape will be (batch_size, seq_length, features), which is the format expected by most PyTorch models for batched sequence data 
    """

    # Padding sequences to have the same length
    padded_sequences = pad_sequence(sequences, batch_first=True)

    # Splitting the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(padded_sequences, targets, test_size=0.2, random_state=19104)

    # Convert lists to tensor for targets if necessary
    y_train = torch.stack(y_train)
    y_test = torch.stack(y_test)
    
    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = preprocess_for_attention(data)

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from captum.attr import IntegratedGradients
from sklearn.metrics import precision_score, recall_score

class LSTMWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim=1, num_layers=1):
        super(LSTMWithAttention, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.attention = nn.Linear(hidden_dim, 1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):

        """ 
        The input sequence x is passed through the LSTM layer. lstm_out contains the LSTM's output for each time step.
        """
        lstm_out, _ = self.lstm(x) #the length of x will determine the number of timestamps. Since our input data is padded, this will be the length of the longest sequence
        # print(x.shape)

        """ 
        The LSTM output is then passed through the attention layer. 
        This layer assigns a weight to each time step of the LSTM output.
        The softmax function ensures that these weights sum up to 1, making them a valid probability distribution.
        """
        attn_weights = F.softmax(self.attention(lstm_out), dim=1) 

        """ 
        The attention weights are then used to compute a weighted sum of the LSTM outputs, which is a way to focus on the most relevant parts of the input sequence. 
        The function torch.bmm performs a batch matrix-matrix product of the attention weights and LSTM outputs.
        """
        attn_applied = torch.bmm(attn_weights.transpose(1, 2), lstm_out)

        """ 
        The attention-weighted sum is passed through the final linear layer to produce the model's output.
        """
        output = self.fc(attn_applied.squeeze(1))

        return output, attn_weights


def train_attention_model(X_train, X_test, y_train, y_test):

    # Model instantiation
    input_dim = 4  # Number of input features
    hidden_dim = 64 # Just a Random Number
    model = LSTMWithAttention(input_dim, hidden_dim)

    # Define loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Training loop
    num_epochs = 1000
    for _ in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs,weights = model(X_train)
        loss = criterion(outputs.squeeze(), y_train)
        loss.backward()
        optimizer.step()
        
    # Evaluation phase
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        test_outputs,attn_weights = model(X_test)

        # Convert model outputs to binary predictions
        preds = torch.sigmoid(test_outputs.squeeze()) >= 0.5
        
        # Convert tensors to NumPy arrays for sklearn metrics
        predictions_np = preds.numpy().astype('float32')
        y_test_np = y_test.numpy().astype('float32')

    # Calculate precision, recall, and F1 score for each label
        precision_per_label = precision_score(y_test_np, predictions_np, average=None)
        recall_per_label = recall_score(y_test_np, predictions_np, average=None)
        f1_per_label = f1_score(y_test_np, predictions_np, average=None)
        
        # Print precision, recall, and F1 score for each label
        for i in range(len(precision_per_label)):
            print(f'Label {i}: Precision: {precision_per_label[i]:.4f}, Recall: {recall_per_label[i]:.4f}, F1 Score: {f1_per_label[i]:.4f}')


In [None]:
# predictions = train_attention_model(X_train, X_test, y_train, y_test)

Attribution per Time Step: 
The IG method calculates the contribution of each feature at each time step towards the model's prediction. This enables to see not just which features are important, but also when they are important within the sequence.

A time step refers to one point in time in the input sequence. In the context of LSTM models, it's one cycle of processing by the LSTM unit. Is 120 LSTM default timestamp!?

In [None]:
def preprocess_for_specific_conv_id(df, test_conv_id):
    np.random.seed(19104)  # For reproducible results
    # scaler = StandardScaler()
    features = ['d_content_average', 'd_expression_average', 'oi_content_average', 'oi_expression_average']
    # df[features] = scaler.fit_transform(df[features])
    
    grouped = df.groupby('CONV_ID')
    train_sequences = []
    train_targets = []
    test_sequences = []
    test_targets = []

    for conv_id, group in grouped:
        # print("Convo ID " + str(conv_id))
        seq = group[features].values #feature values
        # print(len(seq))
        # print(seq) 
        target = group['dataset_numeric'].values[0]
        # print(target) 
        if conv_id == test_conv_id:
            test_sequences.append(torch.tensor(seq, dtype=torch.float))
            test_targets.append(torch.tensor(target, dtype=torch.float))
        else:
            train_sequences.append(torch.tensor(seq, dtype=torch.float))
            train_targets.append(torch.tensor(target, dtype=torch.float))
    
    # Padding sequences
    padded_train_sequences = pad_sequence(train_sequences, batch_first=True)
    padded_test_sequences = pad_sequence(test_sequences, batch_first=True)

    print("padded train sequences " + str(padded_train_sequences.shape))
    # Converting lists to tensors for targets
    y_train = torch.stack(train_targets)
    y_test = torch.stack(test_targets)
    
    return padded_train_sequences, padded_test_sequences, y_train, y_test

In [None]:
def train_attention_model_iterative(X_train, X_test, y_train, y_test):

    # Model instantiation
    input_dim = 4  # Number of input features
    hidden_dim = 8 # Just a Random Number
    model = LSTMWithAttention(input_dim, hidden_dim)

    # Define loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Training loop
    num_epochs = 100
    for _ in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs,weights = model(X_train)
        loss = criterion(outputs.squeeze(), y_train)
        loss.backward()
        optimizer.step()
        
    # Evaluation phase
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        test_outputs,attn_weights = model(X_test)

        # Convert model outputs to binary predictions
        preds = torch.sigmoid(test_outputs.squeeze()) >= 0.5
        
        # Convert tensors to NumPy arrays for sklearn metrics
        predictions_np = preds.numpy().astype('float32')
        y_test_np = y_test.numpy().astype('float32')
        attn_weights_np = attn_weights.numpy().astype('float32')

    return predictions_np,y_test_np,attn_weights_np


In [None]:
def iterative_testing(df):
    unique_conv_ids = df['CONV_ID'].unique()
    
    predictions_np = []
    y_test_np = []
    weights_np = []

    for test_conv_id in unique_conv_ids:
        X_train, X_test, y_train, y_test = preprocess_for_specific_conv_id(df, test_conv_id)
        
        #train the model
        predictions_results, y_test_results,attn_weights_results = train_attention_model_iterative(X_train, X_test, y_train, y_test)

        #append and get the average
        predictions_np.append(predictions_results)
        y_test_np.append(y_test_results)
        weights_np.append(attn_weights_results)
    
     # Convert lists to numpy arrays for efficient numerical operations
    predictions_np_array = np.array(predictions_np)
    y_test_np_array = np.array(y_test_np)

    return predictions_np_array,y_test_np_array,weights_np


In [None]:

def plot_results(predictions_np_array,y_test_np_array,average_data=True):
    average_predictions = None
    average_y_test = None

    if average_data:

        average_predictions = np.stack(predictions_np_array)
        average_y_test = np.stack(y_test_np_array)

    else:

        average_predictions = predictions_np_array
        average_y_test = y_test_np_array

    # Calculate precision, recall, and F1 score for each label
    precision_per_label = precision_score(average_y_test, average_predictions, average=None)
    recall_per_label = recall_score(average_y_test, average_predictions, average=None)
    f1_per_label = f1_score(average_y_test, average_predictions, average=None)
    
    # Print precision, recall, and F1 score for each label
    for i in range(len(precision_per_label)):
        print(f'Label {i}: Precision: {precision_per_label[i]:.4f}, Recall: {recall_per_label[i]:.4f}, F1 Score: {f1_per_label[i]:.4f}')

In [None]:
print(data['CONV_ID'].nunique())

58


In [None]:
predictions_np_array,y_test_np_array,attributions_np_array = iterative_testing(data)

padded train sequences torch.Size([57, 120, 4])


padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 105, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 120, 4])
padded train sequences torch.Size([57, 1

In [None]:
plot_results(predictions_np_array,y_test_np_array)

Label 0: Precision: 0.5405, Recall: 0.6250, F1 Score: 0.5797
Label 1: Precision: 0.4286, Recall: 0.3462, F1 Score: 0.3830


#### Label 0: Precision: 0.6538, Recall: 0.5312, F1 Score: 0.5862
#### Label 1: Precision: 0.5312, Recall: 0.6538, F1 Score: 0.5862

In the context of neural networks, specifically when using `torch.nn.LSTM` (Long Short-Term Memory) layers from PyTorch, "timestamps" refer to the individual time steps in a sequence of data that the LSTM processes. LSTMs are a type of recurrent neural network (RNN) that are particularly good at capturing temporal dynamics and dependencies in sequential data, making them ideal for tasks like time series prediction, natural language processing, and sequence generation.

A sequence is a collection of data points ordered in time. Each data point in the sequence is associated with a time step (or "timestamp"), which represents a specific point in the sequence's temporal order. In the case of LSTMs, the network processes these sequences one time step at a time, allowing it to maintain a memory of previous inputs through its internal state, which influences the processing of future inputs.

When you use `torch.nn.LSTM` in PyTorch, you typically do not explicitly provide "timestamps" in the sense of date or time values. Instead, you structure your input data as sequences where the order of the data points represents the temporal order. The LSTM then processes these sequences one element at a time, implicitly understanding the order as the "timestamps".

The input to an LSTM layer is usually a 3D tensor with dimensions defined as follows:

1. **Sequence Length**: The length of the sequence (i.e., the number of timestamps).
2. **Batch Size**: The number of sequences processed in parallel during training.
3. **Features**: The number of features (or dimensions) in each time step of the sequence.

In summary, "timestamps" in the context of LSTMs refer to the ordered positions of data points in a sequence that the network processes. The actual representation of time (such as dates or hours) is abstracted away; what matters is the sequential order of the data points, which allows the LSTM to model temporal dependencies and patterns.

### Step 5: Attention

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader

# Assuming your DataFrame `df` and it includes 'CONV_ID', 'd_content_average', 'd_expression_average', 
# 'oi_content_average', 'oi_expression_average', and 'dataset_numeric'

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

class Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Model, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

def preprocess_data(df, target_column):
    # Assuming all targets within a group are the same, taking the first one as the representative
    grouped = df.groupby('CONV_ID').agg('first').reset_index()
    features = grouped[['d_content_average', 'd_expression_average', 'oi_content_average', 'oi_expression_average']].values
    labels = grouped[target_column].values
    return torch.tensor(features, dtype=torch.float32), torch.tensor(labels, dtype=torch.long)

def train_model(model, train_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        for batch_features, batch_labels in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

def evaluate_model(model, test_features, test_labels):
    model.eval()
    with torch.no_grad():
        predictions = model(test_features)
        _, predicted_labels = torch.max(predictions, 1)
        precision, recall, f1, _ = precision_recall_fscore_support(test_labels.numpy(), predicted_labels.numpy(), average=None)
        return precision, recall, f1

def main(df, target_column):
    # Preprocess data
    features, labels = preprocess_data(df, target_column)
    
    # Split data
    features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)
    
    # Create DataLoaders
    train_dataset = CustomDataset(features_train, labels_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    
    # Initialize the model
    model = Model(input_dim=4, hidden_dim=128, output_dim=2)  # 2 output dim for binary classification
    
    # Set loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # Train the model
    train_model(model, train_loader, criterion, optimizer, epochs=5)
    
    # Evaluate the model
    precision, recall, f1 = evaluate_model(model, features_test, labels_test)
    
    print(f'Precision: {precision[0]}\nRecall: {recall[0]}\nF1 Score: {f1[0]}')

In [None]:
main(data,'dataset_numeric')

Precision: 0.5
Recall: 0.42857142857142855
F1 Score: 0.4615384615384615
