# HAP 789 Sentiment Analysis Project

## Load data and initial data prep

In [None]:
# pandas provides the DataFrame used throughout the notebook
import pandas as pd

# Read the raw training records from disk
raw_records_path = './data/TrainingRecords-4-4-2024.csv'
df = pd.read_csv(raw_records_path)

# Preview the first rows
df.head()

In [None]:
# Discard the dateCreated column, keep only the first row seen for each
# commentId, and renumber the index from zero
df = (
    df.drop(columns=['dateCreated'])
      .drop_duplicates(subset='commentId', keep='first')
      .reset_index(drop=True)
)

df.head()

In [None]:
# commentIds of the invalid comments that must be excluded
commentIds_to_drop = [180459, 151656, 179845, 179923]

# Keep a row only if it has a comment and its commentId is not in the list above
has_comment = df['comment'].notna()
is_invalid = df['commentId'].isin(commentIds_to_drop)
df = df[has_comment & ~is_invalid]

df.head()

In [None]:
# Print a summary of the cleaned DataFrame to check what is left after the drops above
df.info()

In [None]:
# Work on a separate DataFrame so df keeps the cleaned originals;
# comment_processed starts as the unmodified comment text
main_df = df.assign(comment_processed=df['comment'])

main_df.head()

## Remove proper nouns

In [None]:
# Import library
import spacy

# English spaCy pipeline; its named entity recognition (NER) component is
# what remove_proper_nouns relies on
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# Function to strip named-entity tokens from text
def remove_proper_nouns(text):
    """Return *text* without the tokens spaCy assigned an entity type.

    The text is run through the module-level ``nlp`` pipeline. Every token
    whose ``ent_type_`` is non-empty is dropped, so this removes all
    recognised named entities, not only grammatical proper nouns. The
    remaining token texts are joined with single spaces, which means the
    original spacing around punctuation is not preserved.
    """
    return ' '.join(token.text for token in nlp(text) if not token.ent_type_)

# Run every processed comment through remove_proper_nouns and store the result
without_entities = main_df['comment_processed'].apply(remove_proper_nouns)
main_df['comment_processed'] = without_entities

# Preview the result
main_df.head()

In [None]:
# Checkpoint 01: comments with named-entity tokens removed
checkpoint_01_path = './data/checkpoint_01_no_proper.csv'
main_df.to_csv(checkpoint_01_path, index=False)

## Text pre-processing (simple)

In [None]:
# Import library
import pandas as pd

# Resume from checkpoint 01 so the spaCy step does not have to be re-run
checkpoint_01_path = './data/checkpoint_01_no_proper.csv'
main_df = pd.read_csv(checkpoint_01_path, low_memory=False)

In [None]:
# Lower-case the processed comments
lowercased = main_df['comment_processed'].str.lower()
main_df['comment_processed'] = lowercased

main_df.head()

In [None]:
# Import library
import re # for regular expressions

# A comment that became empty earlier is read back from the CSV checkpoint as
# NaN. str(NaN) is the literal text 'nan', which would survive the regex below
# and be treated as a real word, so rows without processed text are dropped first.
main_df = main_df.dropna(subset=['comment_processed'])

# Remove punctuation, special characters, and numbers
# (everything that is neither an ASCII letter nor whitespace)
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
main_df['comment_processed'] = main_df['comment_processed'].apply(lambda x: non_letter_pattern.sub('', str(x)))

main_df.head(20)

## Text pre-processing (complex)

### Spelling correction

In [None]:
# Import library
from textblob import TextBlob

# Function to spell-correct a piece of text with TextBlob
def correct_spelling(text):
    """Return *text* after TextBlob's ``correct()`` pass, as a plain ``str``."""
    return str(TextBlob(text).correct())

In [None]:
# Spell-correct every processed comment and store the result
spell_corrected = main_df['comment_processed'].apply(correct_spelling)
main_df['comment_processed'] = spell_corrected

# Preview the result
main_df.head()

In [None]:
# Checkpoint 02: comments after spelling correction
checkpoint_02_path = './data/checkpoint_02_spell_corrected.csv'
main_df.to_csv(checkpoint_02_path, index=False)

### Removing stop words (caution: "not" and other relevant negation words should not be removed)

In [None]:
# Import library
import pandas as pd

# Resume from checkpoint 02 so spelling correction does not have to be re-run
checkpoint_02_path = './data/checkpoint_02_spell_corrected.csv'
main_df = pd.read_csv(checkpoint_02_path, low_memory=False)

In [None]:
# remove stop words and remove proper nouns phase 2


### Lemmatization

### Convert synonyms

### Create word or phrase list

In [None]:
# Function to list every contiguous word run in a text
def generate_word_phrase_list(text):
    """Return all contiguous word sequences of *text*, each joined by one space.

    The text is split on whitespace. Phrases are ordered by start word and,
    for the same start word, from shortest to longest. A text of n words
    yields n * (n + 1) / 2 phrases; an empty text yields an empty list.
    """
    tokens = text.split()
    total = len(tokens)
    return [
        ' '.join(tokens[start:stop])
        for start in range(total)
        for stop in range(start + 1, total + 1)
    ]

In [None]:
# Rows without processed text cannot produce phrases, so drop them
main_df = main_df.dropna(subset=['comment_processed'])

# Build the phrase list for each comment and store it as word_phrase_list
phrase_lists = main_df['comment_processed'].apply(generate_word_phrase_list)
main_df['word_phrase_list'] = phrase_lists

# Preview the result
main_df.head()

In [None]:
# Checkpoint 06: data including the word_phrase_list column
checkpoint_06_path = './data/checkpoint_06_with_word_phrase_list.csv'
main_df.to_csv(checkpoint_06_path, index=False)

## Modeling tasks

In [None]:
# Import libraries
import ast
import pandas as pd

# read checkpoint file
main_df = pd.read_csv('./data/checkpoint_06_with_word_phrase_list.csv', low_memory=False)

# to_csv wrote each phrase list as its text representation ("['a', 'a b']"),
# so read_csv returns plain strings. Turn them back into real lists; otherwise
# iterating a "list" walks single characters and `phrase in x` becomes a
# substring test instead of an exact phrase match.
main_df['word_phrase_list'] = main_df['word_phrase_list'].apply(ast.literal_eval)

### Create test and training sets and calculate similarity scores

In [None]:
# Draw 35 rows with classification 1 (named complaints here), fixed seed for repeatability
is_complaint = main_df['classification'] == 1
complaints_df = main_df.loc[is_complaint].sample(n=35, random_state=42)

# Draw 35 rows with classification 0 (named praises here), same seed
is_praise = main_df['classification'] == 0
praises_df = main_df.loc[is_praise].sample(n=35, random_state=42)

# Stack both samples, complaints first, and renumber the index from zero
test_df = pd.concat([complaints_df, praises_df], ignore_index=True)

test_df.head()

In [None]:
# Function to calculate similarity score
def calculate_similarity_score(row, alpha, word_phrase_list):
    """Score how similar one training comment is to the target comment.

    The score is

        n_match / (n_match + alpha * n_target_only + (1 - alpha) * n_train_only)

    which has the form of a Tversky index with beta = 1 - alpha.

    Parameters
    ----------
    row : one training row, either a single-row DataFrame or a Series. It
        must hold the training comment's own phrases under
        'word_phrase_list' and a 0/1 indicator under each target phrase.
    alpha : weight of target phrases missing from the training comment;
        (1 - alpha) weights training phrases missing from the target.
    word_phrase_list : phrases of the target comment.

    Returns
    -------
    float
        The similarity score, or 0.0 when the denominator is zero.
    """
    # A one-row DataFrame is reduced to its row so each cell is a scalar.
    # Taking len() of the one-row 'word_phrase_list' column would always give 1
    # rather than the number of phrases in the training comment.
    if getattr(row, 'ndim', 1) == 2:
        row = row.iloc[0]

    n_match = float(sum(row[phrase] for phrase in word_phrase_list)) # number of matches
    n_target_only = len(word_phrase_list) - n_match # number unmatched in target
    n_train_only = len(row['word_phrase_list']) - n_match # number unmatched in training

    denominator = n_match + (alpha * n_target_only) + ((1 - alpha) * n_train_only)
    if denominator == 0:
        # Nothing to compare on either side
        return 0.0

    return n_match / denominator

In [None]:
# Set alpha for similarity score
# In calculate_similarity_score, alpha weights phrases that occur only in the
# target comment and (1 - alpha) weights phrases that occur only in the
# training comment.
alpha = 0.8 # may need to change for sensitivity testing

In [None]:
# Import libraries
import os
import warnings

# Suppress the PerformanceWarning
warnings.filterwarnings("ignore", message="DataFrame is highly fragmented", category=pd.errors.PerformanceWarning)

# Create a folder for training files if it doesn't exist
os.makedirs('./training_files', exist_ok=True)

# A phrase whose text equals an existing column name (for example
# 'classification' or 'comment') would overwrite that column with a 0/1
# indicator, so such phrases are not used as indicator columns.
reserved_columns = set(main_df.columns)

# Build one training file per test comment (leave-one-out)
for i in range(len(test_df)):
    test_row = test_df.iloc[[i]] # get current row in dataframe format

    # Training data is every comment except the current test comment
    test_commentId = int(test_row['commentId'].iloc[0])
    train_df = main_df[main_df['commentId'] != test_commentId].copy()

    # Phrases of the current test comment (read by position, not by index label)
    test_phrases = test_row['word_phrase_list'].iloc[0]
    print(f"\ntest: {i}")
    print(f"len: {len(test_phrases)}")
    print(f"list[{i}]: {test_phrases}")
    word_phrase_list = [phrase for phrase in test_phrases if phrase not in reserved_columns]

    # One 0/1 column per phrase: 1 if the training comment contains exactly
    # that phrase. The columns are collected first and attached in a single
    # concat rather than inserted one at a time.
    indicator_columns = {
        phrase: train_df['word_phrase_list'].apply(
            lambda phrases, phrase=phrase: 1 if phrase in phrases else 0
        )
        for phrase in word_phrase_list
    }
    if indicator_columns:
        indicators = pd.DataFrame(indicator_columns, index=train_df.index)
        train_df = pd.concat([train_df, indicators], axis=1)

    # Remove rows that share no phrase with the test comment; copy so the
    # assignments below write to an independent frame
    train_df = train_df[train_df[word_phrase_list].sum(axis=1) != 0].copy()

    # Calculate similarity scores
    train_df['similarity_score'] = 0.0 # initialize
    for position, row_label in enumerate(train_df.index):
        train_row = train_df.iloc[[position]] # get current row in dataframe format
        train_df.at[row_label, 'similarity_score'] = calculate_similarity_score(train_row, alpha, word_phrase_list)

    # Get columns present in the word_phrase_list
    word_phrase_columns = train_df.columns[train_df.columns.isin(word_phrase_list)]

    # Drop indicator columns that are all ones or all zeros (zero variance)
    column_totals = train_df[word_phrase_columns].sum(axis=0)
    is_constant = (column_totals == len(train_df)) | (column_totals == 0)
    columns_to_drop = word_phrase_columns[is_constant.to_numpy()]
    train_df = train_df.drop(columns=columns_to_drop)

    # Create filename
    filename = f"./training_files/{test_commentId}.csv"

    # Write dataframe to CSV
    train_df.to_csv(filename, index=False)

In [None]:
# Checkpoint 07: the sampled test set and the full data, saved before regression
test_set_path = './data/test_df_initial.csv'
pre_regression_path = './data/checkpoint_07_pre_regression.csv'
test_df.to_csv(test_set_path, index=False)
main_df.to_csv(pre_regression_path, index=False)

### Logistic Regression with Similarity Scores as Weight

In [None]:
# Import library
import pandas as pd

# Resume from the checkpoint 07 files
test_set_path = './data/test_df_initial.csv'
pre_regression_path = './data/checkpoint_07_pre_regression.csv'
test_df = pd.read_csv(test_set_path, low_memory=False)
main_df = pd.read_csv(pre_regression_path, low_memory=False)

In [None]:
# Import library
import numpy as np

# Function to turn a fitted intercept and coefficients into a probability
# NOTE: a later cell defines another function with this same name; once that
# cell has run, this two-argument version is no longer reachable by name.
def calculate_probability(intercept, coefficients):
    """Return the logistic (sigmoid) of ``intercept + sum(coefficients)``."""
    log_odds = intercept + np.sum(coefficients)
    denominator = 1 + np.exp(-log_odds)
    return 1 / denominator

In [None]:
# Import library
from sklearn.linear_model import LogisticRegression

# Function for model training using logistic regression
def calculate_probability(test_row):
    """Fit a similarity-weighted logistic regression for one test comment.

    Reads ``./training_files/<commentId>.csv``, fits ``LogisticRegression``
    on its phrase-indicator columns with ``similarity_score`` as the sample
    weight, and returns the logistic of ``intercept + sum(coefficients)``.

    Parameters
    ----------
    test_row : the test comment, as a single-row DataFrame or a row Series;
        only its 'commentId' is read.

    Returns
    -------
    float
        Predicted probability of classification 1.
    """
    # Accept both a one-row DataFrame (column is a Series) and a plain row Series
    comment_id_value = test_row['commentId']
    if hasattr(comment_id_value, 'iloc'):
        comment_id_value = comment_id_value.iloc[0]
    commentId = int(comment_id_value)

    # open training file for test row (built from the row's own id, not from a
    # global left over from an earlier cell)
    filename = f"./training_files/{commentId}.csv"
    train_df = pd.read_csv(filename, low_memory=False)

    # set target variable
    y = train_df['classification']

    # set independent variables: everything except metadata, target and weight
    cols_to_drop = ['classification',
                    'commentId',
                    'comment',
                    'comment_processed',
                    'word_phrase_list',
                    'similarity_score']
    X = train_df.drop(columns=cols_to_drop)

    # set weight to similarity score
    sample_weights = train_df['similarity_score']

    # Do logistic regression modeling with similarity_score as weight
    log_reg = LogisticRegression()
    log_reg.fit(X, y, sample_weight=sample_weights)

    # Every feature column is a phrase taken from this test comment, so the
    # test comment has value 1 for each of them and its log-odds are the
    # intercept plus the sum of all coefficients. This is computed inline
    # because this function shares its name with the sigmoid helper, so
    # calling calculate_probability here would call this function again.
    log_odds = log_reg.intercept_[0] + np.sum(log_reg.coef_[0])
    probability = 1 / (1 + np.exp(-log_odds))

    return probability

In [None]:
# Score every test comment and record prediction, calibration and its
# confusion-matrix cell (TP, TN, FP, FN)
cutoff = 0.5 # probability cutoff for prediction
for i, index in enumerate(test_df.index):
    test_row = test_df.iloc[[i]] # get current row in dataframe format

    # predicted probability, true label and thresholded prediction
    probability = calculate_probability(test_row)
    actual = test_df.at[index, 'classification']
    prediction = 1 if probability >= cutoff else 0

    test_df.at[index, 'probability'] = probability
    test_df.at[index, 'prediction'] = prediction

    # calibration is the absolute gap between probability and true label
    test_df.at[index, 'calibration'] = abs(probability - actual)

    # exactly one of the four outcome flags is 1 for a 0/1 label
    test_df.at[index, 'TP'] = 1 if (prediction == 1 and actual == 1) else 0
    test_df.at[index, 'TN'] = 1 if (prediction == 0 and actual == 0) else 0
    test_df.at[index, 'FP'] = 1 if (prediction == 1 and actual == 0) else 0
    test_df.at[index, 'FN'] = 1 if (prediction == 0 and actual == 1) else 0

In [None]:
# Checkpoint: test set with probabilities, predictions and outcome flags
completed_test_path = './data/test_df_completed.csv'
test_df.to_csv(completed_test_path, index=False)

### Calculate Metrics

In [None]:
# Import library
import pandas as pd

# Resume from the scored test set
completed_test_path = './data/test_df_completed.csv'
test_df = pd.read_csv(completed_test_path, low_memory=False)

In [None]:
# Total each outcome flag over the whole test set
TP, TN, FP, FN = (test_df[flag].sum() for flag in ('TP', 'TN', 'FP', 'FN'))

# Accuracy as a percentage: correct predictions over all predictions
correct = TP + TN
total = correct + FP + FN
accuracy = round(correct / total * 100, 2)

# Mean and standard deviation of the calibration column
calibration = test_df['calibration']
average_calibration = round(calibration.mean(), 2)
std_dev_calibration = round(calibration.std(), 2)


print(f"Model accuracy: {accuracy}%")
print(f"Average of calibration: {average_calibration}")
print(f"Standard deviation of calibration: {std_dev_calibration}")