# HAP 789 Sentiment Analysis Project

# Load data and initial data prep

In [None]:
# Import library
import pandas as pd

# Load the raw comment records; the path is relative to the notebook's working directory
df = pd.read_csv('./data/TrainingRecords-4-4-2024.csv')

# Preview the first rows to confirm the file loaded
df.head()

In [None]:
# Discard the dateCreated column, keep only the first row seen for each
# commentId, and renumber the index from zero
df = (
    df
    .drop(columns=['dateCreated'])
    .drop_duplicates(subset='commentId', keep='first')
    .reset_index(drop=True)
)

df.head()

In [None]:
# commentIds of comments that must be excluded as invalid
commentIds_to_drop = [180459, 151656, 179845, 179923]

# Keep rows that have comment text and whose commentId is not on the list above
has_comment = df['comment'].notna()
is_invalid = df['commentId'].isin(commentIds_to_drop)
df = df[has_comment & ~is_invalid]

df.head()

In [None]:
# Summarise column dtypes and non-null counts after the initial cleaning
df.info()

In [None]:
# Work on a separate copy so df keeps the cleaned input; the new
# comment_processed column starts out as a duplicate of the comment text
main_df = df.copy()
main_df = main_df.assign(comment_processed=main_df['comment'])

main_df.head()

# Remove special characters and numbers

In [None]:
# Import library
import re # for regular expressions

# Delete every character that is not an ASCII letter or whitespace
# (punctuation, digits, symbols); removed characters are not replaced by a space
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
main_df['comment_processed'] = main_df['comment_processed'].map(lambda text: non_letter_pattern.sub('', str(text)))

# Change all to lower case

In [None]:
# Overwrite comment_processed with its lower-cased text
main_df['comment_processed'] = main_df['comment_processed'].str.lower()

# Remove proper nouns

In [None]:
# Import library
import spacy

# Load spaCy's small English pipeline; its named entity recognition (NER)
# component is what remove_proper_nouns relies on
nlp = spacy.load("en_core_web_sm")

# Function to remove proper nouns from text
def remove_proper_nouns(text):
    """Return ``text`` without the tokens spaCy tagged as part of a named entity.

    The text is run through the module-level ``nlp`` pipeline. Tokens whose
    ``ent_type_`` is the empty string (outside any recognised entity) are kept
    and joined back together with single spaces.

    NOTE(review): the filter is on entity membership rather than part of
    speech, so every entity type the model assigns is removed -- confirm this
    matches the intended "proper noun" removal.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.ent_type_ == "":
            kept_tokens.append(token.text)
    return ' '.join(kept_tokens)

In [None]:
# Apply the remove_proper_nouns function to the comment_processed column
# (one spaCy pipeline call per row, overwriting the column in place)
main_df['comment_processed'] = main_df['comment_processed'].apply(remove_proper_nouns)

In [None]:
# Keep only rows whose processed comment is present and not the empty string
processed_text = main_df['comment_processed']
main_df = main_df[processed_text.notnull() & processed_text.ne('')]

main_df.head()

In [None]:
# Checkpoint 1: comments after character cleanup, lower-casing and entity removal
main_df.to_csv('./data/checkpoint_01_no_proper.csv', index=False)

# Text pre-processing (complex)

In [None]:
# Import library
import pandas as pd

# read checkpoint file
# NOTE(review): read_csv turns empty cells and its default NA markers into NaN,
# so comment_processed may hold non-string values after this reload -- confirm.
main_df = pd.read_csv('./data/checkpoint_01_no_proper.csv', low_memory=False)

# Spelling correction

In [None]:
# Import library
from textblob import TextBlob

# Function to correct spelling mistakes in a text
def correct_spelling(text):
    """Return ``text`` with TextBlob's spelling correction applied.

    Parameters
    ----------
    text : str
        Comment text to correct.

    Returns
    -------
    str
        The corrected text, or ``''`` when ``text`` is not a string.
    """
    # The column is reloaded from a CSV checkpoint, which can yield float NaN
    # cells. TextBlob expects a string, so map anything else to '' and let the
    # empty-row filter that follows the apply step discard the row.
    if not isinstance(text, str):
        return ''
    return str(TextBlob(text).correct())

In [None]:
# Apply the correct_spelling function to the comment_processed column
# (one TextBlob call per row, overwriting the column in place)
main_df['comment_processed'] = main_df['comment_processed'].apply(correct_spelling)

# Output the DataFrame with corrected spelling
main_df.head()

In [None]:
# Keep only rows whose processed comment is present and not the empty string
processed_text = main_df['comment_processed']
main_df = main_df[processed_text.notnull() & processed_text.ne('')]

In [None]:
# Checkpoint 2: comments after spelling correction
main_df.to_csv('./data/checkpoint_02_spell_corrected.csv', index=False)

# Removing stop words 
(Caution: "no", "not", and other relevant negation words should not be removed.)

In [None]:
# Import library
import pandas as pd

# read checkpoint file
# NOTE(review): cells that read_csv treats as missing come back as NaN, not as strings -- confirm.
main_df = pd.read_csv('./data/checkpoint_02_spell_corrected.csv', low_memory=False)

In [None]:
import spacy

# Load English model from spaCy
nlp = spacy.load("en_core_web_sm")

# Get the collection of English stop words that ships with spaCy
stop_words = nlp.Defaults.stop_words

# Sort before exporting: stop_words is an unordered collection, so
# list(stop_words) could write the rows in a different order on every run.
# A sorted export is reproducible and easier to compare between runs.
stop_words_df = pd.DataFrame({"Stop_Words": sorted(stop_words)})

# Save the DataFrame to a CSV file (the starting point for the custom list)
stop_words_df.to_csv("./data/spacy_english_stop_words.csv", index=False)

In [None]:
# Import library
import re # for regular expressions

# Read the processed version of the exported spaCy list; this custom list is
# the one used for stop word removal instead of spaCy's own
custom_stop_words_df = pd.read_csv("./data/spacy_english_stop_words_processed_new.csv")

# Strip everything except ASCII letters and whitespace from each entry
letters_only_pattern = re.compile(r'[^a-zA-Z\s]')
custom_stop_words_df['Stop_Words'] = custom_stop_words_df['Stop_Words'].map(lambda word: letters_only_pattern.sub('', str(word)))

# Plain Python list of the cleaned stop words
custom_stop_words = list(custom_stop_words_df['Stop_Words'])

In [None]:
# create a function to remove custom stop words
def remove_stop_words(text):
    """Return ``text`` without the tokens listed in ``custom_stop_words``.

    Parameters
    ----------
    text : str
        Comment text to filter.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or ``''`` when ``text``
        is not a string.
    """
    # The column is reloaded from a CSV checkpoint, which can yield float NaN
    # cells; map anything that is not a string to '' so the empty-row filter
    # that follows the apply step discards the row.
    if not isinstance(text, str):
        return ''

    # Membership tests against a set are constant time; scanning the list for
    # every token is not.
    stop_word_set = set(custom_stop_words)

    # Process the text with spaCy
    doc = nlp(text)

    # Filter out the stop words (exact match on the token text)
    return " ".join(token.text for token in doc if token.text not in stop_word_set)

In [None]:
# Apply the stop word removal function to the comment_processed column
# (overwrites the column in place)
main_df['comment_processed'] = main_df['comment_processed'].apply(remove_stop_words)

In [None]:
# Keep only rows whose processed comment is present and not the empty string
processed_text = main_df['comment_processed']
main_df = main_df[processed_text.notnull() & processed_text.ne('')]

main_df.head()

In [None]:
# Checkpoint 3: comments after custom stop word removal
main_df.to_csv('./data/checkpoint_03_no_stop_words.csv', index=False)

# Lemmatization

In [None]:
# Import library
import pandas as pd

# read checkpoint file
# NOTE(review): cells that read_csv treats as missing come back as NaN, not as strings -- confirm.
main_df = pd.read_csv('./data/checkpoint_03_no_stop_words.csv', low_memory=False)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Download necessary NLTK resources: tokenizer data and the WordNet corpus
# NOTE(review): some NLTK versions may need additional resources (e.g. 'punkt_tab',
# 'omw-1.4') -- confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')

# Initialize lemmatizer (shared by lemmatize_comment)
lemmatizer = WordNetLemmatizer()

# Function to lemmatize a single comment
def lemmatize_comment(comment):
    """Lemmatize every token of ``comment`` with the module-level WordNet lemmatizer.

    Parameters
    ----------
    comment : str
        Comment text to lemmatize.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces, or ``''`` when
        ``comment`` is empty or not a string (``None``, float NaN).
    """
    # The previous guard (`comment == ''` / `comment is None`) never matched a
    # float NaN, so missing values read back from the CSV checkpoint reached
    # word_tokenize. Checking the type covers None and NaN alike.
    if not isinstance(comment, str) or comment == '':
        return ''

    # lemmatize() is called without a part-of-speech argument, so the
    # lemmatizer's default is used for every token.
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(comment))

In [None]:
# Apply the lemmatize function to the comment_processed column
# (overwrites the column in place)
main_df['comment_processed'] = main_df['comment_processed'].apply(lemmatize_comment)

In [None]:
# Keep only rows whose processed comment is present and not the empty string
processed_text = main_df['comment_processed']
main_df = main_df[processed_text.notnull() & processed_text.ne('')]

main_df.head()

In [None]:
# Checkpoint 4: comments after lemmatization
main_df.to_csv('./data/checkpoint_04_lemmatized.csv', index=False)

# Convert synonyms

In [None]:
# Import library
import pandas as pd

# read checkpoint file
# NOTE(review): cells that read_csv treats as missing come back as NaN, not as strings -- confirm.
main_df = pd.read_csv('./data/checkpoint_04_lemmatized.csv', low_memory=False)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Download necessary NLTK resources: tokenizer data, the WordNet corpus and
# the 'omw-1.4' resource used alongside it
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Adding this line to download the required "omw-1.4" resource

# Function to find the most common synonym for a word using WordNet
def most_common_synonym(word):
    """Return the lemma name that occurs most often across ``word``'s WordNet synsets.

    Every lemma of every synset returned by ``wordnet.synsets(word)`` is
    collected, with underscores replaced by spaces, and the name that appears
    most often is returned in lower case. Ties go to the name encountered
    first. Counting happens before lower-casing, so names that differ only by
    case are counted separately.

    Parameters
    ----------
    word : str
        Token to look up.

    Returns
    -------
    str
        The most frequent lemma name (lower-cased), or ``word`` unchanged when
        WordNet has no synset for it.
    """
    from collections import Counter

    synsets = wordnet.synsets(word)  # Get all synsets for the word
    if not synsets:
        return word  # If no synsets found, return the original word

    # Flatten the lemmas of all synsets, replacing underscores with spaces
    lemma_names = [
        lemma.name().replace('_', ' ')
        for synset in synsets
        for lemma in synset.lemmas()
    ]

    # A single counting pass replaces calling list.count() once per element.
    # Counter keeps first-seen key order and max() returns the first maximum,
    # so ties resolve exactly as they did with the dict comprehension.
    lemma_counts = Counter(lemma_names)
    return max(lemma_counts, key=lemma_counts.get).lower()

# Function to replace each word in a comment with its most common synonym
def replace_with_synonyms(comment):
    """Replace every token of ``comment`` with the result of ``most_common_synonym``.

    Parameters
    ----------
    comment : str
        Comment text to process.

    Returns
    -------
    str
        The replaced tokens joined by single spaces, or ``''`` when
        ``comment`` is not a string.
    """
    # The column is reloaded from a CSV checkpoint, which can yield float NaN
    # cells; map anything that is not a string to '' so the empty-row filter
    # that follows the apply step discards the row.
    if not isinstance(comment, str):
        return ''

    # Tokenize, map each token, and join the results back into a comment
    return ' '.join(most_common_synonym(token) for token in word_tokenize(comment))

In [None]:
# Apply the synonym function to the comment_processed column
# (overwrites the column in place)
main_df['comment_processed'] = main_df['comment_processed'].apply(replace_with_synonyms)

In [None]:
# Keep only rows whose processed comment is present and not the empty string
processed_text = main_df['comment_processed']
main_df = main_df[processed_text.notnull() & processed_text.ne('')]

main_df.head()

In [None]:
# Checkpoint 5: comments after synonym replacement
main_df.to_csv('./data/checkpoint_05_synonyms.csv', index=False)

# Create word or phrase list

In [1]:
# Import library
import pandas as pd

# read checkpoint file (rows with a missing comment_processed are dropped before the phrase lists are built)
main_df = pd.read_csv('./data/checkpoint_05_synonyms.csv', low_memory=False)

In [2]:
# Function to generate word phrase list from text
def generate_word_phrase_list(text):
    """Return every contiguous run of words in ``text`` as a space-joined phrase.

    ``text`` is split on whitespace. Phrases are ordered by start position
    and, for the same start, from shortest to longest; a text of n words
    yields n * (n + 1) / 2 phrases.

    Parameters
    ----------
    text : str
        Whitespace-separated words.

    Returns
    -------
    list of str
        All single words and multi-word phrases; empty when ``text`` holds no
        words.
    """
    words = text.split()
    word_count = len(words)
    return [
        ' '.join(words[start:stop])
        for start in range(word_count)
        for stop in range(start + 1, word_count + 1)
    ]

In [3]:
# Remove missing values for comment_processed
# (generate_word_phrase_list calls .split() and therefore needs strings)
main_df = main_df.dropna(subset=['comment_processed'])

# Apply the generate_word_phrase_list function to the comment_processed column to create word_phrase_list column
# (each cell holds a Python list of phrases)
main_df['word_phrase_list'] = main_df['comment_processed'].apply(generate_word_phrase_list)

# Output the DataFrame with word phrase list
main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed,word_phrase_list
0,129687,Moral of the story while the nurses are all gr...,0,moral story nurse great body money,"[moral, moral story, moral story nurse, moral ..."
1,169075,If you are thinking about improving your appea...,0,think better appearance want competent,"[think, think better, think better appearance,..."
2,88567,but I felt that my concerns were brushed aside...,1,feel concern brush aside go collapse,"[feel, feel concern, feel concern brush, feel ..."
3,147104,My tear trough filler in my left eye looked li...,1,tear trough filler leave eye look like garage,"[tear, tear trough, tear trough filler, tear t..."
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,thank encourage way,"[thank, thank encourage, thank encourage way, ..."


In [4]:
# Checkpoint 6: final training file, including the word_phrase_list column
# NOTE(review): the list cells are written as text; a later reload would give
# strings rather than lists -- confirm before reusing this file for matching.
main_df.to_csv('./data/checkpoint_06_final_training_file.csv', index=False)

# Final cleanups

In [5]:
# commentIds that must not be picked as test comments
commentIds_to_drop = [181673, 95658]

# Candidate pool for the test sample: everything in main_df except those ids
selection_df = main_df[~main_df['commentId'].isin(commentIds_to_drop)]

# Flag processed comments that consist of exactly one word ...
is_single_word = selection_df['comment_processed'].str.split().str.len() == 1

# ... and exclude them from the pool
selection_df = selection_df[~is_single_word]

# Modeling tasks

## Create test and training sets and calculate similarity scores

In [6]:
# Draw 35 rows with classification 1 from the selection pool (fixed seed)
complaints_df = selection_df[selection_df['classification'] == 1].sample(n=35, random_state=42)

# Draw 35 rows with classification 0 with the same seed
praises_df = selection_df[selection_df['classification'] == 0].sample(n=35, random_state=42)

# Stack both samples, classification-1 rows first, with a fresh 0..n-1 index
test_df = pd.concat([complaints_df, praises_df], ignore_index=True)

test_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed,word_phrase_list
0,84833,The doctor talked to me with half his body in ...,1,doctor talk body examination room,"[doctor, doctor talk, doctor talk body, doctor..."
1,58035,And the registration person was not so nice.,1,registration person not nice,"[registration, registration person, registrati..."
2,190897,This is a pretend clinic they pretend to love ...,1,pretend clinic pretend love look fund,"[pretend, pretend clinic, pretend clinic prete..."
3,173845,"When I complained about it, I was asked to sen...",1,complain ask send picture,"[complain, complain ask, complain ask send, co..."
4,168214,Acts like a history teacher when really he's a...,1,act like history teacher dumb doctor keep ask ...,"[act, act like, act like history, act like his..."


In [7]:
# Function to calculate similarity score
def calculate_similarity_score(row, alpha, word_phrase_list, word_phrase_list_train):
    """Score how similar one training comment is to the test comment.

    The score is

        n_match / (n_match + alpha * n_target_only + (1 - alpha) * n_train_only)

    where ``n_match`` is the sum of the values left in ``row`` once the
    metadata columns are dropped, ``n_target_only`` is
    ``len(word_phrase_list) - n_match`` and ``n_train_only`` is
    ``len(word_phrase_list_train) - n_match``.

    Parameters
    ----------
    row : pandas.DataFrame
        One-row frame for the training comment. It must contain all metadata
        columns listed below; every other column is summed.
    alpha : float
        Weight of the phrases found only in the test comment; ``1 - alpha``
        weights the phrases found only in the training comment.
    word_phrase_list : list
        Phrases of the test comment.
    word_phrase_list_train : list
        Phrases of the training comment.

    Returns
    -------
    float
        The similarity score.

    NOTE(review): both lengths count repeated phrases -- confirm that is the
    intended counting.
    """
    metadata_columns = [
        'commentId',
        'comment',
        'classification',
        'comment_processed',
        'word_phrase_list',
        'similarity_score',
    ]

    # Only the first (and only) row is read, by position
    indicator_values = row.drop(columns=metadata_columns).iloc[0]

    matched = indicator_values.sum()
    target_only = len(word_phrase_list) - matched
    train_only = len(word_phrase_list_train) - matched

    weighted_total = matched + (alpha * target_only) + ((1 - alpha) * train_only)
    return matched / weighted_total

In [8]:
# Set alpha for similarity score: alpha weights phrases found only in the test
# comment, (1 - alpha) weights phrases found only in the training comment
# NOTE(review): output paths below contain "alpha50" as fixed text -- keep them in sync if alpha changes.
alpha = 0.5 # may need to change for sensitivity testing

In [9]:
# Import libraries
import os
import warnings

# Suppress the PerformanceWarning raised when many columns are added one at a time
warnings.filterwarnings("ignore", message="DataFrame is highly fragmented", category=pd.errors.PerformanceWarning)

# Create the exact folder the training files are written to. Creating only
# './training_files' left './training_files/weighted_alpha50' missing, so the
# to_csv call at the end of the loop failed on a fresh run.
os.makedirs('./training_files/weighted_alpha50', exist_ok=True)

# Build one training file per test comment
for i in range(len(test_df)):
    test_row = test_df.iloc[[i]] # get current row in dataframe format

    # Read the scalars by position; calling int() on a one-element Series is
    # deprecated in recent pandas, and positional access does not depend on
    # test_df's index labels.
    test_commentId = int(test_row['commentId'].iloc[0])
    word_phrase_list = test_row['word_phrase_list'].iloc[0]

    # Training candidates: every comment in main_df except the current test
    # comment. The explicit copy makes the column assignments below operate on
    # an independent frame.
    train_df = main_df[main_df['commentId'] != test_commentId].copy()

    # One 0/1 indicator column per test phrase: 1 when the phrase is an element
    # of the training row's word_phrase_list (exact match). A separate pass
    # that first set every column to 0 was redundant and has been removed.
    # NOTE(review): a phrase equal to an existing column name (e.g. 'comment')
    # would overwrite that column -- confirm this cannot happen in the data.
    for word_phrase in word_phrase_list:
        train_df[word_phrase] = train_df['word_phrase_list'].apply(lambda x: 1 if word_phrase in x else 0)

    # Keep only training rows that share at least one phrase with the test comment
    train_df = train_df[train_df[word_phrase_list].sum(axis=1) != 0].copy()

    # Calculate similarity scores
    train_df['similarity_score'] = 0.0 # initialize

    # Only the index label and the row position are needed, so iterate over the
    # index instead of materialising every row with iterrows()
    for position, index in enumerate(train_df.index):
        train_row = train_df.iloc[[position]] # get current row in dataframe format
        word_phrase_list_train = train_row['word_phrase_list'].iloc[0]
        train_df.at[index, 'similarity_score'] = calculate_similarity_score(train_row, alpha, word_phrase_list, word_phrase_list_train)

    # Get columns present in the word_phrase_list
    word_phrase_columns = train_df.columns[train_df.columns.isin(word_phrase_list)]

    # Get all columns that are all ones or zeros within word_phrase_columns
    column_totals = train_df[word_phrase_columns].sum(axis=0)
    columns_to_drop = word_phrase_columns[(column_totals == len(train_df)) | (column_totals == 0)]

    # Drop these columns with all ones or zeros (zero variance)
    train_df = train_df.drop(columns=columns_to_drop)

    # Create filename
    filename = f"./training_files/weighted_alpha50/{test_commentId}.csv"

    # Write dataframe to CSV
    train_df.to_csv(filename, index=False)

In [10]:
# Function to calculate the regression weight of one training row
def calculate_weight(sum_similarity_above_threshold, similarity):
    """Return ``similarity * (1 - sum_similarity_above_threshold)``, floored at zero.

    Parameters
    ----------
    sum_similarity_above_threshold : float
        Sum of the similarity scores that are greater than ``similarity``.
    similarity : float
        Similarity score of the row being weighted.

    Returns
    -------
    float or int
        The computed weight when it is greater than zero, otherwise ``0``.
    """
    weight = similarity * (1 - sum_similarity_above_threshold)

    # Anything that is not strictly positive is reported as 0
    return weight if weight > 0 else 0

In [11]:
# For every test comment, add a regression_weight column to its training file.
# The loops use distinct variable names: previously the inner loop rebound the
# outer loop's `index` and `row`, and the counters `i` and `j` served no purpose.
for _, test_row in test_df.iterrows():
    # get commentId
    comment_id = test_row['commentId']

    # open training file for test row
    filename = f"./training_files/weighted_alpha50/{comment_id}.csv"
    train_df = pd.read_csv(filename, low_memory=False)

    similarity_scores = train_df['similarity_score']

    # Create the column up front so it also exists when the file has no rows
    train_df['regression_weight'] = 0.0

    for train_index, similarity in similarity_scores.items():
        # sum of the similarity scores strictly greater than the current row's
        sum_similarity_above_threshold = similarity_scores[similarity_scores > similarity].sum()

        # calculate the weights
        train_df.at[train_index, 'regression_weight'] = calculate_weight(sum_similarity_above_threshold, similarity)

    # Write dataframe back to the same CSV
    train_df.to_csv(filename, index=False)

In [12]:
# Store the sampled test set before any predictions are added
test_df.to_csv('./data/test_df_initial_weighted_alpha50.csv', index=False)

## Logistic Regression with Similarity Scores as Weight

In [13]:
# Import library
import numpy as np

# Function to calculate probability of complaint
def calculate_probability(intercept, coefficients):
    """Return the logistic function of ``intercept + sum(coefficients)``.

    Parameters
    ----------
    intercept : float
        Intercept term.
    coefficients : array-like of float
        Coefficients; all of them are added to the intercept.

    Returns
    -------
    float
        ``1 / (1 + exp(-(intercept + sum(coefficients))))``.
    """
    log_odds = intercept + np.sum(coefficients)
    inverse_odds = np.exp(-log_odds)
    return 1 / (1 + inverse_odds)

In [14]:
# Import library
from sklearn.linear_model import LogisticRegression

# Loop each row in test_df: fit a weighted logistic regression on that row's
# training file, then store probability, prediction, calibration and TP/TN/FP/FN
# NOTE(review): i is incremented at the end of each iteration but never read in this cell.
i = 0
cutoff = 0.5 # probability cutoff for prediction
for index, row in test_df.iterrows():
    # get commentId
    comment_id = row['commentId']

    # open training file for test row
    filename = f"./training_files/weighted_alpha50/{comment_id}.csv"
    train_df = pd.read_csv(filename, low_memory=False)

    # set target variable
    y = train_df['classification']

    # set independent variables: every column that is not listed here
    cols_to_drop = ['classification',
                    'commentId',
                    'comment',
                    'comment_processed',
                    'word_phrase_list',
                    'similarity_score',
                    'regression_weight'
                    ]
    X = train_df.drop(columns=cols_to_drop)

    # use the regression_weight column as the per-row sample weight
    sample_weights = train_df['regression_weight']

    # Do logistic regression modeling with regression_weight as sample weight
    # NOTE(review): behaviour when y holds a single class or X has no columns is not handled here -- confirm.
    log_reg = LogisticRegression()
    log_reg.fit(X, y, sample_weight = sample_weights)
    
    # create a column in the test_df for the predicted probability 
    # (intercept plus the sum of all fitted coefficients, passed through the logistic function)
    test_df.at[index, 'probability'] = calculate_probability(log_reg.intercept_[0], log_reg.coef_[0]) 

    # calculate prediction from probability using cutoff
    test_df.at[index, 'prediction'] = 1 if test_df.at[index, 'probability'] >= cutoff else 0

    # calculate calibration: absolute gap between probability and the actual classification
    test_df.at[index, 'calibration'] = abs(test_df.at[index, 'probability'] - test_df.at[index, 'classification'])
    
    # Calculate TP, TN, FP, FN (exactly one of the four is 1 for each row)
    test_df.at[index, 'TP'] = 1 if (test_df.at[index, 'prediction'] == 1 and test_df.at[index, 'classification'] == 1) else 0
    test_df.at[index, 'TN'] = 1 if (test_df.at[index, 'prediction'] == 0 and test_df.at[index, 'classification'] == 0) else 0
    test_df.at[index, 'FP'] = 1 if (test_df.at[index, 'prediction'] == 1 and test_df.at[index, 'classification'] == 0) else 0
    test_df.at[index, 'FN'] = 1 if (test_df.at[index, 'prediction'] == 0 and test_df.at[index, 'classification'] == 1) else 0

    i += 1

## Average Classification for Maximum Match

In [15]:
# For each test comment, average the classification of the training rows that
# matched the largest number of its phrases (AveClass), then score the result
cutoff = 0.5 # probability cutoff for prediction
for index, row in test_df.iterrows():
    comment_id = row['commentId']
    actual = row['classification']

    # training file that belongs to this test comment
    filename = f"./training_files/weighted_alpha50/{comment_id}.csv"
    train_df = pd.read_csv(filename, low_memory=False)

    # drop everything except classification and the phrase indicator columns
    cols_to_drop = ['commentId',
                    'comment',
                    'comment_processed',
                    'word_phrase_list',
                    'similarity_score',
                    'regression_weight']
    train_df = train_df.drop(columns=cols_to_drop)

    # number of matched phrases per training row
    row_sums = train_df.drop(columns=['classification']).sum(axis=1)

    # keep the rows that reach the highest match count
    train_df = train_df[row_sums == row_sums.max()]

    # mean classification of those rows, its thresholded prediction and the
    # absolute gap to the actual classification
    ave_class = float(train_df['classification'].mean())
    predicted = 1 if ave_class >= cutoff else 0

    test_df.at[index, 'AveClass'] = ave_class
    test_df.at[index, 'AveClassPred'] = predicted
    test_df.at[index, 'AveClassCalib'] = abs(ave_class - actual)

    # confusion-matrix flags, written in the order TP, TN, FP, FN
    outcome_flags = {
        'AveClass_TP': predicted == 1 and actual == 1,
        'AveClass_TN': predicted == 0 and actual == 0,
        'AveClass_FP': predicted == 1 and actual == 0,
        'AveClass_FN': predicted == 0 and actual == 1,
    }
    for column_name, is_hit in outcome_flags.items():
        test_df.at[index, column_name] = 1 if is_hit else 0

## Average Classification for Non-Zero Weight

In [16]:
# For each test comment, average the classification of the training rows whose
# regression_weight is greater than zero (WeightClass), then score the result
cutoff = 0.5 # probability cutoff for prediction
for index, row in test_df.iterrows():
    comment_id = row['commentId']
    actual = row['classification']

    # training file that belongs to this test comment
    filename = f"./training_files/weighted_alpha50/{comment_id}.csv"
    train_df = pd.read_csv(filename, low_memory=False)

    # drop the metadata columns (regression_weight is kept for the filter)
    cols_to_drop = ['commentId',
                    'comment',
                    'comment_processed',
                    'word_phrase_list',
                    'similarity_score']
    train_df = train_df.drop(columns=cols_to_drop)

    # keep only the rows that carry a positive weight
    train_df = train_df[train_df['regression_weight'] > 0]

    # mean classification of those rows, its thresholded prediction and the
    # absolute gap to the actual classification
    weight_class = float(train_df['classification'].mean())
    predicted = 1 if weight_class >= cutoff else 0

    test_df.at[index, 'WeightClass'] = weight_class
    test_df.at[index, 'WeightClassPred'] = predicted
    test_df.at[index, 'WeightClassCalib'] = abs(weight_class - actual)

    # confusion-matrix flags, written in the order TP, TN, FP, FN
    outcome_flags = {
        'WeightClass_TP': predicted == 1 and actual == 1,
        'WeightClass_TN': predicted == 0 and actual == 0,
        'WeightClass_FP': predicted == 1 and actual == 0,
        'WeightClass_FN': predicted == 0 and actual == 1,
    }
    for column_name, is_hit in outcome_flags.items():
        test_df.at[index, column_name] = 1 if is_hit else 0

In [17]:
# Store the test set with the predictions of all three methods
test_df.to_csv('./data/test_df_completed_weighted_alpha50.csv', index=False)

# Calculate Metrics

In [18]:
# Confusion-matrix totals for the regression prediction over the whole test set
TP, TN, FP, FN = (test_df[flag_column].sum() for flag_column in ('TP', 'TN', 'FP', 'FN'))
accuracy = round((TP + TN) / (TP + TN + FP + FN) * 100, 2)

# Mean and standard deviation of the calibration column
average_calibration = round(test_df['calibration'].mean(),2)
std_dev_calibration = round(test_df['calibration'].std(),2)

print("\n".join([
    "USING REGRESSION PREDICTION",
    "CONFUSION MATRIX:",
    f"TP: {TP} | FP: {FP}",
    f"FN: {FN} | TN: {TN}",
    f"\n\nModel accuracy: {accuracy}%",
    f"Average of calibration: {average_calibration}",
    f"Standard deviation of calibration: {std_dev_calibration}",
]))

USING REGRESSION PREDICTION
CONFUSION MATRIX:
TP: 19.0 | FP: 4.0
FN: 16.0 | TN: 31.0


Model accuracy: 71.43%
Average of calibration: 0.32
Standard deviation of calibration: 0.34


In [19]:
# Confusion-matrix totals for the maximum-match average over the whole test set
TP, TN, FP, FN = (
    test_df[flag_column].sum()
    for flag_column in ('AveClass_TP', 'AveClass_TN', 'AveClass_FP', 'AveClass_FN')
)
accuracy = round((TP + TN) / (TP + TN + FP + FN) * 100, 2)

# Mean and standard deviation of the AveClassCalib column
average_calibration = round(test_df['AveClassCalib'].mean(),2)
std_dev_calibration = round(test_df['AveClassCalib'].std(),2)

print("\n".join([
    "USING AVERAGE CLASSIFICATION FOR MAXIMUM MATCHES",
    "CONFUSION MATRIX:",
    f"TP: {TP} | FP: {FP}",
    f"FN: {FN} | TN: {TN}",
    f"\n\nModel accuracy: {accuracy}%",
    f"Average of calibration: {average_calibration}",
    f"Standard deviation of calibration: {std_dev_calibration}",
]))

USING AVERAGE CLASSIFICATION FOR MAXIMUM MATCHES
CONFUSION MATRIX:
TP: 23.0 | FP: 3.0
FN: 12.0 | TN: 32.0


Model accuracy: 78.57%
Average of calibration: 0.25
Standard deviation of calibration: 0.33


In [20]:
# Confusion-matrix totals for the non-zero-weight average over the whole test set
TP, TN, FP, FN = (
    test_df[flag_column].sum()
    for flag_column in ('WeightClass_TP', 'WeightClass_TN', 'WeightClass_FP', 'WeightClass_FN')
)
accuracy = round((TP + TN) / (TP + TN + FP + FN) * 100, 2)

# Mean and standard deviation of the WeightClassCalib column
average_calibration = round(test_df['WeightClassCalib'].mean(),2)
std_dev_calibration = round(test_df['WeightClassCalib'].std(),2)

print("\n".join([
    "USING AVERAGE CLASSIFICATION FOR NON-ZERO WEIGHTS",
    "CONFUSION MATRIX:",
    f"TP: {TP} | FP: {FP}",
    f"FN: {FN} | TN: {TN}",
    f"\n\nModel accuracy: {accuracy}%",
    f"Average of calibration: {average_calibration}",
    f"Standard deviation of calibration: {std_dev_calibration}",
]))

USING AVERAGE CLASSIFICATION FOR NON-ZERO WEIGHTS
CONFUSION MATRIX:
TP: 21.0 | FP: 2.0
FN: 14.0 | TN: 33.0


Model accuracy: 77.14%
Average of calibration: 0.32
Standard deviation of calibration: 0.32


# ChatGPT Predictions