# Load data and initial data prep

In [1]:
# Import library
import pandas as pd

# import data
df = pd.read_csv('./data/TrainingRecords-4-4-2024.csv')

df.head()

Unnamed: 0,commentId,comment,classification,dateCreated
0,129687,Moral of the story while the nurses are all gr...,0,2019-06-03 18:15:21.263
1,169075,If you are thinking about improving your appea...,0,2022-04-30 21:10:15.950
2,88567,but I felt that my concerns were brushed aside...,1,2015-10-12 17:05:36.043
3,147104,My tear trough filler in my left eye looked li...,1,2020-07-12 17:13:43.700
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,2019-10-09 02:31:02.590


In [2]:
# drop dateCreated column
df = df.drop(columns=['dateCreated'])

# Remove duplicates based on commentId and keep the first occurrence
df = df.drop_duplicates(subset='commentId', keep='first').reset_index(drop=True)

df.head()

Unnamed: 0,commentId,comment,classification
0,129687,Moral of the story while the nurses are all gr...,0
1,169075,If you are thinking about improving your appea...,0
2,88567,but I felt that my concerns were brushed aside...,1
3,147104,My tear trough filler in my left eye looked li...,1
4,137347,"So, thank you Dr. Whitaker for all you have do...",0


In [3]:
# drop rows with missing values for comments
df = df.dropna(subset=['comment'])

# drop invalid comments
# List of commentIds to drop
commentIds_to_drop = [180459, 151656, 179845, 179923]

# Drop rows with specified commentIds
df = df[~df['commentId'].isin(commentIds_to_drop)]

df.head()

Unnamed: 0,commentId,comment,classification
0,129687,Moral of the story while the nurses are all gr...,0
1,169075,If you are thinking about improving your appea...,0
2,88567,but I felt that my concerns were brushed aside...,1
3,147104,My tear trough filler in my left eye looked li...,1
4,137347,"So, thank you Dr. Whitaker for all you have do...",0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105778 entries, 0 to 105782
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   commentId       105778 non-null  int64 
 1   comment         105778 non-null  object
 2   classification  105778 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.2+ MB


In [5]:
# Create copy of dataframe that can be used for further processing
main_df = df.copy()

# Create comment_processed column
main_df['comment_processed'] = main_df['comment']

main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,Moral of the story while the nurses are all gr...
1,169075,If you are thinking about improving your appea...,0,If you are thinking about improving your appea...
2,88567,but I felt that my concerns were brushed aside...,1,but I felt that my concerns were brushed aside...
3,147104,My tear trough filler in my left eye looked li...,1,My tear trough filler in my left eye looked li...
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,"So, thank you Dr. Whitaker for all you have do..."


# Remove special characters and numbers

In [6]:
# Import library
import re # for regular expressions

# Remove punctuation, special characters, and numbers
main_df['comment_processed'] = main_df['comment_processed'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', str(x)))

# Change all to lower case

In [7]:
# Create comment_processed column with lower case comments
main_df['comment_processed'] = main_df['comment_processed'].str.lower()

# Remove proper nouns

In [8]:
# Import library
import spacy

# Load English language model with named entity recognition (NER) component
nlp = spacy.load("en_core_web_sm")

# Function to remove proper nouns from text
def remove_proper_nouns(text):
    doc = nlp(text)
    filtered_tokens = [token.text for token in doc if token.ent_type_ == ""]
    return ' '.join(filtered_tokens)

In [9]:
# Apply the remove_proper_nouns function to the comment_processed column
main_df['comment_processed'] = main_df['comment_processed'].apply(remove_proper_nouns)

In [10]:
# Remove rows with empty comment_processed column
main_df = main_df[main_df['comment_processed'].notnull() & (main_df['comment_processed'] != '')]

main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral of the story while the nurses are all gr...
1,169075,If you are thinking about improving your appea...,0,if you are thinking about improving your appea...
2,88567,but I felt that my concerns were brushed aside...,1,but i felt that my concerns were brushed aside...
3,147104,My tear trough filler in my left eye looked li...,1,my tear trough filler in my left eye looked li...
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,so thank you for all you have done and how enc...


In [11]:
# Store file with removed proper nouns
main_df.to_csv('./data/checkpoint_01_no_proper.csv', index=False)

# Text pre-processing (complex)

In [12]:
# Import library
import pandas as pd

# read checkpoint file
main_df = pd.read_csv('./data/checkpoint_01_no_proper.csv', low_memory=False)

# Spelling correction

In [13]:
# Import library
from textblob import TextBlob

# Function to correct spelling mistakes in a text
def correct_spelling(text):
    blob = TextBlob(text)
    corrected_text = blob.correct()
    return str(corrected_text)

In [14]:
# Apply the correct_spelling function to the comment_processed column
main_df['comment_processed'] = main_df['comment_processed'].apply(correct_spelling)

# Output the DataFrame with corrected spelling
main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral of the story while the nurses are all gr...
1,169075,If you are thinking about improving your appea...,0,if you are thinking about improving your appea...
2,88567,but I felt that my concerns were brushed aside...,1,but i felt that my concerns were brushed aside...
3,147104,My tear trough filler in my left eye looked li...,1,my tear trough filler in my left eye looked li...
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,so thank you for all you have done and how enc...


In [15]:
# Remove rows with empty comment_processed column
main_df = main_df[main_df['comment_processed'].notnull() & (main_df['comment_processed'] != '')]

In [16]:
# Store file with corrected spelling for data checkpoint purposes
main_df.to_csv('./data/checkpoint_02_spell_corrected.csv', index=False)

# Removing stop words (caution "no", "not" and other relevant negation words should not be removed)

In [17]:
# Import library
import pandas as pd

# read checkpoint file
main_df = pd.read_csv('./data/checkpoint_02_spell_corrected.csv', low_memory=False)

In [18]:
import spacy

# Load English model from spaCy
nlp = spacy.load("en_core_web_sm")

# Get the list of English stop words
stop_words = nlp.Defaults.stop_words

# Convert the stop words into a DataFrame
stop_words_df = pd.DataFrame({"Stop_Words": list(stop_words)})

# Save the DataFrame to a CSV file
stop_words_df.to_csv("./data/spacy_english_stop_words.csv", index=False)

In [19]:
# After processing the spacy stop word list create a custom stop word list and replace spacy stop words
custom_stop_words_df = pd.read_csv("./data/spacy_english_stop_words_processed.csv")
custom_stop_words = custom_stop_words_df['Stop_Words'].tolist()
nlp.Defaults.stop_words = custom_stop_words

In [20]:
# create a function to remove custom stop words
def remove_stop_words(text):

    # Process the text with spaCy
    doc = nlp(text)

    # Filter out the stop words
    filtered_text = " ".join(token.text for token in doc if not token.is_stop)

    return filtered_text

In [21]:
# Apply the stop word removal function to the comment_processed column
main_df['comment_processed'] = main_df['comment_processed'].apply(remove_stop_words)

In [22]:
# Remove rows with empty comment_processed column
main_df = main_df[main_df['comment_processed'].notnull() & (main_df['comment_processed'] != '')]

main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral story nurses great body money
1,169075,If you are thinking about improving your appea...,0,thinking improving appearance want competent
2,88567,but I felt that my concerns were brushed aside...,1,felt concerns brushed aside went collapse
3,147104,My tear trough filler in my left eye looked li...,1,tear trough filler left eye looked like garage
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,thank encouraging way


In [23]:
# Store file with corrected spelling for data checkpoint purposes
main_df.to_csv('./data/checkpoint_03_no_stop_words.csv', index=False)

# Lemmatization

In [24]:
# Import library
import pandas as pd

# read checkpoint file
main_df = pd.read_csv('./data/checkpoint_03_no_stop_words.csv', low_memory=False)

In [25]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize a single comment
def lemmatize_comment(comment):
    if comment == '' or comment is None:
        return ''  # Return an empty string if it's NaN
    
    tokens = word_tokenize(comment)  # Tokenize the comment into words
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatize each word
    return ' '.join(lemmatized_tokens)  # Join the lemmatized tokens back into a comment

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vladc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vladc\AppData\Roaming\nltk_data...


In [26]:
# Apply the lemmatize function to the comment_processed column
main_df['comment_processed'] = main_df['comment_processed'].apply(lemmatize_comment)

In [27]:
# Remove rows with empty comment_processed column
main_df = main_df[main_df['comment_processed'].notnull() & (main_df['comment_processed'] != '')]

main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral story nurse great body money
1,169075,If you are thinking about improving your appea...,0,thinking improving appearance want competent
2,88567,but I felt that my concerns were brushed aside...,1,felt concern brushed aside went collapse
3,147104,My tear trough filler in my left eye looked li...,1,tear trough filler left eye looked like garage
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,thank encouraging way


In [28]:
# Store file with corrected spelling for data checkpoint purposes
main_df.to_csv('./data/checkpoint_04_lemmatized.csv', index=False)

# Convert synonyms

In [43]:
# Import library
import pandas as pd

# read checkpoint file
main_df = pd.read_csv('./data/checkpoint_04_lemmatized.csv', low_memory=False)

In [48]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Adding this line to download the required "omw-1.4" resource

# Function to find the most common synonym for a word using WordNet
def most_common_synonym(word):
    synsets = wordnet.synsets(word)  # Get all synsets for the word
    if synsets:
        all_synonyms = [syn.lemmas() for syn in synsets]  # Get all lemmas for each synset
        all_synonyms = [lemma.name().replace('_', ' ') for syn in all_synonyms for lemma in syn]  # Flatten the list of lemmas, replacing underscores with spaces
        synonym_counts = {synonym: all_synonyms.count(synonym) for synonym in all_synonyms}  # Count occurrences of each synonym
        most_common_synonym = max(synonym_counts, key=synonym_counts.get)  # Get the synonym with the highest count
        return most_common_synonym.lower()
    else:
        return word  # If no synsets found, return the original word

# Function to replace each word in a comment with its most common synonym
def replace_with_synonyms(comment):
    tokens = word_tokenize(comment)  # Tokenize the comment into words
    replaced_tokens = [most_common_synonym(token) for token in tokens]  # Replace each word with its most common synonym
    return ' '.join(replaced_tokens)  # Join the replaced tokens back into a comment

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vladc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vladc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vladc\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [49]:
# Apply the synonym function to the comment_processed column
main_df['comment_processed'] = main_df['comment_processed'].apply(replace_with_synonyms)

In [50]:
# Remove rows with empty comment_processed column
main_df = main_df[main_df['comment_processed'].notnull() & (main_df['comment_processed'] != '')]

main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral story nurse great body money
1,169075,If you are thinking about improving your appea...,0,think better appearance want competent
2,88567,but I felt that my concerns were brushed aside...,1,feel concern brush aside go collapse
3,147104,My tear trough filler in my left eye looked li...,1,tear trough filler leave eye look like garage
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,thank encourage way


In [51]:
# Store file with corrected spelling for data checkpoint purposes
main_df.to_csv('./data/checkpoint_05_synonyms.csv', index=False)

# Create word or phrase list

In [1]:
# Import library
import pandas as pd

# read checkpoint file
main_df = pd.read_csv('./data/checkpoint_05_synonyms.csv', low_memory=False)

In [2]:
# Function to generate word phrase list from text
def generate_word_phrase_list(text):
    text_list = text.split()
    end_pos = len(text_list)
    word_phrase_list = []
    
    for i in range(end_pos):
        for j in range(i, end_pos):
            words = text_list[i:j+1]
            phrase = ' '.join(words)
            word_phrase_list.append(phrase)
    
    return word_phrase_list

In [3]:
# Remove missing values for comment_processed
main_df = main_df.dropna(subset=['comment_processed'])

# Apply the generate_word_phrase_list function to the comment_processed column to create word_phrase_list column
main_df['word_phrase_list'] = main_df['comment_processed'].apply(generate_word_phrase_list)

# Output the DataFrame with word phrase list
main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed,word_phrase_list
0,129687,Moral of the story while the nurses are all gr...,0,moral story nurse great body money,"[moral, moral story, moral story nurse, moral ..."
1,169075,If you are thinking about improving your appea...,0,think better appearance want competent,"[think, think better, think better appearance,..."
2,88567,but I felt that my concerns were brushed aside...,1,feel concern brush aside go collapse,"[feel, feel concern, feel concern brush, feel ..."
3,147104,My tear trough filler in my left eye looked li...,1,tear trough filler leave eye look like garage,"[tear, tear trough, tear trough filler, tear t..."
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,thank encourage way,"[thank, thank encourage, thank encourage way, ..."


In [160]:
# Store file with corrected spelling for data checkpoint purposes
main_df.to_csv('./data/checkpoint_06_final_training_file.csv', index=False)

# Final cleanups

In [4]:
# drop invalid comments from test selection
# List of commentIds to drop
commentIds_to_drop = [181673, 95658]

# Drop rows with specified commentIds
selection_df = main_df[~main_df['commentId'].isin(commentIds_to_drop)]

# Check if each value in the 'comment_processed' column contains only one word
is_single_word = selection_df['comment_processed'].str.split().apply(len) == 1

# Keep rows where the comment_processed column has more than one word
selection_df = selection_df[~is_single_word]

## Modeling tasks

### Create test and training sets and calculate similarity scores

In [5]:
# Filter rows where classification is 1
complaints_df = selection_df[selection_df['classification'] == 1].sample(n=35, random_state=42)

# Filter rows where classification is 0
praises_df = selection_df[selection_df['classification'] == 0].sample(n=35, random_state=42)

# Concatenate both dataframes
test_df = pd.concat([complaints_df, praises_df])

# Reset index of the resulting dataframe
test_df = test_df.reset_index(drop=True)

test_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed,word_phrase_list
0,126376,I have been in an incredible amount of pain wh...,1,incredible pain ignore doctor,"[incredible, incredible pain, incredible pain ..."
1,188698,un-returned phone call post surgery after a co...,1,return phone post surgery concern report,"[return, return phone, return phone post, retu..."
2,182819,and I feel like the bodytite was not worth it ...,1,feel like bodytite worth mind kiss dramaticall...,"[feel, feel like, feel like bodytite, feel lik..."
3,155926,Two years later the improvements are nowhere t...,1,improvement see,"[improvement, improvement see, see]"
4,116285,My skin on the side of my face is still numb.,1,skin face numb,"[skin, skin face, skin face numb, face, face n..."


In [6]:
# Function to calculate similarity score
def calculate_similarity_score(row, alpha, word_phrase_list):
    n_match = row[word_phrase_list].sum(axis=1) # number of matches
    n_target_only = len(word_phrase_list) - n_match # number unmatched in target
    n_train_only = len(row['word_phrase_list']) - n_match # number unmatched in training

    similarity_score = (n_match / (n_match + (alpha * n_target_only) + ((1-alpha) * n_train_only)))

    return similarity_score

In [7]:
# Set alpha for similarity score
alpha = 0.8 # may need to change for sensitivity testing

In [8]:
# Import libraries
import os
import warnings

# Suppress the PerformanceWarning
warnings.filterwarnings("ignore", message="DataFrame is highly fragmented", category=pd.errors.PerformanceWarning)

# Create a folder for training files if it doesn't exist
if not os.path.exists('./training_files'):
    os.makedirs('./training_files')

# Loop through each row in test_df
for i in range(len(test_df)):
    test_row = test_df.iloc[[i]] # get current row in dataframe format
    
    # Create a copy of main_df
    train_df = main_df.copy()

    # Remove the row with the same commentId as the current row in test_df
    test_commentId = int(test_row['commentId'])
    train_df = train_df[train_df['commentId'] != test_commentId]

    # Get word_phrase_list of current row in test_df and create columns in train_df
    word_phrase_list = test_row['word_phrase_list'][i]

    for word_phrase in word_phrase_list:
        train_df[word_phrase] = 0 # init to 0

    # Set the word_phrase column to 1 if it exists in the training set row (exact match)
    for word_phrase in word_phrase_list:
        train_df[word_phrase] = train_df['word_phrase_list'].apply(lambda x: 1 if word_phrase in x else 0)
        
    # Remove rows where the sum of the columns created from word_phrase_list is 0
    train_df = train_df[train_df[word_phrase_list].sum(axis=1) != 0]

    # Calculate similarity scores
    train_df['similarity_score'] = 0.0 # initialize
    
    j = 0
    for index, row in train_df.iterrows():
        train_row = train_df.iloc[[j]] # get current row in dataframe format
        train_df.at[index, 'similarity_score'] = calculate_similarity_score(train_row, alpha, word_phrase_list)
        j += 1
    
    # Get columns present in the word_phrase_list
    word_phrase_columns = train_df.columns[train_df.columns.isin(word_phrase_list)]
    
    # Get all columns that are all ones or zeros within word_phrase_columns
    columns_to_drop = word_phrase_columns[(train_df[word_phrase_columns].sum(axis=0) == len(train_df)) | (train_df[word_phrase_columns].sum(axis=0) == 0)]
    
    # Drop these columns with all ones or zeros (zero variance)
    train_df = train_df.drop(columns=columns_to_drop)
       
    # Create filename
    filename = f"./training_files/{test_commentId}.csv"

    # Write dataframe to CSV
    train_df.to_csv(filename, index=False)

In [9]:
# Store file with corrected spelling for data checkpoint purposes
test_df.to_csv('./data/test_df_initial_alpha80.csv', index=False)

### Logistic Regression with Similarity Scores as Weight

In [10]:
# Import library
import numpy as np

# Function to calculate probability of complaint
def calculate_probability(intercept, coefficients):
    sumcoeff = intercept + np.sum(coefficients)
    return 1 / (1 + np.exp(-sumcoeff))

In [11]:
# Import library
from sklearn.linear_model import LogisticRegression

# Loop each row in test_df and calculate prediction, TP, TN, FP, FN
i = 0
cutoff = 0.5 # probability cutoff for prediction
for index, row in test_df.iterrows():
    # get commentId
    comment_id = row['commentId']

    # open training file for test row
    filename = f"./training_files/{comment_id}.csv"
    train_df = pd.read_csv(filename, low_memory=False)

    # set target variable
    y = train_df['classification']

    # set independent variables
    cols_to_drop = ['classification',
                    'commentId',
                    'comment',
                    'comment_processed',
                    'word_phrase_list',
                    'similarity_score']
    X = train_df.drop(columns=cols_to_drop)

    # set weight to similarity score
    sample_weights = train_df['similarity_score']

    # Do logistic regression modeling with similarity_score as weight
    log_reg = LogisticRegression()
    log_reg.fit(X, y, sample_weight = sample_weights)
    
    # create a column in the test_df for the predicted probability 
    test_df.at[index, 'probability'] = calculate_probability(log_reg.intercept_[0], log_reg.coef_[0]) 

    # calculate prediction from probability using cutoff
    test_df.at[index, 'prediction'] = 1 if test_df.at[index, 'probability'] >= cutoff else 0

    # calculate calibration
    test_df.at[index, 'calibration'] = abs(test_df.at[index, 'probability'] - test_df.at[index, 'classification'])
    
    # Calculate TP, TN, FP, FN
    test_df.at[index, 'TP'] = 1 if (test_df.at[index, 'prediction'] == 1 and test_df.at[index, 'classification'] == 1) else 0
    test_df.at[index, 'TN'] = 1 if (test_df.at[index, 'prediction'] == 0 and test_df.at[index, 'classification'] == 0) else 0
    test_df.at[index, 'FP'] = 1 if (test_df.at[index, 'prediction'] == 1 and test_df.at[index, 'classification'] == 0) else 0
    test_df.at[index, 'FN'] = 1 if (test_df.at[index, 'prediction'] == 0 and test_df.at[index, 'classification'] == 1) else 0

    i += 1

In [12]:
# Store file with corrected spelling for data checkpoint purposes
test_df.to_csv('./data/test_df_completed_alpha80.csv', index=False)

### Calculate Metrics

In [13]:
# For entire test_df calculate accuracy
TP = test_df['TP'].sum()
TN = test_df['TN'].sum()
FP = test_df['FP'].sum()
FN = test_df['FN'].sum()
accuracy = round((TP + TN) / (TP + TN + FP + FN) * 100, 2)

# Calculate average calibration and its standard deviation
average_calibration = round(test_df['calibration'].mean(),2)
std_dev_calibration = round(test_df['calibration'].std(),2)


print(f"Model accuracy: {accuracy}%")
print(f"Average of calibration: {average_calibration}")
print(f"Standard deviation of calibration: {std_dev_calibration}")

Model accuracy: 72.86%
Average of calibration: 0.33
Standard deviation of calibration: 0.23


# ChatGPT Predictions