# Create word or phrase list

In [1]:
# Import library
import pandas as pd

# Load the checkpoint CSV into main_df; low_memory=False makes pandas read the
# file in one pass instead of in chunks
main_df = pd.read_csv(
    filepath_or_buffer='./data/checkpoint_05_synonyms.csv',
    low_memory=False,
)

In [2]:
# Function to generate word phrase list from text
def generate_word_phrase_list(text):
    """Return every contiguous word sequence found in ``text``.

    The text is split on whitespace. Phrases are listed grouped by starting
    word (left to right) and, within each group, from shortest to longest.
    A text without any words yields an empty list.

    Parameters
    ----------
    text : str
        Whitespace-separated words.

    Returns
    -------
    list of str
        All single words and multi-word phrases, each joined by one space.
    """
    tokens = text.split()
    n_tokens = len(tokens)

    # `stop` is exclusive, so it runs one past the last word of the phrase
    return [
        ' '.join(tokens[start:stop])
        for start in range(n_tokens)
        for stop in range(start + 1, n_tokens + 1)
    ]

In [3]:
# Discard rows whose comment_processed is missing, then attach the list of all
# contiguous word phrases of comment_processed as the word_phrase_list column
main_df = main_df.dropna(subset=['comment_processed']).assign(
    word_phrase_list=lambda frame: frame['comment_processed'].apply(generate_word_phrase_list)
)

# Preview the first rows, including the new word_phrase_list column
main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed,word_phrase_list
0,129687,Moral of the story while the nurses are all gr...,0,moral story nurse great body money,"[moral, moral story, moral story nurse, moral ..."
1,169075,If you are thinking about improving your appea...,0,think better appearance want competent,"[think, think better, think better appearance,..."
2,88567,but I felt that my concerns were brushed aside...,1,feel concern brush aside go collapse,"[feel, feel concern, feel concern brush, feel ..."
3,147104,My tear trough filler in my left eye looked li...,1,tear trough filler leave eye look like garage,"[tear, tear trough, tear trough filler, tear t..."
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,thank encourage way,"[thank, thank encourage, thank encourage way, ..."


# Final cleanups

In [4]:
# Drop invalid comments from test selection
# List of commentIds to drop
commentIds_to_drop = [181673, 95658]

# Drop rows with specified commentIds
selection_df = main_df[~main_df['commentId'].isin(commentIds_to_drop)]

# Count the whitespace-separated words in each processed comment
word_counts = selection_df['comment_processed'].str.split().str.len()

# Flag comments that consist of exactly one word
is_single_word = word_counts == 1

# Keep rows where the comment_processed column has more than one word.
# Filtering on `> 1` (instead of "not exactly one") also removes
# whitespace-only comments: they contain zero words and would yield an empty
# phrase list, leaving nothing to build model features from.
selection_df = selection_df[word_counts > 1]

# Modeling tasks

## Create test and training sets and build phrase-match features

In [5]:
# Draw a reproducible, balanced sample from the selection: 35 rows with
# classification 1 and 35 rows with classification 0, using the same seed
is_complaint = selection_df['classification'].eq(1)
is_praise = selection_df['classification'].eq(0)

complaints_df = selection_df.loc[is_complaint].sample(n=35, random_state=42)
praises_df = selection_df.loc[is_praise].sample(n=35, random_state=42)

# Stack the classification-1 rows first, then the classification-0 rows, and
# renumber the combined rows from 0
test_df = pd.concat([complaints_df, praises_df], ignore_index=True)

test_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed,word_phrase_list
0,84833,The doctor talked to me with half his body in ...,1,doctor talk body examination room,"[doctor, doctor talk, doctor talk body, doctor..."
1,58035,And the registration person was not so nice.,1,registration person not nice,"[registration, registration person, registrati..."
2,190897,This is a pretend clinic they pretend to love ...,1,pretend clinic pretend love look fund,"[pretend, pretend clinic, pretend clinic prete..."
3,173845,"When I complained about it, I was asked to sen...",1,complain ask send picture,"[complain, complain ask, complain ask send, co..."
4,168214,Acts like a history teacher when really he's a...,1,act like history teacher dumb doctor keep ask ...,"[act, act like, act like history, act like his..."


In [7]:
# Import libraries
import os
import warnings

# Suppress the PerformanceWarning
warnings.filterwarnings("ignore", message="DataFrame is highly fragmented", category=pd.errors.PerformanceWarning)

# Create the folder the training files are written to. The files go into the
# 'unweighted' subfolder, so the full path has to exist, not only its parent.
training_dir = './training_files/unweighted'
os.makedirs(training_dir, exist_ok=True)

# Existing column names. A phrase with the same name (for example a comment
# containing the word "classification") must not overwrite that column.
reserved_columns = set(main_df.columns)

# Build one training file per row of test_df
for i in range(len(test_df)):
    test_row = test_df.iloc[[i]] # get current row in dataframe format

    # Read the scalar values by position: calling int() on a one-element
    # Series is deprecated, and a label lookup would depend on the index
    test_commentId = int(test_row['commentId'].iloc[0])
    word_phrase_list = test_row['word_phrase_list'].iloc[0]

    # Training set: every comment except the one currently being tested
    train_df = main_df[main_df['commentId'] != test_commentId].copy()

    # Unique phrases of the test comment in order of first appearance,
    # leaving out any phrase that collides with an existing column name
    phrase_columns = [
        phrase for phrase in dict.fromkeys(word_phrase_list)
        if phrase not in reserved_columns
    ]

    # One 0/1 column per phrase: 1 if the phrase is an exact member of the
    # training row's word_phrase_list. All columns are built first and attached
    # in a single concat to avoid fragmenting the frame column by column.
    indicators = pd.DataFrame(
        {
            phrase: train_df['word_phrase_list'].apply(
                lambda phrases, phrase=phrase: 1 if phrase in phrases else 0
            )
            for phrase in phrase_columns
        },
        index=train_df.index,
    )
    train_df = pd.concat([train_df, indicators], axis=1)

    # Remove training rows that share no phrase with the test comment
    train_df = train_df[train_df[phrase_columns].sum(axis=1) != 0]

    # Drop phrase columns that are all ones or all zeros (zero variance)
    phrase_sums = train_df[phrase_columns].sum(axis=0)
    columns_to_drop = phrase_sums.index[(phrase_sums == len(train_df)) | (phrase_sums == 0)]
    train_df = train_df.drop(columns=columns_to_drop)

    # Create filename
    filename = f"{training_dir}/{test_commentId}.csv"

    # Write dataframe to CSV
    train_df.to_csv(filename, index=False)

In [8]:
# Save the sampled test set as a data checkpoint
test_df.to_csv('./data/test_df_initial_unweighted.csv', index=False)

## Logistic Regression (Unweighted)

In [9]:
# Import library
import numpy as np

# Function to calculate probability of complaint
def calculate_probability(intercept, coefficients):
    """Apply the logistic function to the intercept plus the sum of all coefficients.

    Summing every coefficient corresponds to evaluating the linear model on a
    feature vector in which every feature equals 1.

    Parameters
    ----------
    intercept : float
        Intercept of the fitted model.
    coefficients : array-like of float
        Coefficients of the fitted model.

    Returns
    -------
    float
        A value between 0 and 1.
    """
    linear_term = intercept + np.sum(coefficients)
    decay = np.exp(-linear_term)
    return 1 / (1 + decay)

In [10]:
# Import library
from sklearn.linear_model import LogisticRegression

# For each row in test_df: fit a logistic regression on that row's training
# file, then store the predicted probability, the prediction, the calibration
# error and the TP, TN, FP, FN indicators in test_df
cutoff = 0.5 # probability cutoff for prediction

# Columns of the training file that are not model features
cols_to_drop = ['classification',
                'commentId',
                'comment',
                'comment_processed',
                'word_phrase_list'
                ]

for index, row in test_df.iterrows():
    # get commentId
    comment_id = row['commentId']

    # actual label of the test row
    actual = test_df.at[index, 'classification']

    # open training file for test row
    filename = f"./training_files/unweighted/{comment_id}.csv"
    train_df = pd.read_csv(filename, low_memory=False)

    # set target variable
    y = train_df['classification']

    # set independent variables: every column that is not metadata
    X = train_df.drop(columns=cols_to_drop)

    # Fit the logistic regression; no sample weights are passed, so every
    # training row counts equally
    log_reg = LogisticRegression()
    log_reg.fit(X, y)

    # Predicted probability for the test row. calculate_probability adds up all
    # coefficients, i.e. it scores a row in which every feature equals 1.
    # NOTE(review): this assumes every feature column is a phrase indicator
    # taken from this test comment - confirm against how the file was built.
    probability = calculate_probability(log_reg.intercept_[0], log_reg.coef_[0])

    # calculate prediction from probability using cutoff
    prediction = 1 if probability >= cutoff else 0

    test_df.at[index, 'probability'] = probability
    test_df.at[index, 'prediction'] = prediction

    # calculate calibration: absolute gap between probability and actual label
    test_df.at[index, 'calibration'] = abs(probability - actual)

    # Calculate TP, TN, FP, FN
    test_df.at[index, 'TP'] = 1 if (prediction == 1 and actual == 1) else 0
    test_df.at[index, 'TN'] = 1 if (prediction == 0 and actual == 0) else 0
    test_df.at[index, 'FP'] = 1 if (prediction == 1 and actual == 0) else 0
    test_df.at[index, 'FN'] = 1 if (prediction == 0 and actual == 1) else 0

In [11]:
# Save the test set as a data checkpoint
test_df.to_csv('./data/test_df_completed_unweighted.csv', index=False)

# Calculate Metrics

In [12]:
# Total the per-row confusion-matrix indicators over the whole test_df
TP, TN, FP, FN = (test_df[name].sum() for name in ('TP', 'TN', 'FP', 'FN'))

# Accuracy: share of correct predictions, as a percentage with two decimals
n_correct = TP + TN
n_total = n_correct + FP + FN
accuracy = round(n_correct / n_total * 100, 2)

# Mean and standard deviation of the calibration column, two decimals each
calibration = test_df['calibration']
average_calibration = round(calibration.mean(), 2)
std_dev_calibration = round(calibration.std(), 2)

# Print the report line by line
report_lines = [
    "USING REGRESSION PREDICTION",
    "CONFUSION MATRIX:",
    f"TP: {TP} | FP: {FP}",
    f"FN: {FN} | TN: {TN}",
    f"\n\nModel accuracy: {accuracy}%",
    f"Average of calibration: {average_calibration}",
    f"Standard deviation of calibration: {std_dev_calibration}",
]
for line in report_lines:
    print(line)

USING REGRESSION PREDICTION
CONFUSION MATRIX:
TP: 28.0 | FP: 3.0
FN: 7.0 | TN: 32.0


Model accuracy: 85.71%
Average of calibration: 0.22
Standard deviation of calibration: 0.24
