# HAP 789 Sentiment Analysis Project

## Load data and initial data prep

In [1]:
# Import library
import pandas as pd

# Read the labelled training comments into a DataFrame
df = pd.read_csv(filepath_or_buffer='./data/TrainingRecords-4-4-2024.csv')

# Preview the first rows
df.head()

Unnamed: 0,commentId,comment,classification,dateCreated
0,129687,Moral of the story while the nurses are all gr...,0,2019-06-03 18:15:21.263
1,169075,If you are thinking about improving your appea...,0,2022-04-30 21:10:15.950
2,88567,but I felt that my concerns were brushed aside...,1,2015-10-12 17:05:36.043
3,147104,My tear trough filler in my left eye looked li...,1,2020-07-12 17:13:43.700
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,2019-10-09 02:31:02.590


In [2]:
# Discard the dateCreated column, then keep only the first row seen for each
# commentId and renumber the index from zero.
df = (
    df.drop(columns=['dateCreated'])
      .drop_duplicates(subset='commentId', keep='first')
      .reset_index(drop=True)
)

df.head()

Unnamed: 0,commentId,comment,classification
0,129687,Moral of the story while the nurses are all gr...,0
1,169075,If you are thinking about improving your appea...,0
2,88567,but I felt that my concerns were brushed aside...,1
3,147104,My tear trough filler in my left eye looked li...,1
4,137347,"So, thank you Dr. Whitaker for all you have do...",0


In [8]:
# Keep only the rows that actually have comment text
df = df.loc[df['comment'].notna()]

# commentIds of the comments flagged as invalid
commentIds_to_drop = [180459, 151656, 179845, 179923]

# Filter the flagged commentIds out of the frame
df = df.loc[~df['commentId'].isin(commentIds_to_drop)]

df.head()

Unnamed: 0,commentId,comment,classification
0,129687,Moral of the story while the nurses are all gr...,0
1,169075,If you are thinking about improving your appea...,0
2,88567,but I felt that my concerns were brushed aside...,1
3,147104,My tear trough filler in my left eye looked li...,1
4,137347,"So, thank you Dr. Whitaker for all you have do...",0


In [9]:
# Print the index range, column dtypes, and non-null counts of the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105778 entries, 0 to 105782
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   commentId       105778 non-null  int64 
 1   comment         105778 non-null  object
 2   classification  105778 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.2+ MB


## Text pre-processing (simple)

In [10]:
# Work on an independent copy so the cleaned `df` is left untouched
main_df = df.copy()

# Store a lower-cased version of each comment in a new comment_processed column
lowered_comments = main_df['comment'].str.lower()
main_df['comment_processed'] = lowered_comments

main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral of the story while the nurses are all gr...
1,169075,If you are thinking about improving your appea...,0,if you are thinking about improving your appea...
2,88567,but I felt that my concerns were brushed aside...,1,but i felt that my concerns were brushed aside...
3,147104,My tear trough filler in my left eye looked li...,1,my tear trough filler in my left eye looked li...
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,"so, thank you dr. whitaker for all you have do..."


In [11]:
# Import library
import re # for regular expressions

# Strip every character that is not an ASCII letter or whitespace
# (punctuation, digits, and other symbols are deleted, not replaced by a space)
main_df['comment_processed'] = main_df['comment_processed'].map(
    lambda raw: re.sub(r'[^a-zA-Z\s]', '', str(raw))
)

main_df.head(20)

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral of the story while the nurses are all gr...
1,169075,If you are thinking about improving your appea...,0,if you are thinking about improving your appea...
2,88567,but I felt that my concerns were brushed aside...,1,but i felt that my concerns were brushed aside...
3,147104,My tear trough filler in my left eye looked li...,1,my tear trough filler in my left eye looked li...
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,so thank you dr whitaker for all you have done...
5,101267,but he inly wants what's best for you as the p...,0,but he inly wants whats best for you as the pa...
6,83185,and I even have the hourglass figure I always ...,0,and i even have the hourglass figure i always ...
7,102121,Great.,0,great
8,146628,and and I came across Sadrian MD plastic surge...,0,and and i came across sadrian md plastic surge...
9,97081,and i trust her to make it better because she ...,0,and i trust her to make it better because she ...


## Text pre-processing (complex)

### Spelling correction

In [16]:
# Import library
from textblob import TextBlob

# Spell-correct a single piece of text with TextBlob
def correct_spelling(text):
    """Return `text` after TextBlob's spelling correction, converted to a plain str."""
    return str(TextBlob(text).correct())

# Run the correction over every row of the comment_processed column
main_df['comment_processed'] = main_df['comment_processed'].map(correct_spelling)

# Show the first rows of the spell-corrected DataFrame
main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral of the story while the nurses are all gr...
1,169075,If you are thinking about improving your appea...,0,if you are thinking about improving your appea...
2,88567,but I felt that my concerns were brushed aside...,1,but i felt that my concerns were brushed aside...
3,147104,My tear trough filler in my left eye looked li...,1,my tear trough filler in my left eye looked li...
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,so thank you dr whither for all you have done ...


In [17]:
# Data checkpoint: write the spell-corrected frame to CSV, leaving out the index
main_df.to_csv(path_or_buf='./data/checkpoint_01_spell_corrected.csv', index=False)

### Remove proper nouns

In [None]:
# read checkpoint file
# keep_default_na=False: rows with missing values were dropped before the
# checkpoint was written, so nothing in it is genuinely missing. With the
# default NA parsing, any comment_processed value that was reduced to '' by
# the earlier cleaning (or that literally reads "null"/"nan") would come back
# as a NaN float and break the string-processing steps that follow.
main_df = pd.read_csv(
    './data/checkpoint_01_spell_corrected.csv',
    low_memory=False,
    keep_default_na=False,
)

In [20]:
# Import library
import spacy

# Load English language model with named entity recognition (NER) component
nlp = spacy.load("en_core_web_sm")

# Function to remove proper nouns from text
def remove_proper_nouns(text):
    """Return `text` without the tokens spaCy marked as part of a named entity.

    The filter is based on each token's entity type (``token.ent_type_``), not
    on its part-of-speech tag: tokens with an empty entity type are kept and
    re-joined with single spaces.

    Parameters
    ----------
    text : str
        The comment text to filter. Non-string values (e.g. NaN produced when
        an empty comment is read back from a CSV checkpoint) yield ''.

    Returns
    -------
    str
        The remaining tokens joined by a single space.
    """
    # NOTE(review): the text has already been lower-cased upstream, which may
    # reduce how many names the NER model recognises -- confirm on a sample.
    if not isinstance(text, str):
        return ''
    doc = nlp(text)
    filtered_tokens = [token.text for token in doc if token.ent_type_ == ""]
    return ' '.join(filtered_tokens)

# Apply the remove_proper_nouns function to the comment_processed column.
# (This previously applied correct_spelling again, so remove_proper_nouns was
# never called and the spelling correction was run a second time.)
main_df['comment_processed'] = main_df['comment_processed'].apply(remove_proper_nouns)

# Output the DataFrame with no proper nouns
main_df.head()

Unnamed: 0,commentId,comment,classification,comment_processed
0,129687,Moral of the story while the nurses are all gr...,0,moral of the story while the nurses are all gr...
1,169075,If you are thinking about improving your appea...,0,if you are thinking about improving your appea...
2,88567,but I felt that my concerns were brushed aside...,1,but i felt that my concerns were brushed aside...
3,147104,My tear trough filler in my left eye looked li...,1,my tear trough filler in my left eye looked li...
4,137347,"So, thank you Dr. Whitaker for all you have do...",0,so thank you dr whither for all you have done ...


In [21]:
# Data checkpoint: write the frame produced by the proper-noun step to CSV, leaving out the index
main_df.to_csv(path_or_buf='./data/checkpoint_02_no_proper.csv', index=False)

### Removing stop words (caution: "not" and other relevant words should not be removed)

In [None]:
# read checkpoint file
# keep_default_na=False: rows with missing values were dropped before the
# checkpoint was written, so nothing in it is genuinely missing. With the
# default NA parsing, any comment_processed value that ended up as '' after
# the earlier text-processing steps (or that literally reads "null"/"nan")
# would come back as a NaN float and break the string-processing steps that
# follow.
main_df = pd.read_csv(
    './data/checkpoint_02_no_proper.csv',
    low_memory=False,
    keep_default_na=False,
)

In [None]:
# remove stop words

### Remove extra whitespaces

### Lemmatization

### Convert synonyms

### Create word or phrase list

## Modeling tasks

### Create test and training sets

### Calculate similarity scores

### Logistic Regression with Similarity Scores as Weight

### Prediction Accuracy