**Baseline: Simple Bag of Words Implementation**
This baseline combines a bag-of-words (BoW) representation with a Naive Bayes classifier. Each review is turned into a vector of word counts, and Naive Bayes uses how often each word appeared under each label in the training data to estimate how likely every label is for a new review. The label with the highest estimated probability becomes the prediction.

In [42]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score

In [43]:
# Load both review datasets into pandas DataFrames.
# Forward slashes keep these relative paths valid on Windows, macOS and Linux
# (a backslash is only a path separator on Windows).

tinder = pd.read_csv('tinderdata/tinder_google_play_reviews.csv')

amazon = pd.read_csv('amazondata/Reviews.csv')

# Drop the listed user/metadata columns from each frame.
tinder_df = tinder.drop(columns=[
    'userName', 'userImage', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion'
])
amazon_df = amazon.drop(columns=[
    'ProductId', 'UserId', 'ProfileName', 'Time'
])

# Satisfaction classes, plus lookups between class name and integer id.
classes = ["Not Satisfied", "Somewhat Satisfied", "Satisfied"]
labels_class = [0, 1, 2]
# Comprehension variables are named so they no longer shadow the builtin `id`.
class2id = {class_name: class_id for class_id, class_name in enumerate(classes)}
id2class = {class_id: class_name for class_name, class_id in class2id.items()}


**Quick Preprocessing (Same as DeBERTa)**

In [44]:

import spacy
import re

# Load spaCy's small English pipeline once; the helpers below read its lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm")

def stop_words_and_lemmatize_texts(texts):
    """Lemmatize each text and drop punctuation, whitespace and stop-word tokens.

    Args:
        texts: Iterable of strings, streamed through the module-level ``nlp``
            pipeline in batches of 1000.

    Returns:
        list[str]: One string per input text, in input order, holding the
        surviving lemmas joined by single spaces.
    """
    # Only lemmas and lexical flags are read here, so the dependency parser and
    # the named-entity recognizer are skipped for this call to save time.
    docs = nlp.pipe(texts, batch_size=1000, disable=["parser", "ner"])
    return [
        " ".join(
            token.lemma_
            for token in doc
            if not (token.is_punct or token.is_space or token.is_stop)
        )
        for doc in docs
    ]

def clean_text(text):
    """Normalize one raw review into lowercase text with basic punctuation only.

    Args:
        text: The raw review. Missing values (``None``/NaN) are accepted, and
            any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is missing.
    """
    if pd.isnull(text):
        return ""

    text = str(text).lower()  # Coerce first so numeric cells do not crash on .lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    # Replace tags with a space rather than nothing, so "great<br />taste"
    # stays two words instead of collapsing into "greattaste".
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-z0-9\s.,!?]', '', text)  # Remove special chars (keep basic punctuation)
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    return text.strip()

def clean_dataframe(df, text_column):
    """Return a copy of ``df`` whose ``text_column`` is cleaned and lemmatized.

    Args:
        df: Input DataFrame; it is not modified.
        text_column: Name of the column holding the raw text.

    Returns:
        pandas.DataFrame: A copy of ``df`` in which ``text_column`` has been
        passed through ``clean_text`` and then
        ``stop_words_and_lemmatize_texts``.
    """
    cleaned = df.copy()  # work on a copy so the caller's frame is untouched

    # Map missing cells to "" BEFORE the string cast: str(NaN) is the literal
    # word "nan", which would otherwise enter the vocabulary as a real token.
    raw_text = cleaned[text_column].map(
        lambda value: "" if pd.isnull(value) else str(value)
    )
    cleaned[text_column] = raw_text.apply(clean_text)  # Clean with regex
    cleaned[text_column] = stop_words_and_lemmatize_texts(cleaned[text_column])  # Lemmatize & remove stopwords
    return cleaned

print(amazon_df.shape)

#===================================================
# 1. Drop duplicate reviews, keyed on the review text column
a_df_unique = amazon_df.drop_duplicates(subset='Text')
t_df_unique = tinder_df.drop_duplicates(subset='content')

#===================================================
# 2. Keep only reviews that other users rated as helpful.

# Amazon: helpful votes must make up at least half of all votes cast ...
a_df_filtered = a_df_unique[a_df_unique['HelpfulnessDenominator'] <= (a_df_unique['HelpfulnessNumerator']*2)]
# ... and the helpful-vote count must be non-zero.
a_df_filtered_2 = a_df_filtered[a_df_filtered['HelpfulnessNumerator'] != 0]

# Tinder: the thumbs-up count must be non-zero.
t_df_filtered = t_df_unique[t_df_unique['thumbsUpCount'] != 0]

# Would cut 560,000 to 170,000
#===================================================

# Extra: Miniature version of dataset to test smaller chunks of data in the interest of time and my computer's health
percent_used = 0.001
mini_amazon_df = a_df_filtered_2.sample(frac=percent_used, random_state=42)
mini_tinder_df = t_df_filtered.sample(frac=percent_used, random_state=42)
#===================================================

# 3. Standardize all text (lemmatize and drop stopwords)
a_df_cleaned = clean_dataframe(mini_amazon_df, "Text")
t_df_cleaned = clean_dataframe(mini_tinder_df, "content")

#===================================================

# 4. Bin into classes using pd.cut, converting previous scores of 1-5 to a
#    class 0, 1, or 2 (not satisfied, somewhat satisfied, satisfied)

def _label_satisfaction(frame, score_column):
    """Add 'Satisfaction' (class name) and 'label' (int id) columns to a copy of ``frame``.

    Scores in [0, 3] map to classes[0], (3, 4] to classes[1] and (4, 5] to
    classes[2]. Rows whose score falls outside the bins, or is missing, get no
    label and are dropped.
    """
    labelled = frame.copy()
    satisfaction = pd.cut(
        labelled[score_column], bins=[0, 3, 4, 5],
        labels=classes, right=True, include_lowest=True
    )
    # Map satisfaction text to int labels; unbinned rows become "nan" -> NaN here.
    labelled['Satisfaction'] = satisfaction.astype(str)
    labelled['label'] = labelled['Satisfaction'].map(class2id)
    # Drop unlabeled rows BEFORE the integer cast: casting a column that still
    # contains NaN to int raises, so the cast must come after the drop.
    return (
        labelled
        .dropna(subset=['label'])
        .assign(label=lambda kept: kept['label'].astype(int))
    )

a_df_cleaned = _label_satisfaction(a_df_cleaned, 'Score')
t_df_cleaned = _label_satisfaction(t_df_cleaned, 'score')

(568454, 6)


**Finish processing, make train and test, and run predictions**

In [45]:
# Stack the Amazon and Tinder reviews into a single corpus, then split it.
text_parts = [a_df_cleaned["Text"], t_df_cleaned["content"]]
label_parts = [a_df_cleaned["label"], t_df_cleaned["label"]]

combined_text = pd.concat(text_parts, ignore_index=True)
combined_labels = pd.concat(label_parts, ignore_index=True)

df = pd.DataFrame({"text": combined_text, "label": combined_labels})

# Plain arrays of documents and their integer labels.
examples, Y_true = df["text"].values, df["label"].values

# Hold out 5% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    examples,
    Y_true,
    test_size=0.05,
    random_state=42,
)

In [46]:

#Bag of Words: the vocabulary is fitted on the training split only and then
# reused, unchanged, to encode the test split.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

#NaiveBayes Classifier
nb_clf = MultinomialNB()
nb_clf.fit(X_train_bow, y_train)

#Predict and Score
y_pred = nb_clf.predict(X_test_bow)

# When a class is never predicted (or never present) its per-class score is
# undefined. zero_division=0 scores it as 0, the same value scikit-learn falls
# back to by default, but without emitting UndefinedMetricWarning.
f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)

precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)

precision_micro = precision_score(y_test, y_pred, average='micro', zero_division=0)
recall_micro = recall_score(y_test, y_pred, average='micro', zero_division=0)

print("RESULTS LOG:")

print(f"F1: {f1_weighted}")  # weighted-average F1
print(f"MACROS: Precision: {precision_macro}  Recall: {recall_macro} ")
print(f"MICROS: Precision: {precision_micro}  Recall: {recall_micro} ")

RESULTS LOG:
F1: 0.6593406593406594
MACROS: Precision: 0.4761904761904762  Recall: 0.5555555555555556 
MICROS: Precision: 0.7142857142857143  Recall: 0.7142857142857143 


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
