**SETTING UP BACKGROUND VARIABLES AND THE DATASETS!**

In [1]:
#!python -m spacy download en_core_web_sm
#^^ downloads an english dataset to help with dataset cleaning


In [2]:
import os

import sys
assert sys.version_info.major == 3 and sys.version_info.minor == 11

import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from transformers import (AutoTokenizer, Trainer, DataCollatorWithPadding, TrainingArguments, DebertaTokenizer, AutoModelForSequenceClassification)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score
import pandas as pd

import re
import spacy

In [3]:
model_path = 'microsoft/deberta-v3-small'

#load and drop everything except for the ID, the score given, and the text
tinder_df = pd.read_csv(r'tinderdata\tinder_google_play_reviews.csv')
amazon_df = pd.read_csv(r'amazondata\Reviews.csv')

# Drop unwanted columns
tinder_df = tinder_df.drop(columns=[
    'userName', 'userImage', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion'
])

amazon_df = amazon_df.drop(columns=[
    'ProductId', 'UserId', 'ProfileName', 'Time'
])

# Initializes classes and IDs for the mdoel
classes = ["Not Satisfied", "Somewhat Satisfied", "Satisfied"]
labels_class = [0, 1, 2]
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}




FileNotFoundError: [Errno 2] No such file or directory: 'tinderdata\\tinder_google_play_reviews.csv'

In [None]:
from cProfile import label

nlp = spacy.load("en_core_web_sm")# Load English model that defines lemmatization and stopwords

def stop_words_and_lemmatize_texts(texts):
    lemmatized_texts = []
    for doc in nlp.pipe(texts, batch_size=1000):
        filtered_lemmas = [
            token.lemma_ for token in doc
            if not token.is_punct and not token.is_space and not token.is_stop
        ]
        lemmatized_texts.append(" ".join(filtered_lemmas))
    return lemmatized_texts

# ChatGPT helped code it - but we do know how to do regex!
def clean_text(text):
    if pd.isnull(text):
        return ""

    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-z0-9\s.,!?]', '', text)  # Remove special chars (keep basic punctuation)
    text = re.sub(r'\s+', ' ', text)  # Remove excessive whitespace
    return text.strip()

def clean_dataframe(df, text_column):
    df = df.copy()  # make an explicit copy to avoid chained assignment issues
    df.loc[:, text_column] = df[text_column].astype(str).apply(clean_text)  # Clean with regex
    df.loc[:, text_column] = stop_words_and_lemmatize_texts(df[text_column])  # Lemmatize & remove stopwords
    return df




In [None]:
# Data Cleaning Block

#===================================================
#1) Drop duplicates

a_df_unique = amazon_df.drop_duplicates(subset='Text')
t_df_unique = tinder_df.drop_duplicates(subset='content')


#===================================================
#2 Drop 0 rated helpful reviews. Reviews should be rated helpful at least.

a_df_filtered = a_df_unique[a_df_unique['HelpfulnessDenominator'] <= (a_df_unique['HelpfulnessNumerator']*2)] #Basically, get rid of all rows that have less than 1:2 helpful to non-helpful reviews. 
a_df_filtered_2 = a_df_filtered[a_df_filtered['HelpfulnessNumerator'] != 0]

t_df_filtered = t_df_unique[t_df_unique['thumbsUpCount'] != 0]

#===================================================

# Extra: Miniature version of dataset to test smaller chunks of data in the interest of time and my computer's health
percent_used = 0.005

mini_amazon_df = a_df_filtered_2.sample(frac=percent_used, random_state=42)
mini_tinder_df = t_df_filtered.sample(frac=percent_used, random_state=42)
#===================================================

#3 standardize all text (lemmatize and drop stopwords)
a_df_cleaned = clean_dataframe(mini_amazon_df, "Text")
t_df_cleaned = clean_dataframe(mini_tinder_df, "content")

#===================================================



In [None]:
# 4. Bin into classes using pd.cut, converting previous scores of 1-5 to a class 0, 1, or 2 (not saitsfied, somewhat satisfied, satisfied)

a_df_cleaned['Satisfaction'] = pd.cut(
    a_df_cleaned['Score'], bins=[0, 3, 4, 5],
    labels=classes, right=True, include_lowest=True
)
t_df_cleaned['Satisfaction'] = pd.cut(
    t_df_cleaned['score'], bins=[0, 3, 4, 5],
    labels=classes, right=True, include_lowest=True
)

# Map satisfaction text to int labels

a_df_cleaned['Satisfaction'] = a_df_cleaned['Satisfaction'].astype(str)
t_df_cleaned['Satisfaction'] = t_df_cleaned['Satisfaction'].astype(str)

a_df_cleaned["label"] = a_df_cleaned["Satisfaction"].map(class2id).astype(int)
t_df_cleaned["label"] = t_df_cleaned["Satisfaction"].map(class2id).astype(int)

# Drop any rows with missing labels (in case of mapping issues)
a_df_cleaned = a_df_cleaned.dropna(subset=["label"])
t_df_cleaned = t_df_cleaned.dropna(subset=["label"])

# # Now convert to Hugging Face Datasets bc its better for NLP
# from datasets import Dataset

# amazon_dataset = Dataset.from_pandas(a_df_cleaned[["Text", "label"]].reset_index(drop=True))
# tinder_dataset = Dataset.from_pandas(t_df_cleaned[["content", "label"]].reset_index(drop=True))
print(a_df_cleaned.head())

            Id  HelpfulnessNumerator  HelpfulnessDenominator  Score  \
67333    67334                     5                       7      1   
47358    47359                     1                       1      1   
530169  530170                     4                       4      5   
159483  159484                     1                       1      4   
495351  495352                     3                       3      5   

                                   Summary  \
67333                       Skinny Noodles   
47358                      Not worth it...   
530169  Great Price Delivered to Your Door   
159483                      Nice in coffee   
495351                         Maple Syrup   

                                                     Text        Satisfaction  \
67333   miracle noodle aka skinny noodle disgusting th...       Not Satisfied   
47358   well buy half piece large size cheap pos produ...       Not Satisfied   
530169  club cracker arrive fresh good shape buck goo

**READ DATA FROM DATASET AND TOKENIZE IT!**

In [None]:
# Load an array for each pair of text/label for each dataset.
tokenizer = AutoTokenizer.from_pretrained(model_path, clean_up_tokenization_spaces=True)

combined_text = pd.concat([a_df_cleaned["Text"], t_df_cleaned["content"]], ignore_index=True)
combined_labels = pd.concat([a_df_cleaned["label"], t_df_cleaned["label"]], ignore_index=True)

df = pd.DataFrame({"text": combined_text, "label": combined_labels})

examples = df["text"].values
Y_true = df["label"].values

# make training and validation sets through the training dataset
examples_train, examples_test, labels_train, labels_test = train_test_split(examples, Y_true, test_size=0.05, random_state=42) #95% of the dataset is training, 5% for eval

train = Dataset.from_dict( {"text": examples_train, "label": labels_train} )

split = train.train_test_split(test_size=0.1, seed=42)
train_dataset = split['train']
val_dataset = split['test']

test_dataset = Dataset.from_dict( {"text": examples_test, "label": labels_test} )

#Tokenize each dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/1181 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/132 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

**SET UP VARIABLES AND FUNCTIONS FOR RESULTS LOG**

In [None]:

from datasets import load_metric
accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")

print(len(tokenized_train))
print(len(tokenized_test))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = labels.squeeze()
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    return {**accuracy, **f1}

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


1181
70


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


**SET UP MODEL AND TRAIN IT**

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                        num_labels=len(classes), id2label=id2class, label2id=class2id, problem_type = "single_label_classification")

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# for i in range(num_folds):

#Best model for Deberta: batch_size = 8, learning_rate = 2e-5

training_args = TrainingArguments(
   output_dir="my_awesome_model",
   per_device_train_batch_size = 8, #adjustable!
   learning_rate= 2e-5, #adjustable!
   num_train_epochs=3,
   evaluation_strategy="epoch",
   save_strategy="epoch",
   load_best_model_at_end=True,
)

trainer = Trainer(

   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_val,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

trainer.train()




  0%|          | 0/444 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 0.5318787097930908, 'eval_accuracy': 0.7878787878787878, 'eval_f1': 0.7623992318170654, 'eval_runtime': 5.38, 'eval_samples_per_second': 24.535, 'eval_steps_per_second': 3.16, 'epoch': 1.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 0.5214681625366211, 'eval_accuracy': 0.8257575757575758, 'eval_f1': 0.799997196759454, 'eval_runtime': 4.8883, 'eval_samples_per_second': 27.003, 'eval_steps_per_second': 3.478, 'epoch': 2.0}


  0%|          | 0/17 [00:00<?, ?it/s]

{'eval_loss': 0.46288469433784485, 'eval_accuracy': 0.8560606060606061, 'eval_f1': 0.8293300653594771, 'eval_runtime': 5.1138, 'eval_samples_per_second': 25.813, 'eval_steps_per_second': 3.324, 'epoch': 3.0}
{'train_runtime': 903.7318, 'train_samples_per_second': 3.92, 'train_steps_per_second': 0.491, 'train_loss': 0.635158160785297, 'epoch': 3.0}


TrainOutput(global_step=444, training_loss=0.635158160785297, metrics={'train_runtime': 903.7318, 'train_samples_per_second': 3.92, 'train_steps_per_second': 0.491, 'total_flos': 93635653080456.0, 'train_loss': 0.635158160785297, 'epoch': 3.0})

**TEST THE MODEL AND RUN PRINT RESULTS**

In [None]:
TESTING_SET = tokenized_test


predictions = trainer.predict(TESTING_SET)

probs = sigmoid(torch.from_numpy(predictions.predictions))

#Mostly ChatGPT
print(f"RESULTS LOG:")

pred_class = np.argmax(probs, axis=1)  # predicted class index per example

print(f"F1: {f1_score(y_true=TESTING_SET['label'], y_pred=pred_class, average='weighted')}")

# Precision and recall (macro-average across labels)
precision_macro = precision_score(pred_class, labels_test, average='macro')
recall_macro = recall_score(pred_class, labels_test, average='macro')

# Precision and recall (micro-average across all samples and labels)
precision_micro = precision_score(pred_class, labels_test, average='micro')
recall_micro = recall_score(pred_class, labels_test, average='micro')

print(f"MACROS: Precision: {precision_macro}  Recall: {recall_macro} ")
print(f"MICROS: Precision: {precision_micro}  Recall: {recall_micro} ")


  0%|          | 0/9 [00:00<?, ?it/s]

RESULTS LOG:
F1: 0.8064449648711944
MACROS: Precision: 0.5906932573599241  Recall: 0.5506535947712418 
MICROS: Precision: 0.8285714285714286  Recall: 0.8285714285714286 


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
