*import necessary libraries*

In [1]:
import pandas as pd
import sys
import os
import numpy as np

# Import the data

In [3]:
# import the preprocessed vectorized data
data = pd.read_csv('data/pre_proc_data/Musical_instruments_reviews_vector.csv')

# Optimize the model class

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import plotly.express as px
from imblearn.over_sampling import SMOTE

In [18]:
class_labels = ['Negative', 'Neutral', 'Positive']

confusion_matrix_kwargs = dict(
    text_auto=True, 
    title="Confusion Matrix", width=1000, height=800,
    labels=dict(x="Predicted", y="True Label"),
    x=class_labels,
    y=class_labels,
    color_continuous_scale='Blues'
)

class Model:
    def __init__(self, X, y, model_architecture, vectorizer, balancer, random_seed=42, test_size=0.2) -> None:
        self.X = X
        self.y = y
        self.model_instance = model_architecture
        self.vectorizer = vectorizer
        self.balancer = balancer
        self.random_seed = random_seed
        self.test_size = test_size
        
        # lets vectorize the data
        self.x_vect = self.vectorizer.fit_transform(self.X)
        # lets balance the data
        self.x_balanced, self.y_balanced = self.balancer.fit_resample(self.x_vect,self.y)
        # train test split using the above X, y, test_size and random_state
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.x_balanced, self.y_balanced, test_size=self.test_size, random_state=self.random_seed)
        
    def fit(self):
        # fit the model to the training data
        self.model_instance.fit(self.X_train, self.y_train)
        
    def predict(self):
        # predict the test data
        return self.model_instance.predict(self.X_test)
    
    def predict_custom(self, X_test):
        return self.pipeline.predict(X_test)
    
    def predict_proba(self):
        return self.pipeline.predict_proba(self.X_test)
    
    def report(self, y_pred, class_labels):
        print(classification_report(self.y_test, y_pred, target_names=class_labels))
        confusion_matrix_toshow = confusion_matrix(self.y_test, y_pred)
        fig = px.imshow(
            confusion_matrix_toshow, 
            **confusion_matrix_kwargs
            )
        fig.show()
    def report_custom(self, y_true, y_pred, class_labels):
        print(classification_report(y_true, y_pred, target_names=class_labels))
        confusion_matrix_toshow = confusion_matrix(y_true, y_pred)
        fig = px.imshow(
            confusion_matrix_toshow, 
            **confusion_matrix_kwargs
            )
        fig.show()

In [19]:
X = data['review']
y = data['sentiment']

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
model = Model(X, y, LogisticRegression(), TfidfVectorizer(min_df=0.0,max_df=1,use_idf=True,ngram_range=(1,2)), SMOTE(random_state=42))
model.fit()

In [23]:
y_pred = model.predict()

In [24]:
model.report(y_pred, class_labels)

              precision    recall  f1-score   support

    Negative       1.00      0.88      0.94      1820
     Neutral       1.00      0.83      0.91      1789
    Positive       0.78      1.00      0.87      1800

    accuracy                           0.90      5409
   macro avg       0.93      0.90      0.91      5409
weighted avg       0.93      0.90      0.91      5409



In [25]:
from sklearn.naive_bayes import MultinomialNB


In [26]:
model = Model(X, y, MultinomialNB(), TfidfVectorizer(min_df=0.0,max_df=1,use_idf=True,ngram_range=(1,2)), SMOTE(random_state=42))
model.fit()
y_pred = model.predict()
model.report(y_pred, class_labels)

              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00      1820
     Neutral       0.99      1.00      1.00      1789
    Positive       1.00      0.99      1.00      1800

    accuracy                           1.00      5409
   macro avg       1.00      1.00      1.00      5409
weighted avg       1.00      1.00      1.00      5409



In [28]:
test_size=0.5
random_state=42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
model = Model(X, y, MultinomialNB(), TfidfVectorizer(min_df=0.0,max_df=1,use_idf=True,ngram_range=(1,2)), SMOTE(random_state=42), test_size=test_size, random_seed=random_state)
model.fit()
y_pred = model.predict()
model.report(y_pred, class_labels)

              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00      4559
     Neutral       0.99      1.00      0.99      4452
    Positive       1.00      0.99      0.99      4512

    accuracy                           1.00     13523
   macro avg       1.00      1.00      1.00     13523
weighted avg       1.00      1.00      1.00     13523

