<a href="https://colab.research.google.com/github/Que1Pereza2/Mr.CrabsAnalyzer/blob/main/CanYouFeelItNow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports block

In [55]:
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

This block reads the None2775.csv file and creates the arrays features and labels.

In [63]:
reviews = pd.read_csv("None2775.csv")

features = reviews.iloc[:,0].values
labels = reviews.iloc[:,1].values

This function uses regex to clean the data and prepares it for the neural network to train on.

In [57]:
processed_features = []
for sentence in range(0, len(features)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8)
processed_features = vectorizer.fit_transform(processed_features).toarray()


Here we convert the array scores from strings to int.

In [89]:
# Create a Pandas Series
string_series = pd.Series(labels)
# Converting to int after removing quotation marks
labels = [int(x.replace('"', '')) for x in string_series]
labels = pd.DataFrame(labels)

This block handles the undersampling of the positive reviews so the scores appear in equal quantity.

In [88]:
# Separate the classes
majority_class = reviews[reviews['score'] == 1]
minority_class = reviews[reviews['score'] == 0]
print(reviews['score'].unique())

# Downsample majority class
majority_downsampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine minority class with downsampled majority class
balanced_df = pd.concat([majority_downsampled, minority_class])

# Shuffle the resulting DataFrame
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_df)


['"1"' '"0"']
Empty DataFrame
Columns: [review, score]
Index: []


Creating and feeding the data to the Neural Network

In [59]:
rus = RandomUnderSampler(random_state=42, sampling_strategy = 'majority')

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)

X_resampled, y_X_resampled = rus.fit_resample(X_train, y_train)

text_classifier = RandomForestClassifier(criterion="entropy",n_estimators=200, random_state=42)

text_classifier.fit(X_resampled, y_X_resampled)
# text_classifier.fit(X_train, y_train)

predictions = text_classifier.predict(X_test)

from pprint import pprint

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


Neural Network stats

In [97]:
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[ 444  101]
 [ 279 1187]]
              precision    recall  f1-score   support

           0       0.61      0.81      0.70       545
           1       0.92      0.81      0.86      1466

    accuracy                           0.81      2011
   macro avg       0.77      0.81      0.78      2011
weighted avg       0.84      0.81      0.82      2011

0.8110392839383391


Using the AI

In [96]:
reviewToPredict = input(f"Please provide a review!\n ")
if text_classifier.predict(vectorizer.transform([reviewToPredict]).toarray()) == 1:
    print("The review is positive")
else:
    print("The review is negative")

Please provide a review!
 I didn't like the movie, it was awful
The review is negative
