<a href="https://colab.research.google.com/github/Que1Pereza2/Mr.CrabsAnalyzer/blob/main/CanYouFeelItNow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports block

In [20]:
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

This block reads the None2775.csv file and loads it's contents into review.

In [21]:
reviews = pd.read_csv("None2775.csv")

This block handles the undersampling of the positive reviews so the scores appear in equal quantity and creates the label and features arrays used the neural network to train and test.

In [22]:
# Takes all the reviews[score] values and replaces the ' " ' with no space so
#they can be converted to int.
reviews['score'] = reviews['score'].str.replace('"', '').astype(int)

# Separates all the reviews by score.
majority_class = reviews[reviews.score == 1]
minority_class = reviews[reviews.score == 0]

# Downsample majority class
majority_downsampled = majority_class.sample(n = len(minority_class), random_state=42)

# Combine minority class with downsampled majority class
balanced_df = pd.concat([majority_downsampled, minority_class])

# Shuffle the resulting DataFrame
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Creates the features and labels arrays
features = balanced_df.iloc[:, 0].values
labels = balanced_df.iloc[:, 1].values


This block converts the scores from strings to ints, uses regex to clean the data and vectorizes the labels DataFrame, which is composed of strings, so the Neural Network can train on it.

In [23]:
processed_features = []

for sentence in range(0, len(features)):
    # Remove all the special characters.
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))

    # Remove all single characters.
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start.
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)

    # Substituting multiple spaces with single space.
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'.
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase.
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

# Creation of the vectorizer.
vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8)

# Loading the vectorized array back into processed_features.
processed_features = vectorizer.fit_transform(processed_features).toarray()

This block splits the data into train and test arrays and feeds the training data to the Neural Network.

In [25]:
# Creation of the test and train arrays.
X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)

# Creation of the Neural Network.
text_classifier = RandomForestClassifier(criterion="entropy", n_estimators=2000, random_state=42)

# Training the Neural Network.
text_classifier.fit(X_train, y_train)

# Generating the Test results.
predictions = text_classifier.predict(X_test)

Neural Network scores.

In [26]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

[[489  85]
 [ 89 447]]
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       574
           1       0.84      0.83      0.84       536

    accuracy                           0.84      1110
   macro avg       0.84      0.84      0.84      1110
weighted avg       0.84      0.84      0.84      1110

0.8432432432432433


User Interface.

In [18]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [35]:
reviewToPredict = input(f"Please provide a review!\n ")
resultUser=text_classifier.predict(vectorizer.transform([reviewToPredict]).toarray())
print(resultUser[0])
if resultUser == 1:
    print("The review is positive")
else:
    print("The review is negative")

Please provide a review!
 i don't like it
1
The review is positive
