<a href="https://colab.research.google.com/github/RafaelAnga/MachineLearning_Bootcamp/blob/main/natural_language_processing_pipeline_Stemming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Importing the libraries

In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from textblob import TextBlob

from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

## Importing the dataset

In [15]:
# Mount Google Drive so the dataset folder is reachable from this Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import os  # Library necessary for accessing the folder

# Folder on the mounted drive that holds the review dataset
DATASET_DIR = '/content/drive/MyDrive/Machine Learning/NLP/DataSets'

# Make the dataset folder the working directory so files can be read by name
os.chdir(DATASET_DIR)

# List the folder contents to confirm the dataset file is present
os.listdir()

['Restaurant_Reviews.tsv']

In [17]:
# Tab-separated file; quoting=3 (csv.QUOTE_NONE) treats quote characters as plain text
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)


## Cleaning the texts

In [18]:
import re
import nltk
nltk.download('stopwords')
# The NLTK stopword list and the Porter stemmer are not applied while building
# the corpus; the download and imports are kept so that cells which reference
# `stopwords` or `PorterStemmer` continue to work.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer #Transform words into their base form (loved -> love)

# Words removed from every review after spell correction.
custom_stopwords = {'food', 'place', 'service', 'restaurant', 'menu', 'server', 'staff'}


def clean_review(text):
    """Return the cleaned, space-joined form of one raw review string.

    Steps, in order:
      1. replace every non-letter character with a space;
      2. lowercase and split on whitespace;
      3. collapse each run of a repeated character to a single character;
      4. spell-correct each word individually with TextBlob;
      5. drop words listed in ``custom_stopwords``.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # NOTE(review): this also collapses legitimate double letters
    # ("good" -> "god", "been" -> "ben") before spell correction runs, which
    # alters real words in the corpus. Consider r'(.)\1{2,}' -> r'\1\1' and
    # re-evaluating the models. Left unchanged here so results stay comparable.
    words = [re.sub(r'(.)\1+', r'\1', word) for word in words]
    words = [str(TextBlob(word).correct()) for word in words]
    return ' '.join(word for word in words if word not in custom_stopwords)


# Clean every review in the dataset (no hard-coded row count).
corpus = [clean_review(text) for text in dataset['Review']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Display the cleaned reviews to inspect the result of the preprocessing step
corpus

['now loved this',
 'crust is not god',
 'not taste and the texture was just nasty',
 'stopped by during the late may bank holiday of rich steve recommendation and loved it',
 'the selection on the was great and so were the prices',
 'now i am getting angry and i want my damn who',
 'honestly it didn t taste that fresh',
 'the potatoes were like rubber and you could tell they had ben made up ahead of time being kept under a warmer',
 'the fires were great to',
 'a great touch',
 'was very prompt',
 'would not go back',
 'the cashier had no care what so ever on what i had to say it still ended up being way overpriced',
 'i tried the cape cod revolt chicken with chancery m',
 'i was disgusted because i was pretty sure that was human hair',
 'i was shocked because no signs indicate cash only',
 'highly recommended',
 'waited was a little slow in',
 'this is not worth your time let alone began',
 'did not like at al',
 'the bursitis bah',
 'the for amazing',
 'is also cut',
 'i could care 

## Creating the Bag of Words model

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# Cap on the vocabulary size kept by the vectorizer
MAX_FEATURES = 1300

cv = CountVectorizer(max_features=MAX_FEATURES)

# Learn the vocabulary from the cleaned corpus and build the dense count matrix
word_counts = cv.fit_transform(corpus)
X = word_counts.toarray()

# The label is taken from the last column of the dataset
y = dataset.iloc[:, -1].values

In [21]:
# Number of feature columns in the bag-of-words matrix
X.shape[1]

1300

## Splitting the dataset into the Training set and Test set

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Training the pipeline models on the Training set

In [23]:
def _classifier_only_pipeline(estimator):
    """Wrap `estimator` in an ImbPipeline whose single step is named 'classifier'."""
    return ImbPipeline([('classifier', estimator)])


# One pipeline per candidate model. Each pipeline holds only the classifier
# step; no resampling step is configured here.
XGBoost_pipeline = _classifier_only_pipeline(XGBClassifier())

random_forest_pipeline = _classifier_only_pipeline(
    RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
)

logistic_reg_pipeline = _classifier_only_pipeline(LogisticRegression())

naive_bayes_pipeline = _classifier_only_pipeline(GaussianNB())

svm_pipeline = _classifier_only_pipeline(SVC(probability=True))

gradient_boost_pipeline = _classifier_only_pipeline(
    GradientBoostingClassifier(random_state=42)
)

knn_pipeline = _classifier_only_pipeline(KNeighborsClassifier(n_neighbors=5))

In [24]:
# Define the pipelines
pipelines_to_compare = {
    'XGBoost Pipeline': XGBoost_pipeline,
    'Random Forest Pipeline': random_forest_pipeline,
    'Logistic Regression Pipeline': logistic_reg_pipeline,
    'Naive Bayes Pipeline': naive_bayes_pipeline,
    'Support Vector Machine Pipeline': svm_pipeline,
    'Gradient Boost Pipeline': gradient_boost_pipeline,
    'KNN Pipeline': knn_pipeline
}

# Maps pipeline name -> {'Accuracy': accuracy on the training set}
results = {}

for name, pipeline in pipelines_to_compare.items():
    print(f"\nEvaluating {name} on X_train...")

    # Train the pipeline
    pipeline.fit(X_train, y_train)

    # Predict on the *training* data: this measures how well each model fits
    # the data it was trained on, not how it performs on unseen reviews.
    y_pred = pipeline.predict(X_train)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_train, y_pred)

    # Store results
    results[name] = {
        'Accuracy': accuracy,
    }


# Display results for comparison (one row per pipeline)
results_df = pd.DataFrame(results).T

print("Pipeline Comparison Results:")
display(results_df)




Evaluating XGBoost Pipeline on X_train...

Evaluating Random Forest Pipeline on X_train...

Evaluating Logistic Regression Pipeline on X_train...

Evaluating Naive Bayes Pipeline on X_train...

Evaluating Support Vector Machine Pipeline on X_train...

Evaluating Gradient Boost Pipeline on X_train...

Evaluating KNN Pipeline on X_train...
Pipeline Comparison Results:


Unnamed: 0,Accuracy
XGBoost Pipeline,0.94
Random Forest Pipeline,0.985
Logistic Regression Pipeline,0.97
Naive Bayes Pipeline,0.8975
Support Vector Machine Pipeline,0.97125
Gradient Boost Pipeline,0.88625
KNN Pipeline,0.7775


## Predicting the Test set results & making the Confusion Matrix

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score every already-fitted pipeline on the held-out test split
for label, fitted_pipeline in pipelines_to_compare.items():
    print(f"\nEvaluating {label} on X_test...")

    predictions = fitted_pipeline.predict(X_test)

    # Share of correct predictions, and the table of true labels vs. predicted labels
    score = accuracy_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)

    print(f"Accuracy: {score *100}% ")
    print(f"Confusion Matrix:\n{matrix}")



Evaluating XGBoost Pipeline on X_test...
Accuracy: 77.0% 
Confusion Matrix:
[[82 15]
 [31 72]]

Evaluating Random Forest Pipeline on X_test...
Accuracy: 77.0% 
Confusion Matrix:
[[86 11]
 [35 68]]

Evaluating Logistic Regression Pipeline on X_test...
Accuracy: 82.0% 
Confusion Matrix:
[[82 15]
 [21 82]]

Evaluating Naive Bayes Pipeline on X_test...
Accuracy: 68.5% 
Confusion Matrix:
[[52 45]
 [18 85]]

Evaluating Support Vector Machine Pipeline on X_test...
Accuracy: 79.5% 
Confusion Matrix:
[[82 15]
 [26 77]]

Evaluating Gradient Boost Pipeline on X_test...
Accuracy: 73.5% 
Confusion Matrix:
[[87 10]
 [43 60]]

Evaluating KNN Pipeline on X_test...
Accuracy: 60.0% 
Confusion Matrix:
[[64 33]
 [47 56]]


## Predicting the sentiment of a single review

In [26]:
# Single review for prediction
new_review = 'I hate this restaurant so much'

# Clean the review with the same steps used to build the training corpus, so
# its tokens line up with the vocabulary held by `cv`: keep letters only,
# lowercase, split, collapse repeated characters, spell-correct each word with
# TextBlob, then drop the custom stopwords. Stemming and NLTK stopword removal
# are deliberately not applied, because the training corpus was built without them.
custom_stopwords = {'food', 'place', 'service', 'restaurant', 'menu', 'server', 'staff'}
new_words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_words = [re.sub(r'(.)\1+', r'\1', word) for word in new_words]
new_words = [str(TextBlob(word).correct()) for word in new_words]
new_review_clean = ' '.join(word for word in new_words if word not in custom_stopwords)

# Transform the review into the same format as the training data
new_corpus = [new_review_clean]
new_X_test = cv.transform(new_corpus).toarray()

# Predict using the Logistic Regression Pipeline
new_y_pred = logistic_reg_pipeline.predict(new_X_test)

# Output the result
if new_y_pred[0] == 1:
    print("The sentiment of the review is: Positive")
else:
    print("The sentiment of the review is: Negative")


The sentiment of the review is: Negative
