In [3]:
# Text Classification of Amazon, Yelp & IMDB Reviews using TF IDF Matrix

# Importing the libraries
import pandas as pd, numpy as np

# Importing dataset
yelp_ds = pd.read_csv('yelp_labelled.txt', sep ='\t', header = None, names = ['reviews', 'rating'])
amazon_ds = pd.read_csv('amazon_cells_labelled.txt', sep ='\t', header = None, names = ['reviews', 'rating'])
imdb_ds = pd.read_csv('imdb_labelled.txt', sep ='\t', header = None, names = ['reviews', 'rating'])

#combine the datasets
dataset = pd.concat([yelp_ds, amazon_ds, imdb_ds], ignore_index = True)

# Manually setting the rating for 2 reviews with NaN values
dataset.fillna(1, inplace = True)

In [8]:
dataset.head()

Unnamed: 0,reviews,rating
0,wow loved place,1
1,crust good,0
2,tasty texture nasty,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great price,1


In [4]:
dataset['reviews'].head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: reviews, dtype: object

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import re

# Defining a Function to clean up the reviews
def text_preprocess(ds: pd.Series) -> pd.Series:
    """
    Apply NLP Preprocessing Techniques to the reviews.

    Parameters:
    ds (pd.Series): A pandas Series containing the text data to preprocess.

    Returns:
    pd.Series: A pandas Series with cleaned and preprocessed text.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    def clean_text(text):
        # Remove non-alphanumeric characters and convert to lowercase
        text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
        # Tokenize words
        words = text.split()
        # Remove stopwords and lemmatize each word
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        # Join back to form the processed sentence
        return ' '.join(words)
    
    # Apply the cleaning function to the entire Series
    return ds.apply(clean_text)

# Assuming dataset is a DataFrame with a 'reviews' column
dataset['reviews'] = text_preprocess(dataset['reviews'])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/youssefmecky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/youssefmecky/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/youssefmecky/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
dataset['reviews'].head()

0                                      wow loved place
1                                           crust good
2                                  tasty texture nasty
3    stopped late may bank holiday rick steve recom...
4                           selection menu great price
Name: reviews, dtype: object

In [9]:
# Building a TF-IDF matrix out of the corpus of reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# 'reviews' column should already be preprocessed

# Step 1: Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = tfidf_vectorizer.fit_transform(dataset['reviews'])
y = dataset['rating']  # Target labels

# Step 2: Splitting into training & test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Training the classifier & predicting on test data
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Making predictions
y_pred = classifier.predict(X_test)

# Step 4: Classification metrics
print('\nAccuracy:', accuracy_score(y_test, y_pred))
print('\nClassification Report')
print('======================================================')
print('\n', classification_report(y_test, y_pred))



Accuracy: 0.7709090909090909

Classification Report

               precision    recall  f1-score   support

           0       0.79      0.75      0.77       285
           1       0.75      0.79      0.77       265

    accuracy                           0.77       550
   macro avg       0.77      0.77      0.77       550
weighted avg       0.77      0.77      0.77       550

