# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import tensorflow_datasets as tfds
import tensorflow as tf
tfds.disable_progress_bar()

# Read Dataset





In [None]:
# Load the IMDB reviews dataset as (text, label) pairs, together with its metadata.
dataset, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)
train_dataset = dataset['train']
test_dataset = dataset['test']


In [None]:
# Convert a subset of the training dataset to a Pandas DataFrame.
# Rows are gathered in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and it copied the whole frame on every call.
records = [
    {'text': text.numpy().decode('utf-8'), 'label': label.numpy()}
    for text, label in train_dataset.take(3000)
]
df = pd.DataFrame(records, columns=['text', 'label'])

In [None]:
# Overview of the data: column names, non-null counts and the dtype of each column
df.info()

In [None]:
# Preview the first rows of the DataFrame
df.head()

In [None]:
# Check the dataset shape as (rows, columns)
df.shape

#Check for missing Values

In [None]:
# Count the missing values in each column
df.isnull().sum()

# Descriptive Statistics

In [None]:
# Summary statistics for every column, regardless of dtype
df.describe(include='all')

In [None]:
# Summary statistics restricted to object-dtype columns
df.describe(include="O")

#Label Distribution

# Preprocessing and cleaning

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK data used below: the stop-word list, the tokenizer models and WordNet.
# 'punkt_tab' is the tokenizer data word_tokenize loads on recent NLTK releases;
# 'punkt' is kept for older releases.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    """Build the English stop-word set and the lemmatizer once and reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


# Individual punctuation characters, for per-character membership tests.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def preprocess_text(text):
    """Normalise one document before it is vectorized.

    The text is lowercased, tokenized with ``word_tokenize``, stripped of
    punctuation-only tokens and English stop words, lemmatized with
    ``WordNetLemmatizer`` and re-joined with single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned document as one space-separated string.
    """
    stop_words, lemmatizer = _text_resources()

    # Lowercasing and tokenization
    tokens = word_tokenize(text.lower())

    # Removing punctuation and stop words. A token counts as punctuation when
    # every one of its characters is punctuation; a substring test against
    # string.punctuation would let tokens such as "..." or "''" through.
    kept = [
        token
        for token in tokens
        if not all(ch in _PUNCTUATION_CHARS for ch in token)
        and token not in stop_words
    ]

    # Lemmatization, then joining the tokens back into a single string
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)



In [None]:
# Clean every review with preprocess_text and store the result back in the 'text' column
df['text'] = df['text'].map(preprocess_text)
# Show the cleaned column
print(df['text'])

In [None]:
# View a sample of the data after cleaning and preprocessing
df.head()

# Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Create the vectorizer that turns text into a matrix of token counts.
# No options are passed, so scikit-learn's defaults apply.
bow_vectorizer = CountVectorizer()

In [None]:
# Apply bag of words on the preprocessed text column
X_bow = bow_vectorizer.fit_transform(df['text'])

# Report the matrix size and preview only the first rows: densifying the whole
# sparse document-term matrix allocates n_documents x vocabulary_size cells.
print(X_bow.shape)
print(X_bow[:5].toarray())

# Bag of Ngrams



In [None]:
# A bag of n-grams breaks a text into sequences of n consecutive words and counts how often each sequence occurs.

In [None]:
# Create an instance of the CountVectorizer for bag of n-grams.
# ngram_range=(2, 3) makes it count every sequence of 2 and of 3 consecutive words.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 3))

In [None]:
# Apply bag of n-grams on the preprocessed text column
X_ngram = ngram_vectorizer.fit_transform(df['text'])

# Report the matrix size and preview only the first rows. The 2/3-gram
# vocabulary is very large, so densifying the whole sparse matrix
# (n_documents x vocabulary_size cells) can exhaust memory.
print(X_ngram.shape)
print(X_ngram[:5].toarray())

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create an instance of the TfidfVectorizer, which weights term counts by
# inverse document frequency. No options are passed, so the defaults apply.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Apply TF-IDF on the preprocessed text column
X_tfidf = tfidf_vectorizer.fit_transform(df['text'])

# Report the matrix size and preview only the first rows: densifying the whole
# sparse TF-IDF matrix allocates n_documents x vocabulary_size floats.
print(X_tfidf.shape)
print(X_tfidf[:5].toarray())

# OneHotEncoder & LabelEncoder for Categorical

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Create an instance of LabelEncoder for label encoding
label_encoder = LabelEncoder()

# Encode the categorical 'label' column. Free-text reviews are not categories:
# label-encoding the 'text' column would give almost every row its own class.
labels = label_encoder.fit_transform(df['label'])

# Print the encoded labels
print(labels)

# Reshape the labels to a column vector, as OneHotEncoder expects 2-D input
labels = labels.reshape(-1, 1)

# Create an instance of OneHotEncoder for one-hot encoding.
# The sparse/sparse_output constructor flag is avoided because its name differs
# between scikit-learn releases; the default sparse result is densified instead.
onehot_encoder = OneHotEncoder()

# Apply one-hot encoding on the labels and convert the result to a dense array
onehot_labels = onehot_encoder.fit_transform(labels).toarray()

# Print the one-hot encoded labels
print(onehot_labels)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Create the vectorizer whose token-count matrix is used as the model features below.
vectorizer = CountVectorizer()

In [None]:
# Vectorize the text column with CountVectorizer.
# The column is used as it stands at this point; no further cleaning happens here.
text_data = df['text']
text_features = vectorizer.fit_transform(text_data.tolist())

#Label Encoder and Full Sample target variable and independent variables

In [None]:
# Label encoding maps each distinct class to an integer; for example, the
# categories 'red', 'green' and 'blue' would become values such as 0, 1 and 2.
# Here it is applied to the 'label' column to produce the target vector.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['label'])

# Splitting to training and testing data


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text_features,
    labels,
    random_state=42,
    test_size=0.2,
)

#Logistic Regression

In [None]:
# Logistic regression is a statistical method for binary classification: a linear score is passed through the logistic (sigmoid) function to produce a value between 0 and 1.
# The goal is to predict a binary outcome.

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the logistic regression classifier and train it on the training split
# (fit() returns the estimator itself, so both steps are chained).
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Score the trained model on the held-out test split
rounded_acc_lr = model.score(X_test, y_test)

print("Accuracy:", rounded_acc_lr)


# Gradient Boosting

In [None]:
# Gradient boosting adds weak learners (decision trees) sequentially, each one fitted to correct the errors of the ensemble built so far.
# It can be used for both regression and classification. The cell below uses scikit-learn's GradientBoostingClassifier, not the separate XGBoost library.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Create an instance of the Gradient Boosting classifier.
# The seed is fixed, as for the logistic regression model, so the score is
# reproducible from run to run.
model = GradientBoostingClassifier(random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Evaluate the model on the test data (mean accuracy)
rounded_acc_xgb  = model.score(X_test, y_test)


print("Accuracy:", rounded_acc_xgb)


# Random Forest

In [None]:
# A random forest builds multiple decision trees during training and combines their predictions to improve accuracy and generalization.
# It provides robust predictions for both classification and regression tasks.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create an instance of the Random Forest classifier.
# The forest is built from random bootstrap samples and feature subsets, so the
# seed is fixed to make the reported accuracy reproducible.
model_rf = RandomForestClassifier(random_state=42)

# Fit the RandomForest model on the training data
model_rf.fit(X_train, y_train)

# Evaluate the RandomForest model on the test data (mean accuracy)
rounded_acc_rf  = model_rf.score(X_test, y_test)


print("Accuracy:", rounded_acc_rf)


#Chi-Square

In [None]:
# The chi-square statistic tests whether two categorical variables are independent.
# For feature selection it scores the association between each feature and the target variable in a classification task.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
# Apply Chi-Square feature selection, keeping the 1000 best-scoring features
# (or every feature when fewer than 1000 exist). The selector is fitted on the
# training split only and then reused on the test split.
k_best = SelectKBest(chi2, k=min(1000, X_train.shape[1]))  # Adjust the value of k based on your requirements
X_train_chi2 = k_best.fit_transform(X_train, y_train)
X_test_chi2 = k_best.transform(X_test)

# Create an instance of the RandomForest classifier for Chi-Square model.
# The seed is fixed so the reported accuracy is reproducible.
model_chi2 = RandomForestClassifier(random_state=42)

# Fit the RandomForest model with Chi-Square features on the training data
model_chi2.fit(X_train_chi2, y_train)

# Evaluate the RandomForest model with Chi-Square features on the test data
rounded_acc_chi2 = model_chi2.score(X_test_chi2, y_test)

print("Accuracy (Chi-Square):", rounded_acc_chi2)

#Models Compared

In [None]:
# Compare the test-set scores of the four models, best first.
# The values come from each classifier's score() method, which reports mean
# accuracy, so the column is labelled as accuracy (in percent) rather than
# R-squared. The model names match the estimators actually trained above.
models = pd.DataFrame({
    'Model': [
        'Logistic Regression',
        'Gradient Boosting',
        'Random Forest',
        'Random Forest + Chi-Square',
    ],
    'Accuracy (%)': [
        rounded_acc_lr * 100,
        rounded_acc_xgb * 100,
        rounded_acc_rf * 100,
        rounded_acc_chi2 * 100,
    ],
})
models.sort_values(by='Accuracy (%)', ascending=False)