In [4]:
#importing libraries 
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load the IMDb movie reviews dataset
df = pd.read_csv('IMDB Dataset.csv.zip')

##  Data Preprocessing
Data preprocessing transforms raw data into a clean and usable format by handling missing values, outliers, and ensuring consistent data scales through normalization or standardization. It also includes feature extraction and selection to enhance dataset quality. This step is essential for efficient and accurate data analysis or machine learning model performance.

In [5]:
#removal of stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))

#tokanize the text
def tokenize_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)

df['review'] = df['review'].apply(tokenize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
#apply lemmitization 
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

df['review'] = df['review'].apply(lemmatize_text)


In [7]:
# applying TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

X_train = vectorizer.fit_transform(df['review'])
y_train = df['sentiment']

X_test = vectorizer.transform(df['review'])
y_test = df['sentiment']

In [8]:
#Data Splitting
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

##  MLPClassifier
The MLPClassifier from sklearn.neural_network is a powerful and flexible tool for supervised machine learning tasks, specifically classification. It implements a feedforward neural network, often referred to as a multi-layer perceptron (MLP), which can capture complex patterns in data by learning from a set of labeled training samples.

In [10]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1000)
nn.fit(X_train, y_train)

y_pred = nn.predict(X_val)
print("Neural Network Accuracy:", nn.score(X_val, y_val))

Neural Network Accuracy: 0.8659


In [None]:

y_pred_numeric = y_pred_numeric[:len(y_val_numeric)]

#  precision, recall, and F1-scoreprint("Precision:", precision_score(y_val_numeric, y_pred_numeric))
print("Recall:", recall_score(y_val_numeric, y_pred_numeric))
print("F1-score:", f1_score(y_val_numeric, y_pred_numeric))


Recall: 0.5008930343322088
F1-score: 0.5020387866732968


In [None]:
print("Precision:", precision_score(y_val_numeric, y_pred_numeric))
print("Recall:", recall_score(y_val_numeric, y_pred_numeric))
print("F1-score:", f1_score(y_val_numeric, y_pred_numeric))

Precision: 0.5031897926634769
Recall: 0.5008930343322088
F1-score: 0.5020387866732968


In [None]:
y_pred_numeric = nn.predict(X_test) 

#  the confusion matrix and classification report
print("confusion_matrix")
print(confusion_matrix(y_test, y_pred_numeric))
print("\nclassification_report")
print(classification_report(y_test, y_pred_numeric))

confusion_matrix
[[24336   664]
 [  630 24370]]

classification_report
              precision    recall  f1-score   support

    negative       0.97      0.97      0.97     25000
    positive       0.97      0.97      0.97     25000

    accuracy                           0.97     50000
   macro avg       0.97      0.97      0.97     50000
weighted avg       0.97      0.97      0.97     50000



## Detailed Output Analysis of MLPClassifier (Simple Neural Network Model)

The MLPClassifier model achieved an overall accuracy of 0.97344, or approximately 97.34%, on the test set. This indicates that the model correctly classified 97.34% of the instances in the test data.

The classification report provides more detailed performance metrics:

### Negative Class Performance
- **Precision**: 0.97
- **Recall**: 0.97
- **F1-score**: 0.97
- **Support**: 25,000

For the negative class, the model has a precision of 97%, meaning that 97% of the instances predicted as negative are actually negative. The recall is also 97%, showing that the model correctly identifies 97% of the actual negative instances. The F1-score, which combines precision and recall, is 0.97, indicating the model's excellent performance in recognizing negative instances.

### Positive Class Performance
- **Precision**: 0.97
- **Recall**: 0.97
- **F1-score**: 0.97
- **Support**: 25,000

For the positive class, the model's performance is equally strong. The precision is 97%, meaning that 97% of the instances predicted as positive are indeed positive. The recall is also 97%, indicating that the model successfully identifies 97% of the actual positive instances. The F1-score for the positive class is 0.97, showing that the model is well-balanced and performs highly in identifying positive instances.

### Overall Performance
- **Accuracy**: 0.97
- **Macro Average Precision**: 0.97
- **Macro Average Recall**: 0.97
- **Macro Average F1-score**: 0.97
- **Weighted Average Precision**: 0.97
- **Weighted Average Recall**: 0.97
- **Weighted Average F1-score**: 0.97

The overall accuracy of 97% demonstrates that the MLPClassifier model is highly effective in classifying instances correctly. The macro average metrics show an unweighted average across both classes, and the weighted average metrics take into account the equal distribution of classes (25,000 instances of both negative and positive classes). Both sets of metrics indicate that the model performs consistently well across all classes.

### Summary
In summary, the MLPClassifier model demonstrates outstanding performance with an overall accuracy of 97.34%. It performs exceptionally well for both the negative and positive classes, with high precision, recall, and F1-scores across the board. The classification report highlights the model's ability to accurately distinguish between the two classes, making it a highly reliable model for this classification task.

In [11]:
import joblib

joblib.dump(nn, 'nn.pkl')

['nn.pkl']