In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import FreqDist
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk.classify import NaiveBayesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imbd-movie-reviews-for-binary-sentiment-analysis/MovieReviewTrainingDatabase.csv


In [31]:
reviews=pd.read_csv("/kaggle/input/imbd-movie-reviews-for-binary-sentiment-analysis/MovieReviewTrainingDatabase.csv")
reviews

Unnamed: 0,sentiment,review
0,Positive,With all this stuff going down at the moment w...
1,Positive,'The Classic War of the Worlds' by Timothy Hin...
2,Negative,The film starts with a manager (Nicholas Bell)...
3,Negative,It must be assumed that those who praised this...
4,Positive,Superbly trashy and wondrously unpretentious 8...
...,...,...
24995,Negative,It seems like more consideration has gone into...
24996,Negative,I don't believe they made this film. Completel...
24997,Negative,"Guy is a loser. Can't get girls, needs to buil..."
24998,Negative,This 30 minute documentary Buñuel made in the ...


In [32]:
reviews.sentiment.value_counts()

Positive    12500
Negative    12500
Name: sentiment, dtype: int64

# **Pre-processing & Data Cleaning**

In [33]:

# Text Cleaning
reviews["review"]=reviews["review"].apply(lambda x: re.sub(r"[^\w\s]", "", str(x)))
# Lower casing
reviews["review"]=reviews["review"].apply(lambda x:x.lower())
# Tokenization
reviews["review"]=reviews["review"].apply(lambda x:word_tokenize(x))
# StopWords 
stopwords_set = set(stopwords.words("english"))
reviews["review"] = reviews["review"].apply(lambda x: [word for word in x if word not in stopwords_set])
# Convert list of words back to a single string
reviews["review"] = reviews["review"].apply(lambda x: " ".join(x))

# Feature Generation using TF-IDF & Split train and test set

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(reviews['review'])  # Review text column
y = reviews['sentiment']  # Sentiment column (target)
# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [35]:
X_train

<20000x117087 sparse matrix of type '<class 'numpy.float64'>'
	with 2006108 stored elements in Compressed Sparse Row format>

# Model Building and Evaluation using different machine learning algorithms

In [36]:
model = DecisionTreeClassifier()
classifier = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.7184


In [37]:

model = RandomForestClassifier()
classifier = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.8448


In [38]:
model = MultinomialNB()
classifier = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.8668


In [39]:
model = LogisticRegression()
classifier = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))


Accuracy:  0.8896


Since Logistic Regression gave the best accuracy ,it's by far the mos suitable learning algorithm for this dataset

For more detailed resulted i will use **classification report** & **confusion matrix**

In [40]:
## How the confusion matrix works :
#             Predicted Class
#              Negative  Positive
#Actual Class
#Negative        TN       FP
#Positive        FN       TP


In [41]:
print("Here is the classification repport: \n",classification_report(y_test, y_pred))
print("Here is the confusion matrix: \n",confusion_matrix(y_test, y_pred))

Here is the classification repport: 
               precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      2539
    Positive       0.88      0.90      0.89      2461

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Here is the confusion matrix: 
 [[2229  310]
 [ 242 2219]]
