In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

import re
import subprocess
import nltk
from nltk.tokenize import word_tokenize

from nltk.stem import  WordNetLemmatizer
# WordNet setup for the lemmatizer:
#   1. download the 'wordnet' corpus into /kaggle/working/
#   2. unzip the downloaded archive into /kaggle/working/corpora
#   3. add /kaggle/working/ to NLTK's data search path so the corpus can be found
nltk.download('wordnet', download_dir='/kaggle/working/')
# The command string is split on whitespace into an argument list for subprocess.run.
command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
# NOTE(review): the result of subprocess.run is not checked, so a failed unzip goes unnoticed here.
subprocess.run(command.split())
nltk.data.path.append('/kaggle/working/')

from nltk.corpus import stopwords ,wordnet

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report ,confusion_matrix


In [None]:
# Location of the IMDB 50k movie-review CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"

# Load the dataset and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

## EDA

Identifying missing values

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Display pandas' summary statistics for the columns of the dataframe.
df.describe()

In [None]:
# Check the class balance: how many reviews carry each sentiment label.
df['sentiment'].value_counts()

In [None]:
lemmatizer = WordNetLemmatizer()

# Build the English stop-word collection once, as a set. Previously
# stopwords.words('english') was evaluated inside the comprehension, i.e. once
# per token of every review, and each membership test scanned a list.
STOP_WORDS = frozenset(stopwords.words('english'))

# Matches every character that is not an ASCII letter.
NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')


def TokenizeandClean (review):
    """Normalise one raw review string for vectorisation.

    Steps, in order:
      1. replace every non-ASCII-letter character with a space,
      2. lower-case the text,
      3. split it into tokens with ``word_tokenize``,
      4. drop tokens found in the English stop-word list and lemmatize the rest,
      5. join the surviving tokens back together with single spaces.

    Parameters
    ----------
    review : str
        The raw review text.

    Returns
    -------
    str
        The cleaned review as one space-separated string.
    """
    letters_only = NON_LETTER_PATTERN.sub(' ', review).lower()
    tokens = word_tokenize(letters_only)
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in STOP_WORDS]
    return ' '.join(kept)

## Data Preprocessing

In [None]:
# Replace each raw review with its cleaned, lemmatized form.
df['review'] = df['review'].apply(TokenizeandClean)

In [None]:
# X holds the review text (model input); Y holds the sentiment label (target).
X, Y = df['review'], df['sentiment']

### Splitting into training and testing datasets

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

**Label-encode the sentiment target**

In [None]:
# Learn the label -> integer mapping from the training labels only,
# then apply that same fitted mapping to both splits.
lab_encode = LabelEncoder().fit(ytrain)

ytrain = lab_encode.transform(ytrain)
ytest = lab_encode.transform(ytest)

**The text has to be vectorized before feeding into the machine learning models.
TF-IDF Vectorizer is used for vectorization**

In [None]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to transform the test reviews.
tf = TfidfVectorizer()
xtraintf, xtesttf = tf.fit_transform(xtrain), tf.transform(xtest)

## Model Building

In [None]:
# Candidate classifiers, keyed by the short name used in the printed reports.
# Every estimator that accepts a seed gets random_state=42.
models = dict(
    logistic=LogisticRegression(random_state=42),
    randomforest=RandomForestClassifier(random_state=42),
    gradientboost=GradientBoostingClassifier(random_state=42),
    mnaivebayes=MultinomialNB(),
    xgboost=XGBClassifier(random_state=42),
)


def model_build(xtrain,ytrain,xtest,ytest):
    """Fit every estimator in the module-level ``models`` dict and report on it.

    For each model, in dict order: fit it on the training data, print a centred
    header with its name, predict on the test data, then print the confusion
    matrix and the classification report.

    Parameters
    ----------
    xtrain, ytrain
        Training inputs and labels passed to ``model.fit``.
    xtest, ytest
        Test inputs passed to ``model.predict`` and the labels the
        predictions are scored against.

    Returns
    -------
    list
        Test accuracy of each model, rounded to 2 decimals, in the same order
        as ``models``.
    """
    def _fit_and_report(label, estimator):
        # Train, print the report for one model, and hand back its rounded accuracy.
        estimator.fit(xtrain, ytrain)
        print(label.center(40), '\n', ('=' * 25).center(50))
        predictions = estimator.predict(xtest)
        print(confusion_matrix(ytest, predictions))
        print(classification_report(ytest, predictions))
        return round(accuracy_score(ytest, predictions), 2)

    return [_fit_and_report(label, estimator) for label, estimator in models.items()]

In [None]:
# Train and evaluate every model on the TF-IDF features; keep the accuracies.
accuracy_tf = model_build(
    xtrain=xtraintf,
    ytrain=ytrain,
    xtest=xtesttf,
    ytest=ytest,
)


###### After vectorizing the dataset with the TF-IDF vectorizer and building the models:
- we get better vectors with the TF-IDF vectorizer
- all the models scored above 80% accuracy, with Logistic Regression the highest at 90%


### Conclusion

##### Logistic Regression is the best of the models tried when coupled with the TF-IDF vectorizer

In [None]:
# Try the trained model on a review typed in by the user.
# The text goes through the same cleaning step as the training reviews.
msg = TokenizeandClean(input('Enter the review :'))

# Vectorize the cleaned review with the already-fitted TF-IDF vectorizer.
transformed_review = tf.transform([msg])

# Predict the encoded sentiment with the logistic regression model.
ypredict = models['logistic'].predict(transformed_review)

# Apply the inverse transform to map the encoded prediction back to its sentiment label.
lab_encode.inverse_transform(ypredict).item()

