# Dataset Description

The IMDB dataset contains 50K movie reviews for natural language processing or text analytics. In this project we provide a set of 25,000 highly polar reviews for training and 25,000 for testing. The goal is to predict the number of positive and negative reviews using either classification or deep learning algorithms.

In [30]:
import numpy as np
import pandas as pd 

In [31]:
# Load the IMDB reviews CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("IMDB-Dataset.csv")

In [32]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [33]:
data.isnull().any()

review       False
sentiment    False
dtype: bool

In [34]:
data.shape

(50000, 2)

In [35]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [7]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [36]:
# Drop every row that contains a missing value, modifying `data` in place.
data.dropna(axis="index", inplace=True)

# Tokenization

In [None]:
!pip install wordcloud

In [None]:
!pip install spacy

In [None]:
!pip install bs4

In [None]:
pd.__version__

In [None]:
!pip install textblob

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

In [11]:
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob, Word 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from bs4 import BeautifulSoup

In [12]:
import nltk

# Fetch the NLTK stopword corpus; when it is already present the call only
# reports that the package is up to date.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\R418028\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Shared Toktok tokenizer instance, read as a global by removing_stopwords().
tokenizer = ToktokTokenizer()

# English stopwords from the NLTK corpus.
# NOTE: this rebinds the name `stopwords` (imported earlier from nltk.corpus as
# the corpus reader) to the resulting collection of words, which is why the
# right-hand side goes through the fully qualified `nltk.corpus.stopwords`.
stopwords = nltk.corpus.stopwords.words("english")

In [37]:
#Removing the noisy text
def noiseremoval_text(text):
    """Strip HTML markup and square-bracketed spans from a piece of text.

    Args:
        text: Raw review text, possibly containing HTML such as ``<br />``.

    Returns:
        The text extracted by BeautifulSoup's ``get_text()`` with every
        ``[...]`` span (brackets included) removed.
    """
    # Parse with the built-in HTML parser and keep only the textual content.
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    # Raw string literal: in a plain literal "\[" and "\]" are invalid escape
    # sequences, which Python reports with a warning. The pattern is unchanged.
    text = re.sub(r'\[[^]]*\]', '', text)
    return text

In [38]:
# Clean every review with noiseremoval_text; the `review` column is overwritten.
data["review"] = data["review"].map(noiseremoval_text)



In [39]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Stemming

In [40]:
def stemmer(text):
    """Apply Porter stemming to every whitespace-separated word of ``text``.

    Args:
        text: The text to stem.

    Returns:
        The stemmed words joined by single spaces. Because the input is split
        with ``str.split()``, any run of whitespace collapses to one space.
    """
    # Use the PorterStemmer class imported at the top of the notebook rather
    # than the undocumented `nltk.porter` attribute path.
    ps = PorterStemmer()
    return ' '.join(ps.stem(word) for word in text.split())

In [41]:
# Stem every review with stemmer(); the `review` column is overwritten.
data["review"] = data["review"].map(stemmer)

In [42]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


# Removing Stopwords

In [61]:
#Removing stopwords
def removing_stopwords(text, is_lower_case=False):
    """Remove stopwords from ``text``.

    Tokenizes with the module-level ``tokenizer`` and drops every token whose
    lower-cased form appears in the module-level ``stopwords`` collection.

    Args:
        text: The text to filter.
        is_lower_case: Kept for interface compatibility. It has no effect:
            tokens are always lower-cased for the comparison.

    Returns:
        The remaining tokens (original casing preserved) joined by single
        spaces.
    """
    # A set gives constant-time membership tests instead of scanning the
    # stopword list once per token.
    stopword_set = set(stopwords)
    # Tokenize, then trim stray whitespace around each token.
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    kept_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(kept_tokens)

In [62]:
# Filter stopwords out of every review; the `review` column is overwritten.
data["review"] = data["review"].map(removing_stopwords)

In [63]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


# Train test split

In [64]:
# Positional hold-out split of the review texts: the first 30,000 rows are the
# training reviews and all remaining rows are the test reviews.
# NOTE(review): the dataset description at the top of the notebook mentions
# 25,000 training and 25,000 test reviews; confirm which split is intended.
train_reviews_data = data["review"].iloc[:30_000]
test_review_data = data["review"].iloc[30_000:]

# Bag of words