In [2]:
%load_ext jupyter_black

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re, string, unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import os

print(os.listdir("/home/zaki/Desktop/AI"))
import warnings

warnings.filterwarnings("ignore")

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black
['Sentiment Analysis']


In [3]:
imdb_data = pd.read_csv("IMDB Dataset.csv")
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
imdb_data["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
# split the data sets into two train and test , train takes the first 40k and the test the remianing recs which are 10k
train_reviews = imdb_data.review[:40000]
train_sentiments = imdb_data.sentiment[:40000]
test_reviews = imdb_data.review[40000:]
test_sentiments = imdb_data.sentiment[40000:]
print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


In [7]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words("english")

In [8]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()


def remove_between_square_brackets(text):
    return re.sub("\[[^]]*\]", "", text)


def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text


imdb_data["review"] = imdb_data["review"].apply(denoise_text)

In [9]:
def remove_special_characters(text, remove_digits=True):
    pattern = r"[^a-zA-z0-9\s]"
    text = re.sub(pattern, "", text)
    return text


imdb_data["review"] = imdb_data["review"].apply(remove_special_characters)

In [10]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = " ".join([ps.stem(word) for word in text.split()])
    return text


imdb_data["review"] = imdb_data["review"].apply(simple_stemmer)

In [11]:
stop = set(stopwords.words("english"))
print(stop)


def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [
            token for token in tokens if token.lower() not in stopword_list
        ]
    filtered_text = " ".join(filtered_tokens)
    return filtered_text


# Apply function on review column
imdb_data["review"] = imdb_data["review"].apply(remove_stopwords)

{'this', 'themselves', 'these', 'such', 'can', 'mightn', 'have', 'aren', 'you', 'whom', 'at', 'during', 'm', "couldn't", "wasn't", 'are', 'from', 'who', 'of', 'itself', 'each', 'myself', 'should', 'll', "shan't", 'theirs', 'a', 'than', 'mustn', 'were', 'up', 'will', "weren't", 'no', 'again', 'my', 'what', 'or', 'more', 'doing', 'her', 'did', 'over', 'be', 'our', 'into', 'until', 'those', 'so', 'do', 'below', 'that', 'by', "hadn't", 'hers', 'most', "hasn't", 'been', 'under', 'in', 'very', 'how', 'his', 'further', 'same', 'your', 'yours', 'd', 'other', 'any', 'to', "didn't", 'don', 'both', 'on', "shouldn't", 'him', 'yourself', 'it', "it's", 'off', 'because', 're', 'nor', 'if', 'we', 'now', 'being', 'haven', 'shan', 't', 'ain', "you're", 'o', 'their', "mightn't", 'all', 'few', 'between', 'too', 'me', "doesn't", 'why', 'while', 'an', "won't", 'just', 'i', 'then', 'shouldn', 'yourselves', 'only', 'once', 'ourselves', "needn't", 'where', "you'll", 'the', 'and', 'hasn', 'own', 'had', 'having'

In [15]:
# normalized train reviews
norm_train_reviews = imdb_data.review[:40000]
norm_train_reviews[0]
# convert dataframe to string
norm_train_string=norm_train_reviews.to_string()
# Spelling correction using Textblob
norm_train_spelling=TextBlob(norm_train_string)
norm_train_spelling.correct()
norm_train_words=norm_train_spelling.words
norm_train_words

In [None]:
# Count vectorizer for bag of words
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1, 3))
# transformed train reviews
cv_train_reviews = cv.fit_transform(norm_train_reviews)
# transformed test reviews
cv_test_reviews = cv.transform(norm_test_reviews)

print("BOW_cv_train:", cv_train_reviews.shape)
print("BOW_cv_test:", cv_test_reviews.shape)
# vocab=cv.get_feature_names()-toget feature names

InvalidParameterError: The 'min_df' parameter of CountVectorizer must be a float in the range [0.0, 1.0] or an int in the range [1, inf). Got 0 instead.