In [None]:
# Fetch the NLTK resources this notebook relies on. Each call is a no-op
# (apart from a log line) when the package is already present locally.
import nltk
nltk.download('punkt') # tokenizer models for sent_tokenize / word_tokenize
nltk.download('stopwords') # stop-word lists for nltk.corpus.stopwords
nltk.download('wordnet') # for WordNetLemmatizer
# NOTE: the return value of the last call is what the cell displays (True).

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Setting random seed (passed as random_state to train_test_split below so
# the train/test partition is reproducible)
seed = 123
# Data manipulation/analysis
import numpy as np
import pandas as pd
# Data partitioning
from sklearn.model_selection import train_test_split
# Text preprocessing/analysis
import re
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
# Visualisation
import matplotlib.pyplot as plt
# IPython magic: render figures inline in the notebook output
%matplotlib inline
import seaborn as sns
# Apply a notebook-wide seaborn theme (white grid, larger "talk" scaling)
sns.set(style="whitegrid", context='talk')

In [None]:
# Load the labelled movie reviews and report the table's dimensions
sample = pd.read_csv('/movie_data.csv')
print("{} rows and {} columns".format(*sample.shape))
# Preview the first few records
sample.head()

50000 rows and 2 columns


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [None]:
# Class balance check: number of reviews per sentiment label
sample.loc[:, 'sentiment'].value_counts()

1    25000
0    25000
Name: sentiment, dtype: int64

In [None]:
# Hold out 5,000 reviews for testing; stratifying on the label keeps the
# class proportions identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    sample['review'],
    sample['sentiment'],
    test_size=5000,
    random_state=seed,
    stratify=sample['sentiment'],
)
# Re-attach each label to its review (rows are aligned on the shared index)
train = pd.concat([X_train, y_train], axis='columns')
test = pd.concat([X_test, y_test], axis='columns')
# Report the size and label distribution of each partition
print("Train: {} rows and {} columns".format(*train.shape))
print(train['sentiment'].value_counts(), end="\n\n")
print("Test: {} rows and {} columns".format(*test.shape))
print(test['sentiment'].value_counts())

Train: 45000 rows and 2 columns
1    22500
0    22500
Name: sentiment, dtype: int64

Test: 5000 rows and 2 columns
1    2500
0    2500
Name: sentiment, dtype: int64


In [None]:
# Peek at the first five training rows (note the shuffled original index)
train.head(n=5)

Unnamed: 0,review,sentiment
26436,SPOILER WARNING: There are some minor spoilers...,0
21565,"Fans of the Pink Panther, Naked Gun, or Get Sm...",1
28582,Daffy Duck has signs hanging from every inch o...,1
29011,A woman borough a boy to this world and was al...,1
34450,Galaxy Express 999 (Ginga tetsudô Three-Nine)....,1


In [None]:
# Concatenate every training review into a single space-separated string
train_string = " ".join(X_train)
# Naive tokenisation: break train_string apart wherever whitespace occurs
splits = train_string.split()
# Show the first 101 characters of the corpus, then the first 18 tokens
print(f"***** Extract of train_string ***** \n{train_string[:101]}", "\n")
print(f"***** Extract of splits ***** \n{splits[:18]}\n")

***** Extract of train_string ***** 

***** Extract of splits ***** 



In [None]:
# Total number of whitespace-delimited tokens versus the number of distinct ones
print(
    f"Number of strings: {len(splits)}",
    f"Number of unique strings: {len(set(splits))}",
    sep="\n",
)

Number of strings: 10405680
Number of unique strings: 410467


In [None]:
# Tally how often each whitespace-delimited token occurs in the training corpus
freq_splits = FreqDist(splits)
# Report the ten highest-frequency tokens together with their counts
print("10 most common strings\n{}".format(freq_splits.most_common(10)), "\n")


10 most common strings
[('the', 512507), ('a', 276387), ('and', 271920), ('of', 255456), ('to', 235730), ('is', 182758), ('in', 152963), ('I', 119013), ('that', 114050), ('this', 102230)] 



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer, constructed with all-default settings (no arguments)
count = CountVectorizer()

In [None]:
# Independent NumPy copy of the review texts (all rows of `sample`, not just train)
docs = sample['review'].to_numpy(copy=True)

In [None]:
# Inspect the raw text of the third document (index 2) before vectorizing
docs[2]

'***SPOILER*** Do not read this, if you think about watching that movie, although it would be a waste of time. (By the way: The plot is so predictable that it does not make any difference if you read this or not anyway)<br /><br />If you are wondering whether to see "Coyote Ugly" or not: don\'t! It\'s not worth either the money for the ticket or the VHS / DVD. A typical "Chick-Feel-Good-Flick", one could say. The plot itself is as shallow as it can be, a ridiculous and uncritical version of the American Dream. The young good-looking girl from a small town becoming a big success in New York. The few desperate attempts of giving the movie any depth fail, such as the "tragic" accident of the father, the "difficulties" of Violet\'s relationship with her boyfriend, and so on. McNally (Director) tries to arouse the audience\'s pity and sadness put does not have any chance to succeed in this attempt due to the bad script and the shallow acting. Especially Piper Perabo completely fails in conv

In [None]:
# Fit the vectorizer on `docs` and transform them in a single call; the
# resulting document-term counts are kept in `bag`.
bag = count.fit_transform(docs)

In [None]:
# `count.vocabulary` (no trailing underscore) is only the constructor argument,
# which is None here. The term -> column-index mapping learned by
# fit_transform is stored in `count.vocabulary_`.
print(f"Vocabulary size: {len(count.vocabulary_)}")
# Show a handful of entries rather than dumping the whole mapping
print(dict(list(count.vocabulary_.items())[:10]))

None


In [None]:
# `bag` has one row per document and one column per vocabulary term. Calling
# .toarray() on the whole matrix would allocate a dense array of that full
# size and can exhaust memory, so report the shape and densify only the
# first few rows for inspection.
print(f"Document-term matrix shape: {bag.shape}")
print(bag[:5].toarray())