In [0]:
import numpy as np
import pandas as pd

In [0]:
text = '''Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.
They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.'''

In [0]:
ascii_chars = [ord(char) for char in text]
ascii_chars[:5]

[77, 114, 46, 32, 97]

In [0]:
len(text)

262

**NLTK Package**

In [0]:
import nltk

In [0]:
tokens = nltk.word_tokenize(text)

In [0]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
tokens[:5]

['Mr.', 'and', 'Mrs.', 'Dursley', ',']

In [0]:
from nltk import tokenize

In [0]:
sent_tokens = tokenize.sent_tokenize(text)
nltk.sent_tokenize(text)
sent_tokens

['Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.',
 'They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.']

In [0]:
from nltk.tokenize import MWETokenizer

In [0]:
tokenizer = MWETokenizer()
tokenizer.add_mwe(('in', 'spite', 'of'))
tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())

['In',
 'a',
 'little',
 'or',
 'a',
 'little',
 'bit',
 'or',
 'a',
 'lot',
 'in_spite_of']

**PoS tags for English**
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [0]:
# nltk.pos_tag(tokens[10:20])
nltk.pos_tag(tokens[9:20])

[('Privet', 'NNP'),
 ('Drive', 'NNP'),
 (',', ','),
 ('were', 'VBD'),
 ('proud', 'JJ'),
 ('to', 'TO'),
 ('say', 'VB'),
 ('that', 'IN'),
 ('they', 'PRP'),
 ('were', 'VBD'),
 ('perfectly', 'RB')]

In [0]:
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
syn = wordnet.synsets("building")

print(syn)
print(syn[0].name())
print(syn[0].definition())
print(syn[0].examples())
 
print(syn[1].name())
print(syn[1].definition())

[Synset('building.n.01'), Synset('construction.n.01'), Synset('construction.n.07'), Synset('building.n.04'), Synset('construct.v.01'), Synset('build_up.v.02'), Synset('build.v.03'), Synset('build.v.04'), Synset('build.v.05'), Synset('build.v.06'), Synset('build.v.07'), Synset('build.v.08'), Synset('build_up.v.04'), Synset('build.v.10')]
building.n.01
a structure that has a roof and walls and stands more or less permanently in one place
['there was a three-story building on the corner', 'it was an imposing edifice']
construction.n.01
the act of constructing something


In [0]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
	stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [0]:
freq_dist = nltk.FreqDist(tokens)
freq_dist.most_common(10)

[(',', 5),
 ('were', 3),
 ('to', 2),
 ('they', 2),
 ('you', 2),
 ('.', 2),
 ('’', 2),
 ('Mr.', 1),
 ('and', 1),
 ('Mrs.', 1)]

In [0]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [0]:
wiki_text = 'A lemma is a word that stands at the head of a definition in a dictionary. All the head words in a dictionary are lemmas'
wiki_tokens = nltk.word_tokenize(wiki_text)
for word in wiki_tokens:
    print ("{0:15}{1:15}".format(word, wordnet_lemmatizer.lemmatize(word)))

A              A              
lemma          lemma          
is             is             
a              a              
word           word           
that           that           
stands         stand          
at             at             
the            the            
head           head           
of             of             
a              a              
definition     definition     
in             in             
a              a              
dictionary     dictionary     
.              .              
All            All            
the            the            
head           head           
words          word           
in             in             
a              a              
dictionary     dictionary     
are            are            
lemmas         lemma          


In [0]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

In [0]:
for word in wiki_tokens:
    print ("{0:15}{1:15}".format(word, stemmer.stem(word)))

A              a              
lemma          lemma          
is             is             
a              a              
word           word           
that           that           
stands         stand          
at             at             
the            the            
head           head           
of             of             
a              a              
definition     definit        
in             in             
a              a              
dictionary     dictionari     
.              .              
All            all            
the            the            
head           head           
words          word           
in             in             
a              a              
dictionary     dictionari     
are            are            
lemmas         lemma          


**Data Vectorizing**

In [0]:
from google.colab import files
uploaded = files.upload()

Saving imdb_sampled.csv to imdb_sampled.csv


In [0]:
df = pd.read_csv('imdb_sampled.csv')

In [0]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,48366,train,"Not sure why it doesn't play in Peoria, appare...",pos,8530_9.txt
1,43357,train,A complex story laid on the background of part...,pos,4021_7.txt
2,40357,train,Beat a path to this important documentary that...,pos,1321_10.txt
3,37230,train,It's rare that I feel a need to write a review...,neg,9759_2.txt
4,27703,train,"I like to keep my reviews short and simple, bu...",neg,12433_3.txt


In [0]:
df['review'][0]

'Not sure why it doesn\'t play in Peoria, apparently, but this is a very funny, clever British comedy. It\'s set at the end of the "swinging sixties". Peter Sellars is fantastic as the rich, forty-something serial womaniser. The perfectly delectable Goldie Hawn, playing a 19 year American girl in London, is, initially, Sellars\' "catch of the day". But the urbane TV food critic can\'t stop himself from falling for the dizzy American blond.<br /><br />Humour, pathos, great script, strong performances from the leads and supporting caste.<br /><br />It\'s a great film, and the best gag is the very last line.<br /><br />Try it, you\'ll like it.'

In [0]:
corpus = " ".join([x for x in df['review']])


In [0]:
corpus_tokens = nltk.word_tokenize(corpus)

In [0]:
freq_dist = nltk.FreqDist(corpus_tokens)
freq_dist.most_common(15)

[('the', 56925),
 (',', 54445),
 ('.', 46610),
 ('a', 30925),
 ('and', 30682),
 ('of', 28584),
 ('to', 26245),
 ('is', 21456),
 ('/', 20084),
 ('>', 20082),
 ('<', 20063),
 ('br', 20040),
 ('in', 17267),
 ('I', 16314),
 ('it', 15152)]

In [0]:
from bs4 import BeautifulSoup
cleantext = BeautifulSoup("<html>What a beautiful website!</html>", "lxml").text
cleantext

'What a beautiful website!'

In [0]:
df['review_clean'] = df['review'].apply(lambda x: BeautifulSoup(x.lower(), "lxml").text)

In [0]:
corpus_cl = " ".join([x for x in df['review_clean']])
corpus_cl_tokens = nltk.word_tokenize(corpus_cl)
freq_dist = nltk.FreqDist(corpus_cl_tokens)
freq_dist.most_common(15)

[('the', 64760),
 (',', 54443),
 ('.', 46013),
 ('a', 32062),
 ('and', 31973),
 ('of', 28901),
 ('to', 26557),
 ('is', 21677),
 ('it', 18513),
 ('in', 18287),
 ('i', 16613),
 ('this', 14767),
 ('that', 14107),
 ("'s", 12014),
 ('was', 9841)]

In [0]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file,review_clean
0,48366,train,"Not sure why it doesn't play in Peoria, appare...",pos,8530_9.txt,"not sure why it doesn't play in peoria, appare..."
1,43357,train,A complex story laid on the background of part...,pos,4021_7.txt,a complex story laid on the background of part...
2,40357,train,Beat a path to this important documentary that...,pos,1321_10.txt,beat a path to this important documentary that...
3,37230,train,It's rare that I feel a need to write a review...,neg,9759_2.txt,it's rare that i feel a need to write a review...
4,27703,train,"I like to keep my reviews short and simple, bu...",neg,12433_3.txt,"i like to keep my reviews short and simple, bu..."


In [0]:
df['type'].value_counts()

train    2500
test     2500
Name: type, dtype: int64

In [0]:
train = df.loc[df['type'] == 'train']
test = df.loc[df['type'] == 'test']

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
count_vec = CountVectorizer()

In [0]:
count_vec = count_vec.fit(train['review_clean'])

In [0]:
transformed_train = count_vec.transform(train['review_clean'])

In [0]:
transformed_train = count_vec.fit_transform(train['review_clean'])

In [0]:
type(transformed_train)

scipy.sparse.csr.csr_matrix

In [0]:
transformed_train.shape

(2500, 28528)

In [0]:
count_vec.get_feature_names()[-15:]

['zooming',
 'zooms',
 'zoot',
 'zorak',
 'zorie',
 'zorrilla',
 'zu',
 'zulu',
 'zulus',
 'zuniga',
 'zwartboek',
 'zwick',
 'zx81',
 'zã',
 'ã¼ber']

In [0]:
count_vec = CountVectorizer(lowercase=True, stop_words='english', strip_accents='unicode', min_df = 50, max_df=1000, max_features=500)

In [0]:
count_vec = count_vec.fit(train['review_clean'])
transformed_train = count_vec.transform(train['review_clean'])

In [0]:
transformed_train.shape

(2500, 500)

In [0]:
count_vec.get_feature_names()[:10]

['absolutely',
 'act',
 'actor',
 'age',
 'american',
 'attempt',
 'audience',
 'awful',
 'based',
 'beautiful']

In [0]:
transformed_train.toarray()[0]

array([0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [0]:
'american' in train['review_clean'][0]

True

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(max_features = 300)

In [0]:
tfidf_vec = tfidf_vec.fit(train['review_clean'])

In [0]:
train_tfidf = tfidf_vec.transform(train['review_clean'])

In [0]:
train_tfidf.toarray()[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.1008419 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.07148209, 0.0824425 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.12980755, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.13306956,
       0.        , 0.09657877, 0.        , 0.        , 0.        ,
       0.        , 0.16872522, 0.        , 0.        , 0.        ,
       0.1673935 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.14412194, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.13621194,
       0.        , 0.        , 0.        , 0.        , 0.     

In [0]:
tfidf_vec.get_feature_names()

['10',
 'about',
 'acting',
 'action',
 'actors',
 'actually',
 'after',
 'again',
 'all',
 'almost',
 'also',
 'although',
 'always',
 'am',
 'an',
 'and',
 'another',
 'any',
 'anyone',
 'anything',
 'are',
 'around',
 'as',
 'at',
 'audience',
 'away',
 'back',
 'bad',
 'be',
 'beautiful',
 'because',
 'been',
 'before',
 'being',
 'believe',
 'best',
 'better',
 'between',
 'big',
 'bit',
 'black',
 'book',
 'both',
 'budget',
 'but',
 'by',
 'can',
 'cast',
 'character',
 'characters',
 'come',
 'comedy',
 'comes',
 'could',
 'course',
 'day',
 'did',
 'didn',
 'different',
 'director',
 'do',
 'does',
 'doesn',
 'don',
 'done',
 'down',
 'dvd',
 'each',
 'effects',
 'end',
 'ending',
 'enough',
 'episode',
 'especially',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'fact',
 'family',
 'far',
 'feel',
 'few',
 'film',
 'films',
 'find',
 'first',
 'for',
 'found',
 'from',
 'fun',
 'funny',
 'get',
 'gets',
 'girl',
 'give',
 'go',
 'goes',
 'going',
 'good',
 'got',
 

In [0]:
train_tfidf_array = train_tfidf.toarray()

In [0]:
type(train_tfidf_array)

numpy.ndarray

In [0]:
np.max(train_tfidf_array)

0.7930853274278427

In [0]:
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file,review_clean
0,48366,train,"Not sure why it doesn't play in Peoria, appare...",pos,8530_9.txt,"not sure why it doesn't play in peoria, appare..."
1,43357,train,A complex story laid on the background of part...,pos,4021_7.txt,a complex story laid on the background of part...
2,40357,train,Beat a path to this important documentary that...,pos,1321_10.txt,beat a path to this important documentary that...
3,37230,train,It's rare that I feel a need to write a review...,neg,9759_2.txt,it's rare that i feel a need to write a review...
4,27703,train,"I like to keep my reviews short and simple, bu...",neg,12433_3.txt,"i like to keep my reviews short and simple, bu..."


In [0]:
df['label'].value_counts()

pos    2511
neg    2489
Name: label, dtype: int64

In [0]:
from sklearn.preprocessing import LabelEncoder

In [0]:
label_enc = LabelEncoder()

In [0]:
label_enc = label_enc.fit(df['label'])

In [0]:
Y_train = label_enc.transform(train['label'])
Y_test = label_enc.transform(test['label'])

In [0]:
Y_train[:5]

array([1, 1, 1, 0, 0])

In [0]:
train.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file,review_clean
0,48366,train,"Not sure why it doesn't play in Peoria, appare...",pos,8530_9.txt,"not sure why it doesn't play in peoria, appare..."
1,43357,train,A complex story laid on the background of part...,pos,4021_7.txt,a complex story laid on the background of part...
2,40357,train,Beat a path to this important documentary that...,pos,1321_10.txt,beat a path to this important documentary that...
3,37230,train,It's rare that I feel a need to write a review...,neg,9759_2.txt,it's rare that i feel a need to write a review...
4,27703,train,"I like to keep my reviews short and simple, bu...",neg,12433_3.txt,"i like to keep my reviews short and simple, bu..."


In [0]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

In [0]:
X_train = train_tfidf_array
X_test = tfidf_vec.transform(test['review_clean']).toarray()

In [0]:
X_test[0]

array([0.        , 0.13754444, 0.06258271, 0.        , 0.        ,
       0.14383747, 0.        , 0.07367021, 0.04114035, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.05046396, 0.        , 0.0616752 , 0.        , 0.        ,
       0.        , 0.        , 0.17885765, 0.08251281, 0.        ,
       0.        , 0.07118599, 0.        , 0.03892757, 0.        ,
       0.        , 0.11516334, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.03329576,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.13189255, 0.        , 0.        , 0.        ,
       0.05803834, 0.06899018, 0.        , 0.05854284, 0.        ,
       0.        , 0.        , 0.08717125, 0.        , 0.06816406,
       0.        , 0.        , 0.        , 0.        , 0.05249

In [0]:
model = LinearSVC()
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)

In [0]:
len(Y_test)

2500

In [0]:
acc = accuracy_score(Y_test, y_pred)

print("Accuracy on the IMDB dataset: {:.2f}".format(acc*100))

Accuracy on the IMDB dataset: 77.96


**Word Embeddings**



1.   Word2Vec
  ![](https://fasttext.cc/img/cbo_vs_skipgram.png)
2.   FastText
3.   Any you can think of



