# Importing Text corpus

In [3]:
from sklearn.datasets import load_files

# Read the IMDb training reviews; load_files returns the raw documents in
# `.data` and one integer label per document in `.target`.
# NOTE(review): len(text_train) reports 75000 while the test split reports
# 25000 -- presumably the unlabeled `unsup/` folder is being picked up as a
# third class in y_train. Confirm, and if so restrict the load with
# load_files(..., categories=['neg', 'pos']).
review_train = load_files('data/aclImdb/train')
text_train = review_train.data
y_train = review_train.target

len(text_train)

75000

In [5]:
# Read the IMDb test reviews the same way as the training split:
# raw documents in `.data`, integer labels in `.target`.
review_test = load_files('data/aclImdb/test')
text_test = review_test.data
y_test = review_test.target

len(text_test)

25000

In [6]:
# The reviews are bytes objects that still contain HTML line breaks;
# swap every "<br />" for a single space in both splits.

text_train = [review.replace(b"<br />", b" ") for review in text_train]
text_test = [review.replace(b"<br />", b" ") for review in text_test]

In [7]:
# Display one training review (a bytes object) as a spot check of the text
text_train[6]

b'Gloomy Sunday - Ein Lied von Liebe und Tod directed by Rolf Sch\xc3\xbcbel in 1999 is a romantic, absorbing, beautiful, and heartbreaking movie. It started like Jules and Jim; it ended as one of Agatha Christie\'s books, and in between it said something about love, friendship, devotion, jealousy, war, Holocaust, dignity, and betrayal, and it did better than The Black Book which is much more popular. It is not perfect, and it made me, a cynic, wonder in the end on the complexity of the relationships and sensational revelations, and who is who to whom but the movie simply overwhelmed me. Perfect or not, it is unforgettable. All four actors as the parts of the tragic not even a triangle but a rectangle were terrific. I do believe that three men could fell deeply for one girl as beautiful and dignified as Ilona in a star-making performance by young Hungarian actress Erica Marozs\xc3\xa1n and who would not? The titular song is haunting, sad, and beautiful, and no doubt deserves the movie 

# Bag of Words

In [9]:
# toy dataset

from sklearn.feature_extraction.text import CountVectorizer

bards_words = [
    'The fool doth think he is wise,',
    'but the wise man knows himself to be a fool',
]

# Build the vocabulary from the two toy sentences
vect = CountVectorizer()
vect.fit(bards_words)

print('Vocabulary Size: {}'.format(len(vect.vocabulary_)))
print('Vocabulary Content:\n{}'.format(vect.vocabulary_))

# Encode each sentence as one row of per-word counts over that vocabulary
bag_of_words = vect.transform(bards_words)
print('Bag of Words:\n{}'.format(bag_of_words.toarray()))

Vocabulary Size: 13
Vocabulary Content:
{'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}
Bag of Words:
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


In [11]:
# movie review dataset

# Learn the vocabulary and build the document-term matrix in a single pass:
# fit_transform(docs) returns the same matrix as fit(docs) followed by
# transform(docs), without tokenizing the whole corpus twice.
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)

# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use get_feature_names_out() instead.
feature_names = vect.get_feature_names()
print('Number of features: {}'.format(len(feature_names)))
print('First 20 features:\n{}'.format(feature_names[:20]))
print('Every 2000th feature:\n{}'.format(feature_names[::2000]))

Number of features: 124255
First 20 features:
['00', '000', '0000', '0000000000000000000000000000000001', '0000000000001', '000000001', '000000003', '00000001', '000001745', '00001', '0001', '00015', '0002', '0007', '00083', '000ft', '000s', '000th', '001', '002']
Every 2000th feature:
['00', '_require_', 'aideed', 'announcement', 'asteroid', 'banquière', 'besieged', 'bollwood', 'btvs', 'carboni', 'chcialbym', 'clotheth', 'consecration', 'cringeful', 'deadness', 'devagan', 'doberman', 'duvall', 'endocrine', 'existent', 'fetiches', 'formatted', 'garard', 'godlie', 'gumshoe', 'heathen', 'honoré', 'immatured', 'interested', 'jewelry', 'kerchner', 'köln', 'leydon', 'lulu', 'mardjono', 'meistersinger', 'misspells', 'mumblecore', 'ngah', 'oedpius', 'overwhelmingly', 'penned', 'pleading', 'previlage', 'quashed', 'recreating', 'reverent', 'ruediger', 'sceme', 'settling', 'silveira', 'soderberghian', 'stagestruck', 'subprime', 'tabloids', 'themself', 'tpf', 'tyzack', 'unrestrained', 'videoed', 

In [20]:
# using min_df

# min_df=5 keeps only tokens that occur in at least five documents.
# fit_transform builds the vocabulary and the document-term matrix in one
# pass; the result equals fit() followed by transform() on the same corpus.
vect = CountVectorizer(min_df=5)
X_train = vect.fit_transform(text_train)

# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use get_feature_names_out() instead.
feature_names = vect.get_feature_names()
print('Number of features: {}'.format(len(feature_names)))
print('First 20 features:\n{}'.format(feature_names[:20]))
print('Every 2000th feature:\n{}'.format(feature_names[::2000]))

Number of features: 44532
First 20 features:
['00', '000', '001', '007', '00am', '00pm', '00s', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '100', '1000', '1001']
Every 2000th feature:
['00', 'anxiety', 'bevy', 'capitalists', 'compliance', 'deck', 'drilling', 'extinguished', 'gals', 'haute', 'ineffectually', 'knifed', 'malcomb', 'morrow', 'ott', 'policies', 'rebuffed', 'sadder', 'sir', 'strick', 'tinkles', 'uprising', 'wreaks']


# Stopwords

In [19]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Report how many entries scikit-learn's built-in English stop-word list has
print('Number of Stopwords: {}'.format(len(ENGLISH_STOP_WORDS)))

Number of Stopwords: 318


In [21]:
# Same min_df=5 vocabulary, additionally dropping the built-in English stop
# words. fit_transform builds vocabulary and matrix in one pass and returns
# the same matrix as fit() followed by transform() on the same corpus.
vect = CountVectorizer(min_df=5, stop_words='english')
X_train = vect.fit_transform(text_train)

# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use get_feature_names_out() instead.
feature_names = vect.get_feature_names()
print('Number of features: {}'.format(len(feature_names)))

Number of features: 44223


# tf-idf

In [27]:
# numpy is needed here for np.array; importing it in this cell keeps the
# cell runnable on a fresh kernel (Restart & Run All).
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the tf-idf model and transform the training reviews in one pass
# (same result as fit() followed by transform() on the same corpus).
vectorizer = TfidfVectorizer(min_df=5)
X_train = vectorizer.fit_transform(text_train)

# Largest tf-idf weight each term reaches in any single training document
max_value = X_train.max(axis=0).toarray().ravel()
# Term indices ordered from smallest to largest maximum tf-idf.
# (Previously assigned to the misspelled name `sorted_by_tdidf`, so the
# prints below raised NameError on a fresh kernel.)
sorted_by_tfidf = max_value.argsort()
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions use get_feature_names_out() instead.
feature_names = np.array(vectorizer.get_feature_names())

print('Features with lowest tfidf:\n{}'.format(feature_names[sorted_by_tfidf[:20]]))
print('Features with highest tfidf:\n{}'.format(feature_names[sorted_by_tfidf[-20:]]))

Features with lowest tfidf:
['the' 'and' 'of' 'to' 'this' 'is' 'it' 'in' 'that' 'but' 'for' 'with'
 'was' 'as' 'on' 'movie' 'not' 'one' 'be' 'have']
Features with highest tfidf:
['donegal' 'hotheaded' 'donatello' 'rylance' 'ruptured' 'téa' 'tz'
 'tyzack' 'domini' 'hpl' 'ryo' 'hoyts' 'asbestos' 'hoverboy' 'bunks'
 'housesitter' 'ruts' 'rw' 'russells' 'shipments']


In [28]:
import numpy as np

# Order the term indices by ascending inverse document frequency, so the
# terms with the lowest idf come first.
sorted_by_idf = vectorizer.idf_.argsort()

lowest_idf_terms = feature_names[sorted_by_idf[:100]]
print('Features with lowest idf:\n{}'.format(lowest_idf_terms))

Features with lowest idf:
['the' 'and' 'of' 'to' 'this' 'is' 'it' 'in' 'that' 'but' 'for' 'with'
 'was' 'as' 'on' 'movie' 'not' 'one' 'be' 'have' 'are' 'film' 'you' 'all'
 'at' 'an' 'by' 'from' 'so' 'like' 'who' 'there' 'they' 'his' 'if' 'out'
 'just' 'about' 'he' 'or' 'has' 'what' 'some' 'can' 'good' 'when' 'more'
 'up' 'time' 'very' 'even' 'only' 'no' 'see' 'would' 'my' 'story' 'really'
 'which' 'well' 'had' 'me' 'than' 'their' 'much' 'were' 'get' 'other' 'do'
 'been' 'most' 'also' 'into' 'don' 'her' 'first' 'great' 'how' 'made'
 'people' 'will' 'make' 'because' 'way' 'could' 'bad' 'we' 'after' 'them'
 'too' 'any' 'then' 'movies' 'watch' 'she' 'think' 'seen' 'acting' 'its'
 'characters']


# n-Grams (Bag-of-Words with More than 1 word)

In [7]:
# bigrams
# ngram_range=(2, 2): keep only pairs of tokens that follow each other

from sklearn.feature_extraction.text import CountVectorizer

bards_words = [
    'The fool doth think he is wise,',
    'but the wise man knows himself to be a fool',
]

vect = CountVectorizer(ngram_range=(2, 2))
vect.fit(bards_words)

print('Vocabulary Size: {}'.format(len(vect.vocabulary_)))
print('Vocabulary Content:\n{}'.format(vect.vocabulary_))

Vocabulary Size: 14
Vocabulary Content:
{'the fool': 9, 'fool doth': 3, 'doth think': 2, 'think he': 11, 'he is': 4, 'is wise': 6, 'but the': 1, 'the wise': 10, 'wise man': 13, 'man knows': 8, 'knows himself': 7, 'himself to': 5, 'to be': 12, 'be fool': 0}


In [4]:
# unigrams, bigrams and trigrams
# ngram_range=(1, 3) keeps every run of one to three consecutive tokens,
# not trigrams alone

vect = CountVectorizer(ngram_range=(1, 3))
vect.fit(bards_words)

print('Vocabulary Size: {}'.format(len(vect.vocabulary_)))
print('Vocabulary Content:\n{}'.format(vect.vocabulary_))

Vocabulary Size: 39
Vocabulary Content:
{'the': 25, 'fool': 8, 'doth': 5, 'think': 30, 'he': 11, 'is': 17, 'wise': 36, 'the fool': 26, 'fool doth': 9, 'doth think': 6, 'think he': 31, 'he is': 12, 'is wise': 18, 'the fool doth': 27, 'fool doth think': 10, 'doth think he': 7, 'think he is': 32, 'he is wise': 13, 'but': 2, 'man': 22, 'knows': 19, 'himself': 14, 'to': 33, 'be': 0, 'but the': 3, 'the wise': 28, 'wise man': 37, 'man knows': 23, 'knows himself': 20, 'himself to': 15, 'to be': 34, 'be fool': 1, 'but the wise': 4, 'the wise man': 29, 'wise man knows': 38, 'man knows himself': 24, 'knows himself to': 21, 'himself to be': 16, 'to be fool': 35}


# Stemming and Lemmatization

In [12]:
import spacy
import nltk

# NOTE(review): the 'en' shortcut name is only accepted by older spaCy
# releases; newer ones need a full model name such as 'en_core_web_sm' --
# confirm against the installed version.
en_nlp = spacy.load('en')
stemmer = nltk.stem.PorterStemmer()


def compare_normalization(doc):
    """Print the spaCy lemmas and the Porter stems of every token in `doc`.

    The text is tokenized with the spaCy pipeline `en_nlp`. Lemmas come
    from each token's `lemma_`; stems are produced by running `stemmer`
    on the lower-cased `norm_` form of each token.
    """
    tokens = en_nlp(doc)

    lemmas = [tok.lemma_ for tok in tokens]
    print('Lemmatization:\n{}'.format(lemmas))

    stems = [stemmer.stem(tok.norm_.lower()) for tok in tokens]
    print('Stemming:\n{}'.format(stems))


# The two adjacent literals are joined by Python into a single sentence
compare_normalization(
    u"Our meeting today was worse than yesterday, "
    "I'm scared of meeting the clients tomorrow."
)

Lemmatization:
['-PRON-', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', '-PRON-', 'be', 'scared', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
Stemming:
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', 'i', 'am', 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
