In [1]:
import re
import os

import numpy as np
import pandas as pd

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from multiprocessing import Pool

from collections import Counter
from itertools import chain

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from jupyterthemes import jtplot
import pickle

ModuleNotFoundError: No module named 'seaborn'

In [320]:
jtplot.style(theme = "monokai", context = "notebook", ticks = True)

In [321]:
def read_text(files, print_n=1000):
    '''
        This function reads text files from a directory and creates a list of documents(corpus).
    '''
    docs = []
    for idx, f in enumerate(files):
        if idx % print_n == 0:
            print("Reading file number : {}".format(idx))
        with open(f, "r") as doc:
            text = doc.read()
        docs.append(text)
    return docs

In [322]:
def treebank_to_wordnet_pos_converter(treebank_pos_tag):
    '''
        This function converts tree bank POS tags into wordnet compatible POS tags.
    '''
    if treebank_pos_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_pos_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_pos_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [323]:
def preprocess_text(corpus, print_n=1000):
    '''
        This function performs unstructured data preprocessing.
    '''
    l_stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    count_docs = len(corpus)
    processed_corpus = []
    for idx, doc in enumerate(corpus):
        if idx % print_n == 0:
            print("Processing Document : {} in {}".format(idx, count_docs))
        # 1. Convert text to lower case
        doc = doc.lower()

        # 2. Tokenize documents
        doc_tokenized = doc.split()

        # 3. POS Tagging
        l_token_pos_tags = pos_tag(doc_tokenized)

        # 4. Remove Stop Words
        doc_token_pos = [token_pos_tuple for token_pos_tuple in l_token_pos_tags
                         if token_pos_tuple[0] not in l_stop_words]

        # 5. Remove characters other than alphabet
        doc_token_pos = [(re.sub(r'[^a-zA-Z]', '', token), pos)
                         for token, pos in doc_token_pos]
        
        doc_token_pos = [(token, pos)
                         for token, pos in doc_token_pos if token!='']
        
        # 6. Lemmatization - Convert each token into lexicon
        doc_token_pos = [(token, treebank_to_wordnet_pos_converter(pos))
                         for token, pos in doc_token_pos]

        doc_token_lemmatized = [lemmatizer.lemmatize(
            token, pos) for token, pos in doc_token_pos]
        
        processed_corpus.append(doc_token_lemmatized)

    return processed_corpus

In [324]:
def return_frequent_words(corpus, threshold_percentile=0.9):
    df_term_frequency = pd.DataFrame.from_dict(
        Counter(chain(*corpus)), orient="index")
    df_term_frequency.reset_index(inplace=True)
    df_term_frequency.columns = ["token", "frequency"]
    df_term_frequency = df_term_frequency.sort_values(
        by="frequency", ascending=False)
    df_term_frequency["frequency_ratio"] = df_term_frequency["frequency"] / \
        df_term_frequency["frequency"].sum()
    df_term_frequency["cum_frequency_ratio"] = df_term_frequency["frequency_ratio"].cumsum()

    df_term_frequency = df_term_frequency.loc[df_term_frequency["cum_frequency_ratio"] < threshold_percentile]
    frequent_tokens = df_term_frequency.loc[:, "token"].unique()

    df_term_frequency["idx"] = np.arange(1, df_term_frequency.shape[0]+1, 1)

    dict_token_idx_mapping = dict(
        zip(df_term_frequency["token"].values, df_term_frequency["idx"].values))
    return frequent_tokens, dict_token_idx_mapping

In [341]:
def prepare_data(print_n=1000):

    # 1. Read Raw Text
    pos_files = [os.path.join("data/aclImdb/train/pos", f)
                 for f in os.listdir("data/aclImdb/train/pos")]
    neg_files = [os.path.join("data/aclImdb/train/neg", f)
                 for f in os.listdir("data/aclImdb/train/neg")]

    pos_reviews = read_text(pos_files, print_n)
    neg_reviews = read_text(neg_files, print_n)

    # 2. Preprocess Text
    pos_processed = preprocess_text(pos_reviews, print_n)
    neg_processed = preprocess_text(neg_reviews, print_n)

    processed_corpus = pos_processed+neg_processed

    # 3. Find Frequent Tokens
    high_occurrence_tokens, token_idx_map = return_frequent_words(
        processed_corpus)

    df_pos = pd.DataFrame()
    df_pos["review"] = pos_processed
    df_pos["sentiment"] = 1
    df_neg = pd.DataFrame()
    df_neg["review"] = neg_processed
    df_neg["sentiment"] = 0

    # 3. Create Dataframe with reviews
    df_reviews = pd.concat([df_pos, df_neg], axis=0)
    df_reviews["review_frequent_tokens"] = df_reviews["review"].apply(
        lambda review: [token for token in review if token in high_occurrence_tokens])

    # 4. Create index number per token
    df_reviews["review_frequent_tokens_idx"] = df_reviews["review_frequent_tokens"].apply(
        lambda review: [token_idx_map[token] for token in review])

    return df_reviews, high_occurrence_tokens, token_idx_map

In [342]:
df_train_reviews, high_occurrence_tokens, token_idx_map = prepare_data(print_n=1000)

Reading file number : 0
Reading file number : 1000
Reading file number : 2000
Reading file number : 3000
Reading file number : 4000
Reading file number : 5000
Reading file number : 6000
Reading file number : 7000
Reading file number : 8000
Reading file number : 9000
Reading file number : 10000
Reading file number : 11000
Reading file number : 12000
Reading file number : 0
Reading file number : 1000
Reading file number : 2000
Reading file number : 3000
Reading file number : 4000
Reading file number : 5000
Reading file number : 6000
Reading file number : 7000
Reading file number : 8000
Reading file number : 9000
Reading file number : 10000
Reading file number : 11000
Reading file number : 12000
Processing Document : 0 in 12500
Processing Document : 1000 in 12500
Processing Document : 2000 in 12500
Processing Document : 3000 in 12500
Processing Document : 4000 in 12500
Processing Document : 5000 in 12500
Processing Document : 6000 in 12500
Processing Document : 7000 in 12500
Processing Do

In [348]:
df_train_reviews.to_pickle("data/processed_data/train_reviews_processed.pkl")



In [346]:
df_train_reviews.to_hdf("data/processed_data/train_reviews_processed.h5", key = "df")


with open("data/processed_data/high_occurrence_tokens.pkl", "wb") as f:
    pickle.dump(high_occurrence_tokens, f)
    
with open("data/processed_data/token_idx_map.pkl", "wb") as f:
    pickle.dump(token_idx_map, f)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['review', 'review_frequent_tokens', 'review_frequent_tokens_idx']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [356]:
with open("data/processed_data/token_idx_map.pkl", "rb") as f:
    c = pickle.load(f)

In [357]:
c

{'br': 1,
 'movie': 2,
 'film': 3,
 'one': 4,
 'make': 5,
 'like': 6,
 'see': 7,
 'get': 8,
 'good': 9,
 'time': 10,
 'character': 11,
 'go': 12,
 'watch': 13,
 'story': 14,
 'the': 15,
 'even': 16,
 'it': 17,
 'bad': 18,
 'would': 19,
 'really': 20,
 'think': 21,
 'well': 22,
 'show': 23,
 'scene': 24,
 'great': 25,
 'much': 26,
 'look': 27,
 'say': 28,
 'people': 29,
 'also': 30,
 'know': 31,
 'first': 32,
 'give': 33,
 'take': 34,
 'end': 35,
 'way': 36,
 'come': 37,
 'play': 38,
 'love': 39,
 'thing': 40,
 'find': 41,
 'could': 42,
 'seem': 43,
 'life': 44,
 'two': 45,
 'many': 46,
 'work': 47,
 'plot': 48,
 'want': 49,
 'never': 50,
 'little': 51,
 'actor': 52,
 'best': 53,
 'year': 54,
 'try': 55,
 'ever': 56,
 'still': 57,
 'man': 58,
 'this': 59,
 'i': 60,
 'part': 61,
 'something': 62,
 'back': 63,
 'im': 64,
 'use': 65,
 'lot': 66,
 'old': 67,
 'director': 68,
 'performance': 69,
 'real': 70,
 'funny': 71,
 'though': 72,
 'do': 73,
 'woman': 74,
 'new': 75,
 'acting': 76,
 'a

In [347]:
test= pd.read_hdf("data/processed_data/train_reviews_processed.h5")
test.head()

Unnamed: 0,review,sentiment,review_frequent_tokens,review_frequent_tokens_idx
0,"[movie, get, respect, sure, lot, memorable, qu...",1,"[movie, get, respect, sure, lot, memorable, qu...","[2, 8, 888, 167, 66, 794, 1646, 760, 1201, 630..."
1,"[bizarre, horror, movie, fill, famous, face, s...",1,"[bizarre, horror, movie, fill, famous, face, s...","[1006, 111, 2, 629, 693, 228, 661, 272, 169, 1..."
2,"[solid, unremarkable, film, matthau, einstein,...",1,"[solid, film, wonderful, favorite, part, thing...","[999, 3, 308, 385, 61, 40, 19, 5, 12, 36, 7, 2..."
3,"[strange, feeling, sit, alone, theater, occupy...",1,"[strange, feeling, sit, alone, theater, parent...","[548, 441, 355, 511, 506, 631, 146, 345, 6, 21..."
4,"[probably, already, know, now, additional, epi...",1,"[probably, already, know, now, episode, never,...","[157, 387, 31, 378, 184, 50, 708, 310, 118, 13..."
