In [None]:
import numpy as np
import pandas as pd
import os
import logging
import string
import multiprocessing as mp

In [None]:
# Configure the root logger to emit INFO and above, so the progress messages
# logged by the pipeline cell below are shown.
logging.basicConfig(level=logging.INFO)

In [None]:
def read_text(files, encoding="utf-8"):
    '''
        Read ONE text file and return its whole contents as a single string.

        Parameters
        ----------
        files : str or path-like
            Path of the file to read. Despite the plural name this is a single
            path; callers map this function over a list of paths.
        encoding : str, default "utf-8"
            Encoding used to decode the file. It is passed explicitly so the
            result does not depend on the platform's locale default.

        Returns
        -------
        str
            The decoded contents of the file.

        Raises
        ------
        OSError
            If the file cannot be opened.
        UnicodeDecodeError
            If the file contents are not valid in ``encoding``.
    '''
    with open(files, "r", encoding=encoding) as doc:
        return doc.read()

In [None]:
# Per-process cache of the heavy NLP resources; filled lazily on first use so
# that every worker process loads the spaCy model once instead of once per call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    '''Return ``(nlp, stop_words)``, loading them on the first call in this process.'''
    if not _PREPROCESS_RESOURCES:
        import spacy
        from nltk.corpus import stopwords

        # Load both into locals first so a failure cannot leave the cache half-filled.
        nlp = spacy.load("en_core_web_sm")
        stop_words = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES["nlp"] = nlp
        _PREPROCESS_RESOURCES["stop_words"] = stop_words
    return _PREPROCESS_RESOURCES["nlp"], _PREPROCESS_RESOURCES["stop_words"]


def preprocess_text(document, print_n=1000, label=None):
    '''
        Turn one raw document into a list of cleaned, lower-case lemmas.

        The document is lower-cased and run through the spaCy
        ``en_core_web_sm`` pipeline. A token is dropped when its lemma is the
        ``-PRON-`` placeholder, is an NLTK English stop word, or is found in
        ``string.punctuation``. Every non-letter character is then removed from
        the surviving lemmas, and lemmas that end up empty are discarded.

        Parameters
        ----------
        document : str
            Raw text of a single document.
        print_n : int, default 1000
            Unused; kept so existing callers keep working.
        label : object, default None
            Unused; kept so existing callers keep working.

        Returns
        -------
        list of str
            Cleaned lemmas in document order.
    '''
    import re

    nlp, l_stop_words = _get_preprocess_resources()

    lemmas = []
    for token in nlp(document.lower()):
        lemma = token.lemma_
        # NOTE: the stop-word / punctuation tests run on the raw lemma, before
        # non-letters are stripped (same order as the original implementation).
        if lemma == "-PRON-" or lemma in l_stop_words or lemma in string.punctuation:
            continue
        cleaned = re.sub('[^a-zA-Z]', '', lemma)
        if cleaned != "":
            lemmas.append(cleaned)

    return lemmas

In [None]:
def get_frequent_tokens(series, coverage=0.75):
    '''
        Return the most frequent tokens that together account for at most
        ``coverage`` of all token occurrences.

        Tokens are counted over every document, ranked by descending count
        (ties keep first-seen order), and kept while the cumulative share of
        occurrences stays ``<= coverage``. If the single most frequent token
        already exceeds ``coverage`` the result is empty.

        Parameters
        ----------
        series : iterable of iterables of str
            One token sequence per document.
        coverage : float, default 0.75
            Upper bound on the cumulative share of occurrences covered by the
            returned tokens.

        Returns
        -------
        numpy.ndarray
            Object array of the selected tokens, most frequent first. Empty
            when ``series`` contains no tokens.
    '''
    from collections import Counter
    from itertools import chain

    # Linear-time flattening; repeated list concatenation would be quadratic.
    token_count = Counter(chain.from_iterable(series))
    if not token_count:
        return np.array([], dtype=object)

    # most_common() ranks by descending count, which is what makes the
    # cumulative share below mean "share covered by the top-k tokens".
    df_token_count = pd.DataFrame(token_count.most_common(),
                                  columns=["token", "count"])
    overall_count = df_token_count["count"].sum()
    # Accumulate integer counts and divide once to limit float rounding drift.
    df_token_count["token_cum_share"] = (
        df_token_count["count"].cumsum() / overall_count)

    is_frequent = df_token_count["token_cum_share"] <= coverage
    return df_token_count.loc[is_frequent, "token"].to_numpy(dtype=object)

In [None]:
if __name__ == "__main__":
    # End-to-end preprocessing of the IMDB training reviews:
    #   1. list and read the positive / negative review files,
    #   2. lemmatise and clean every review in parallel,
    #   3. keep only the frequent tokens and map them to integer ids (from 1),
    #   4. pickle the token -> id mapping and the processed DataFrame.

    import pickle

    # Create the output directory up front so a missing folder fails fast,
    # not after the expensive preprocessing has already run.
    os.makedirs("data/processed_data", exist_ok=True)

    # Read all file names
    data_path = "data/aclImdb/train/"
    pos_dir = os.path.join(data_path, "pos")
    neg_dir = os.path.join(data_path, "neg")
    # os.listdir() order is arbitrary; sorting makes the row order (and hence
    # the token -> id mapping) reproducible between runs.
    pos_files = [os.path.join(pos_dir, f) for f in sorted(os.listdir(pos_dir))]
    neg_files = [os.path.join(neg_dir, f) for f in sorted(os.listdir(neg_dir))]
    logging.info("Read all file names...")

    # Read text from all files
    with mp.Pool() as pool:
        pos_reviews = pool.map(read_text, pos_files)
        neg_reviews = pool.map(read_text, neg_files)
    logging.info("Read text from all files...")

    # Preprocess text
    with mp.Pool() as pool:
        logging.info("Preprocessing Positive Reviews...")
        pos_corpus_processed = pool.map(preprocess_text, pos_reviews)
        logging.info("Preprocessing Negative Reviews...")
        neg_corpus_processed = pool.map(preprocess_text, neg_reviews)
    logging.info("Preprocessed text...")

    # Create Dataframe from preprocessed text (sentiment: 1 = positive, 0 = negative)
    df_pos = pd.DataFrame({"corpus": pos_reviews,
                           "processed": pos_corpus_processed,
                           "sentiment": 1})
    df_neg = pd.DataFrame({"corpus": neg_reviews,
                           "processed": neg_corpus_processed,
                           "sentiment": 0})

    # NOTE: the index is not reset, so index labels repeat across the
    # positive and negative halves of df_reviews.
    df_reviews = pd.concat([df_pos, df_neg], axis=0)
    logging.info(df_reviews.head())
    logging.info("Data frame with preprocessed text created...")

    # Retain only frequent tokens
    frequent_tokens = get_frequent_tokens(df_reviews["processed"].values)
    logging.info("No. of frequent tokens: {}".format(len(frequent_tokens)))
    # Set membership is O(1); `token in <numpy array>` scans the whole array
    # for every single token.
    frequent_token_set = set(frequent_tokens)
    df_reviews["processed"] = df_reviews["processed"].apply(
        lambda review: [token for token in review if token in frequent_token_set])
    logging.info("Retaining only frequent tokens...")

    # Create token idx mapping (ids start at 1)
    dict_frequent_tokens_map = {
        token: idx for idx, token in enumerate(frequent_tokens, start=1)}
    logging.info("Creating token idx mapping...")

    # Create idx mapping
    df_reviews["processed_idx"] = df_reviews["processed"].apply(
        lambda review: [dict_frequent_tokens_map[token] for token in review])
    logging.info("Creating idx sequences...")

    # Save token-idx map to disk
    with open("data/processed_data/token_idx_mapping.pkl", "wb") as f:
        pickle.dump(dict_frequent_tokens_map, f)
    logging.info("Saving token idx mapping to disk...")

    # Save processed data to disk
    df_reviews.to_pickle("data/processed_data/train_processed_data.pkl")
    logging.info("Saving preprocessed data to disk...")

    # Read the mapping back this way:
    #     with open('token_idx_mapping.pkl', 'rb') as handle:
    #         b = pickle.load(handle)

In [None]:
# Execute the notebook train_sentiment_classifer.ipynb through IPython's %run magic.
# NOTE(review): judging by its name this is the classifier-training step, and it
# presumably reads the pickles written by the cell above — confirm.
%run train_sentiment_classifer.ipynb

In [None]:
# Preview a few token -> id pairs only; echoing the whole vocabulary mapping
# would flood the cell output.
dict(list(dict_frequent_tokens_map.items())[:10])