# Preprocessing and pipeline

In this notebook we build a text-preprocessing pipeline based on the findings from notebook 1: the reviews are loaded, tokenized, stripped of stopwords, and the result is cached on disk.

In [1]:
#Load libraries
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
#nltk.download("stopwords")
from nltk.corpus import stopwords # import stopwords
from nltk.stem.porter import *
stemmer = PorterStemmer()
import os
import pickle

#Load sample dataframe
def open_dataframe(path):
    """Read the pickled DataFrame stored at ``path`` and drop its ``overall`` column.

    Parameters
    ----------
    path : str
        Location of a pickle file readable by ``pd.read_pickle``.

    Returns
    -------
    pandas.DataFrame
        A copy of the loaded frame without the ``overall`` column.

    Raises
    ------
    KeyError
        If the loaded frame has no ``overall`` column.
    """
    frame = pd.read_pickle(path)
    without_rating = frame.drop(labels=["overall"], axis="columns")
    return without_rating

#Tokenization
# Extra words filtered out on top of NLTK's English stopword list.
_EXTRA_STOPWORDS = ("play", "game", "video", "one")
_DEFAULT_STOPWORDS = None  # built lazily on first use, then reused for every review


def _default_stopwords():
    """Return NLTK's English stopwords plus the extras above as a cached frozenset."""
    global _DEFAULT_STOPWORDS
    if _DEFAULT_STOPWORDS is None:
        # Reading the corpus is comparatively slow, so do it once rather than
        # once per review; a frozenset also makes each membership test O(1).
        _DEFAULT_STOPWORDS = (
            frozenset(nltk.corpus.stopwords.words('english'))
            | frozenset(_EXTRA_STOPWORDS)
        )
    return _DEFAULT_STOPWORDS


def review_2_words(review, stop_words=None):
    """Convert a raw review string into a list of lower-cased word tokens.

    The text is lower-cased, every character outside ``[a-zA-Z0-9]`` is
    replaced by a space, the result is split on whitespace and stopwords
    are removed.

    Parameters
    ----------
    review : str or None
        Raw review text. ``None`` and float NaN (a missing value in a
        pandas column) are treated as an empty review.
    stop_words : container of str, optional
        Words to discard. When omitted, NLTK's English stopwords plus
        ``"play"``, ``"game"``, ``"video"`` and ``"one"`` are used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Missing reviews yield no tokens instead of crashing on ``.lower()``,
    # so the output stays aligned one-to-one with the input rows.
    if review is None or (isinstance(review, float) and review != review):
        return []

    if stop_words is None:
        stop_words = _default_stopwords()

    text = re.sub(r"[^a-zA-Z0-9]", " ", review.lower())

    # Stemming is intentionally skipped: it slows the process down and
    # loses a lot of information from the texts.
    return [w for w in text.split() if w not in stop_words]


In [2]:
# Default location for cached preprocessing results; it is created up front so
# that the default cache path is always writable.
cache_dir = os.path.join("cache", "sentiment_analysis")
os.makedirs(cache_dir, exist_ok=True)


def _load_cache(cache_path):
    """Return the cache dict stored at ``cache_path``, or None if it is missing or unusable."""
    try:
        # NOTE(security): unpickling can execute arbitrary code. Only point this
        # at cache files that were written by this notebook.
        with open(cache_path, "rb") as f:
            cache_data = pickle.load(f)
    except FileNotFoundError:
        return None  # no cache yet: the normal first-run case
    except Exception as exc:
        # Corrupt or unreadable cache: stay best-effort, but say so instead of
        # silently swallowing the error (and let KeyboardInterrupt through).
        print("Ignoring unreadable cache file:", cache_path, "-", repr(exc))
        return None

    has_expected_keys = (
        isinstance(cache_data, dict)
        and "words_train" in cache_data
        and "labels_train" in cache_data
    )
    if not has_expected_keys:
        print("Ignoring cache file with unexpected content:", cache_path)
        return None
    return cache_data


def _cache_matches(cache_data, data_train):
    """Cheap staleness check: one cached token list per input review."""
    try:
        n_reviews = len(data_train)
    except TypeError:
        return True  # unsized iterable (e.g. a generator): nothing to compare
    return len(cache_data["words_train"]) == n_reviews


def preprocessData(data_train, labels_train, 
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Tokenize every review with ``review_2_words``, caching the result on disk.

    Parameters
    ----------
    data_train : iterable of str
        Raw review texts.
    labels_train : sequence
        Labels belonging to ``data_train``; stored in the cache next to the tokens.
    cache_dir : str
        Directory holding the cache file. It is created if it does not exist.
    cache_file : str or None
        Name of the pickle file inside ``cache_dir``. ``None`` disables both
        reading and writing of the cache.

    Returns
    -------
    tuple
        ``(words_train, labels_train)``. On a cache hit both values come from
        the cache file; otherwise ``words_train`` is freshly computed and
        ``labels_train`` is the argument passed in.

    Notes
    -----
    A cache is only used when it has one entry per input review. This catches
    the common mistake of reusing a cache built from a different dataset, but
    it does not detect edits to the review texts or to the tokenizer; delete
    the cache file to force a rebuild in those cases.
    """
    cache_path = None
    if cache_file is not None:
        cache_path = os.path.join(cache_dir, cache_file)

    if cache_path is not None:
        cache_data = _load_cache(cache_path)
        if cache_data is not None:
            if _cache_matches(cache_data, data_train):
                print("Read preprocessed data from cache file:", cache_file)
                return cache_data['words_train'], cache_data['labels_train']
            print("Cache file does not match the input size, recomputing:", cache_file)

    words_train = [review_2_words(review) for review in data_train]

    if cache_path is not None:
        if cache_dir:
            # A caller-supplied directory may not exist yet.
            os.makedirs(cache_dir, exist_ok=True)
        cache_data = dict(words_train=words_train,
                          labels_train=labels_train)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words_train, labels_train

In [3]:
#pipeline function
def pipeline_process (path):
    """Run the whole pipeline: load the frame at ``path``, then tokenize its reviews.

    The ``reviewText`` column is used as the review texts and the
    ``sentiment_label`` column as the labels; both are handed to
    ``preprocessData`` with its default cache settings.

    Returns
    -------
    tuple
        ``(words_train, labels_train)`` as produced by ``preprocessData``.
    """
    frame = open_dataframe(path)
    reviews = frame["reviewText"]
    sentiments = frame["sentiment_label"]
    tokens, token_labels = preprocessData(reviews, sentiments)
    return tokens, token_labels

In [4]:
# Run the full pipeline on the pickled dataset stored at "data_balanced".
# The tokenized reviews are read from the on-disk cache when one already exists.
words_process, labels = pipeline_process("data_balanced")

Read preprocessed data from cache file: preprocessed_data.pkl
