# Preprocessing

## Training data

In [3]:
import warnings
warnings.filterwarnings("ignore")                     #Ignoring unnecessory warnings
from tqdm import tqdm
import numpy as np                                  #for large and multi-dimensional arrays
import pandas as pd                                 #for data manipulation and analysis
import nltk                                         #Natural language processing tool-kit
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer
import re
from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from gensim.models import Word2Vec                                   #For Word2Vec

In [2]:
camera_train = pd.read_csv('./train/camera_train.csv',sep = '\t')
grocery_train = pd.read_csv('./train/grocery_train.csv',sep = '\t')
watches_train = pd.read_csv('./train/watches_train.csv',sep = '\t')
videogames_train = pd.read_csv('./train/videogames_train.csv',sep = '\t')

In [3]:
full_train_data = pd.concat([camera_train,grocery_train,watches_train,videogames_train], axis=0, ignore_index=True)

Full train data for all four sets combined. 
Each category set remains the same.

In [4]:
full_train_data = full_train_data[['product_category','review_headline','review_body','sentiment_actual']]
camera_train = camera_train[['product_category','review_headline','review_body','sentiment_actual']]
grocery_train = grocery_train[['product_category','review_headline','review_body','sentiment_actual']]
watches_train = watches_train[['product_category','review_headline','review_body','sentiment_actual']]
videogames_train = videogames_train[['product_category','review_headline','review_body','sentiment_actual']]
full_train_data.head()

Unnamed: 0,product_category,review_headline,review_body,sentiment_actual
0,Camera,Works with a D610,Works well with my Nikon D610. The range is a ...,pos
1,Camera,What a great bag! Tenba 637-262 Medium Shoulde...,"Wow, construction of this bag is second to non...",pos
2,Camera,i should have put system in a long time ago ...,i should have put system in a long time ago. ...,pos
3,Camera,9$? Way to good to be true but it is!,"So, I'm new to the arca swiss world. And my bo...",pos
4,Camera,"Great camera, now if I can learn all its ...","Great camera, now if I can learn all its finer...",pos


### Stopword, punctuation removal and stemming

In [5]:
pos_emoticons = [":-)",":)",":-]",":]",":-3",":3",":->",":>2","8-)","8)",":-}",":}",":o)",":c)",":^)","=]",
                 "=)",":-d",":d","8-d","8d","x-d","x-d","xd","=d","=3","b^d",":-))",";-)",";)","*-)",
                 "*)",";-]",";]",";^)",";d",":-p",":p","x-p",":-?",":?",":-?",":?",":-b",
                 ":b","=p",">:p",":*",":-*","^.^","^_^","^-^","xd","<3"]
neg_emoticons = [":-(",":(",":-c",":c",":-<",":<",":-[",":[",":-||",">:[",":{",":@",">:(",":-/",":/",">:\\",
                 ">:/",":\\","=/","=\\",":l","=l",":S",":-|",":|",":-x",":x","-.-","-,-"]

In [6]:
stopwordlist = set(stopwords.words('english'))
negations = ["aren't", "aren", "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't", "don", "don't",
                "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "mightn", "mightn't",
                "mustn", "mustn't", "needn", "needn't", "not", "shan", "shan't", "shouldn", "shouldn't",
                "wasn", "wasn't", "weren", "weren't", "wouldn", "wouldn't", "won", "won't"]
for word in negations:
    stopwordlist.remove(word)
def isstopword(word):
    '''checks whether word in stopwordlist'''
    return word in stopwordlist

def flipnegation(review):
    '''unravels negated words'''
    review = re.sub(r"\baren't", "are not", review)
    review = re.sub(r"\baren", "are not", review)
    review = re.sub(r"\bcouldn't", "could not", review)
    review = re.sub(r"\bcouldn", "could not", review)
    review = re.sub(r"\bdidn't", "did not", review)
    review = re.sub(r"\bdidn", "did not", review)
    review = re.sub(r"\bdoesn't", "does not", review)
    review = re.sub(r"\bdoesn", "does not", review)
    review = re.sub(r"\bdon't", "do not", review)
    review = re.sub(r"\bdon", "do not", review)
    review = re.sub(r"\bhadn't", "had not", review)
    review = re.sub(r"\bhadn", "had not", review)
    review = re.sub(r"\bhasn't", "has not", review)
    review = re.sub(r"\bhasn", "has not", review)
    review = re.sub(r"\bhaven't", "have not", review)
    review = re.sub(r"\bhaven", "have not", review)
    review = re.sub(r"\bisn't", "is not", review)
    review = re.sub(r"\bisn", "is not", review)
    review = re.sub(r"\bmightn't", "might not", review)
    review = re.sub(r"\bmightn", "might not", review)
    review = re.sub(r"\bmustn't", "must not", review)
    review = re.sub(r"\bmustn", "must not", review)
    review = re.sub(r"\bneedn't", "need not", review)
    review = re.sub(r"\bneedn", "need not", review)
    review = re.sub(r"\bshan't", "shall not", review)
    review = re.sub(r"\bshan", "shall not", review)
    review = re.sub(r"\bshouldn't", "should not", review)
    review = re.sub(r"\bshouldn", "should not", review)
    review = re.sub(r"\bwasn't", "was not", review)
    review = re.sub(r"\bwasn", "was not", review)
    review = re.sub(r"\bweren't", "were not", review)
    review = re.sub(r"\bweren", "were not", review)
    review = re.sub(r"\bwouldn't", "would not", review)
    review = re.sub(r"\bwouldn", "would not", review)
    review = re.sub(r"\bwon't", "will not", review)
    review = re.sub(r"\bwon", "will not", review)
    return review

In [7]:
def process_review(review):
    '''Preprocesses review applying tokenization, cleansing, stopword removal, emoticon handling, stemming etc.'''
    processed_review = []
  
    review = str(review).lower() # set to lowercase
    review = flipnegation(review) # unroll negations
    review = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' link ', review) # replace hyperlinks with tag 'link'
    review = re.sub(r'\.{2,}', ' ', review) # replace multiple dots by whitespace
    review = re.sub(r"[!\?\.]i", "", review) # clean some unproper sentence starts 
    review = re.sub(r'\s+', ' ', review) # remove unnecessary whitespace
    
    tokens = review.split(" ") # tokenize
    stemmer = SnowballStemmer("english")

    for token in tokens:
        if not isstopword(token):
            token = token.strip("'\”")
            if token in pos_emoticons:
                token = "pos_emoticon"
            elif token in neg_emoticons:
                token = "neg_emoticon"
            else:
                token = re.sub(r"[\(\)\[\]!\?\.\-\,]", "", token) # remove punctuation/ parentheses 
                token = str(stemmer.stem(token))
                
            processed_review.append(token)

    return ' '.join(processed_review)

In [8]:
# review needs to be processed in a lighter fashion so that terms match the pretrained embedding terms

def process_review_light(review):
    '''Preprocesses review in lighter fashion applying tokenization, cleansing, stopword removal'''
    processed_review = []
  
    review = str(review).lower() # set to lowercase
    review = flipnegation(review) # unroll negations
    review = re.sub(r"[!\?\.]i", "", review) # clean some unproper sentence starts 
    review = re.sub(r'\s+', ' ', review) # remove unnecessary whitespace
    
    tokens = review.split(" ") # tokenize

    for token in tokens:
        if not isstopword(token):
            token = token.strip("'\”")
            token = re.sub(r"[\(\)\[\]!\?\.\-\,]", "", token) # remove punctuation/ parentheses    
            processed_review.append(token)

    return ' '.join(processed_review)

In [9]:
full_train_data['processed'] = full_train_data['review_body'].apply(process_review)
full_train_data['processed_light'] = full_train_data['review_body'].apply(process_review_light)

full_train_data.to_csv('full_train_data.csv', sep="\t")

In [10]:
full_train_data.head()

Unnamed: 0,product_category,review_headline,review_body,sentiment_actual,processed,processed_light
0,Camera,Works with a D610,Works well with my Nikon D610. The range is a ...,pos,work well nikon d610 rang littl short though p...,works well nikon d610 range little short thoug...
1,Camera,What a great bag! Tenba 637-262 Medium Shoulde...,"Wow, construction of this bag is second to non...",pos,wow construct bag second none plenti storag la...,wow construction bag second none plenty storag...
2,Camera,i should have put system in a long time ago ...,i should have put system in a long time ago. ...,pos,put system long time ago pleas result instal p...,put system long time ago please results instal...
3,Camera,9$? Way to good to be true but it is!,"So, I'm new to the arca swiss world. And my bo...",pos,so i'm new arca swiss world boss introduc real...,so i'm new arca swiss world boss introduced re...
4,Camera,"Great camera, now if I can learn all its ...","Great camera, now if I can learn all its finer...",pos,great camera learn finer point turn one grands...,great camera learn finer points turns one gran...


In [15]:
camera_train['processed'] = camera_train['review_body'].apply(process_review)
camera_train['processed_light'] = camera_train['review_body'].apply(process_review_light)

camera_train.to_csv('camera_train.csv', sep="\t")

KeyboardInterrupt: 

In [12]:
grocery_train['processed'] = grocery_train['review_body'].apply(process_review)
grocery_train['processed_light'] = grocery_train['review_body'].apply(process_review_light)

grocery_train.to_csv('grocery_train.csv', sep="\t")

In [13]:
watches_train['processed'] = watches_train['review_body'].apply(process_review)
watches_train['processed_light'] = watches_train['review_body'].apply(process_review_light)

watches_train.to_csv('watches_train.csv', sep="\t")

In [14]:
videogames_train['processed'] = videogames_train['review_body'].apply(process_review)
videogames_train['processed_light'] = videogames_train['review_body'].apply(process_review_light)

videogames_train.to_csv('videogames_train.csv', sep="\t")

## Testing data

In [16]:
camera_test = pd.read_csv('./test/camera_test.csv',sep = '\t')
grocery_test = pd.read_csv('./test/grocery_test.csv',sep = '\t')
watches_test = pd.read_csv('./test/watches_test.csv',sep = '\t')
videogames_test = pd.read_csv('./test/videogames_test.csv',sep = '\t')

In [17]:
full_test_data = pd.concat([camera_test,grocery_test,watches_test,videogames_test], axis=0, ignore_index=True)

In [18]:
full_test_data = full_test_data[['product_category','review_headline','review_body','sentiment_actual']]
camera_test = camera_test[['product_category','review_headline','review_body','sentiment_actual']]
grocery_test = grocery_test[['product_category','review_headline','review_body','sentiment_actual']]
watches_test = watches_test[['product_category','review_headline','review_body','sentiment_actual']]
videogames_test = videogames_test[['product_category','review_headline','review_body','sentiment_actual']]

In [19]:
full_test_data['processed'] = full_test_data['review_body'].apply(process_review)
full_test_data['processed_light'] = full_test_data['review_body'].apply(process_review_light)

full_test_data.to_csv('full_test_data.csv', sep="\t")

In [20]:
camera_test['processed'] = camera_test['review_body'].apply(process_review)
camera_test['processed_light'] = camera_test['review_body'].apply(process_review_light)

camera_test.to_csv('camera_test.csv', sep="\t")

In [21]:
grocery_test['processed'] = grocery_test['review_body'].apply(process_review)
grocery_test['processed_light'] = grocery_test['review_body'].apply(process_review_light)

grocery_test.to_csv('grocery_test.csv', sep="\t")

In [22]:
watches_test['processed'] = watches_test['review_body'].apply(process_review)
watches_test['processed_light'] = watches_test['review_body'].apply(process_review_light)

watches_test.to_csv('watches_test.csv', sep="\t")

In [23]:
videogames_test['processed'] = videogames_test['review_body'].apply(process_review)
videogames_test['processed_light'] = videogames_test['review_body'].apply(process_review_light)

videogames_test.to_csv('videogames_test.csv', sep="\t")

## Vectorization