In [48]:
import pandas as pd 
import numpy as np
# Raise the pandas display limits so wide/long frames are not truncated
# when rendered: up to 120 columns and 120 rows are shown.
DISPLAY_LIMIT = 120
pd.options.display.max_columns = DISPLAY_LIMIT
pd.options.display.max_rows = DISPLAY_LIMIT

In [49]:
# import packages

import bs4 as bs
import nltk
from nltk.tokenize import sent_tokenize # tokenizes sentences
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Module-level English stopword collection from the NLTK corpus.
# Note: review_cleaner builds its own local set under the same name, so this
# global is not what that function filters with.
eng_stopwords = stopwords.words('english')

In [50]:
# Load the merged per-movie tweet data (one row per movie/time window).
# NOTE(review): 'unicode_escape' is an unusual choice for a text CSV; the preview
# contains mojibake (e.g. 'â'), which suggests the file may really be UTF-8 -
# confirm against the file before changing the encoding.
merged_data1 = pd.read_csv('test_merged.csv',encoding = 'unicode_escape')

# Preview only the first rows instead of rendering the whole frame.
merged_data1.head(10)

Unnamed: 0.1,Unnamed: 0,content,Movie,Startdate,Enddate
0,0,['Gara2 video ini nih yang bikin gue nggak sab...,Spectre,2015/10/23,2015/10/29
1,1,['@ cakefacee_xo hahaha I love this! Have you ...,Avengers: Age of Ultron,2015/4/19,2015/4/25
2,2,['Camren Bicondova attends the 'Batman V Super...,Batman v Superman: Dawn of Justice,2016/3/20,2016/3/26
3,3,['THE HOBBIT THE BATTLE OF THE FIVE ARMIES (20...,The Hobbit: The Battle of the Five Armies,2014/12/7,2014/12/13
4,4,"['Cinema, # teamcap # TeamIronMan # TeamTodoMu...",Captain America: Civil War,2016/4/24,2016/4/30
5,6,['Transformers: Age of Extinction movie premie...,Transformers: Age of Extinction,2014/6/22,2014/6/28
6,7,['The Amazing Spider-Man 2â²s # Xbox One rele...,The Amazing Spider-Man 2,2014/4/13,2014/4/19
7,8,['Furious 7 gave me the feelsI wanna watch fur...,Furious 7,2015/3/29,2015/4/4
8,9,['Click here to # win a prize bundle from X-me...,X-Men: Days of Future Past,2014/5/12,2014/5/18
9,10,['âInside Outâ?Easter Eggs Give Shoutouts t...,The Good Dinosaur,2015/11/11,2015/11/17


In [51]:
# 1. 
from nltk.corpus import stopwords


# Patterns are compiled once and shared by every call to review_cleaner.
# Raw strings keep the regex escapes \) and \( from being parsed as (invalid)
# Python string escapes, which newer interpreters warn about.
_EMOTICON_PATTERN = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
_NON_LETTER_PATTERN = re.compile(r"[^a-zA-Z]")

# Filled on first use so the NLTK stopword corpus is read once, not once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    '''Return the NLTK English stopwords as a frozenset, loading them on first use.'''
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


def review_cleaner(review):
    '''
    Clean and preprocess a review.

    1. Remove HTML tags
    2. Collect emoticons such as :) ;-( =D :P from the tag-free text
    3. Use regex to replace every non-letter character with a space
    4. Make the string lower case and tokenize / word split it
    5. Remove English stopwords
    6. Rejoin the remaining words into one string, with the emoticons
       found in step 2 appended at the end

    Parameters
    ----------
    review : str
        Raw review text. A missing value (None or NaN) is treated as an
        empty review.

    Returns
    -------
    str
        Space-separated cleaned tokens followed by any emoticons.
    '''

    # A missing CSV cell arrives as None/NaN rather than text; there is nothing to clean.
    if review is None or (isinstance(review, float) and review != review):
        return ''

    #1. Remove HTML tags (explicit parser: same result on every machine, no parser warning)
    review = bs.BeautifulSoup(review, "html.parser").text

    #2. Find emoticons before punctuation is stripped, otherwise they would be lost
    emoticons = _EMOTICON_PATTERN.findall(review)

    #3. Remove punctuation, digits and any other non-letter character
    review = _NON_LETTER_PATTERN.sub(" ", review)

    #4. Tokenize into words (all lower case)
    words = review.lower().split()

    #5. Remove stopwords
    eng_stopwords = _english_stopwords()
    words = [w for w in words if w not in eng_stopwords]

    #6. Join the review to one sentence, emoticons at the end
    return ' '.join(words + emoticons)

In [52]:
%%time

# Clean every entry of the 'content' column with review_cleaner.
reviews_raw = merged_data1['content']

# Kept as a module-level name: the stemming and lemmatizing cells loop over it.
num_reviews = len(reviews_raw)

review_clean_original = []

# Iterate over the values themselves rather than looking each one up with
# merged_data1['content'][i]: that form is a label lookup and only works
# while the frame still has its default 0..n-1 index.
for reviews_seen, raw_review in enumerate(reviews_raw, start=1):
    if reviews_seen % 100 == 0:
        # print progress
        print("Done with %d reviews" % reviews_seen)
    review_clean_original.append(review_cleaner(raw_review))

Done with 100 reviews
Done with 200 reviews
Done with 300 reviews
Done with 400 reviews
Wall time: 983 ms


In [53]:
# Sanity check: number of cleaned reviews produced by the loop above.
len(review_clean_original)

462

In [54]:
%%time
# Porter stemming on the results in review_clean_original:
# review_clean_ps[k] is the stemmed version of review_clean_original[k].

ps = PorterStemmer()
review_clean_ps = []

for review_idx in range(num_reviews):
    reviews_seen = review_idx + 1
    if reviews_seen % 100 == 0:
        # print progress
        print("Done with %d reviews" % reviews_seen)

    # The literal token 'oed' is dropped instead of being stemmed.
    # NOTE(review): presumably a workaround for a stemmer failure on that
    # exact token - confirm before removing.
    ps_stems = [
        ps.stem(token)
        for token in review_clean_original[review_idx].split()
        if token != 'oed'
    ]

    review_clean_ps.append(' '.join(ps_stems))

Done with 100 reviews
Done with 200 reviews
Done with 300 reviews
Done with 400 reviews
Wall time: 8.64 s


In [55]:
def get_wordnet_pos(treebank_tag):
    '''
    Translate a POS tag string into the part-of-speech value used by WordNet.

    The tag is matched by its first letter: J -> wordnet.ADJ, V -> wordnet.VERB,
    N -> wordnet.NOUN, R -> wordnet.ADV. Any other tag falls back to 'n'.
    (The parameter name suggests Penn Treebank tags, as produced by pos_tag.)
    '''
    # (tag prefix, name of the constant on `wordnet`), checked in this order.
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )

    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Resolved only on a match, so unmatched tags never touch `wordnet`.
            return getattr(wordnet, constant_name)

    return 'n'

In [56]:
%%time
# Lemmatizer: POS-tag each cleaned review, then lemmatize every token with
# the WordNet part of speech derived from its tag.

wnl = WordNetLemmatizer()
review_clean_wnl = []

for review_idx in range(num_reviews):
    reviews_seen = review_idx + 1
    if reviews_seen % 100 == 0:
        # print progress
        print("Done with %d reviews" % reviews_seen)

    # pos_tag yields (token, tag) pairs for the whitespace-split review.
    token_tag = pos_tag(review_clean_original[review_idx].split())
    wnl_stems = [
        wnl.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in token_tag
    ]

    review_clean_wnl.append(' '.join(wnl_stems))

Done with 100 reviews
Done with 200 reviews
Done with 300 reviews
Done with 400 reviews
Wall time: 35.5 s


In [57]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics # for confusion matrix, accuracy score etc
# CountVectorizer can actually handle a lot of the preprocessing for us,
# but the text is already cleaned, so every preprocessing hook is left off
# and only the vocabulary size is capped.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=100,
)

In [58]:
%%time
# Transform the text data to features:
# learn the bag-of-words vocabulary from the lemmatized reviews. The fitted
# vectorizer is then used by the transform step that follows.
# NOTE(review): the intent is to fit on training data only (to mimic the real
# world), but no train/test split is visible here - the whole corpus is fitted.

vectorizer.fit(review_clean_wnl)

Wall time: 488 ms


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [59]:
# Turn each lemmatized review into a row of term counts over the fitted vocabulary.
train_bag = vectorizer.transform(review_clean_wnl)

In [60]:
# Build the document-term frame: one row per review, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method only on versions that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df = pd.DataFrame(train_bag.toarray(), columns=feature_names)

In [61]:
# Put the term-count columns next to the original movie metadata.
# Rows are matched on the index of the two frames.
df_final = pd.concat([merged_data1, df], axis="columns")

In [62]:
# Preview the first rows only; the frame has 100+ columns and rendering it
# in full bloats the notebook.
df_final.head(10)

Unnamed: 0.1,Unnamed: 0,content,Movie,Startdate,Enddate,age,bad,best,big,bit,book,box,boy,cinema,co,com,come,day,de,dlvr,el,en,fb,film,first,free,game,get,girl,gl,go,god,goo,good,great,home,html,http,https,ift,instagram,kill,know,la,last,life,light,like,look,love,ly,make,man,men,movie,need,new,news,next,night,office,one,ow,part,pic,play,premiere,que,really,review,room,rt,saw,say,screen,see,star,story,swarmapp,take,th,theater,think,time,today,tonight,trailer,tt,twitter,via,video,wait,walk,wanna,want,watch,way,weekend,well,win,would,www,year,youtu,youtube
0,0,['Gara2 video ini nih yang bikin gue nggak sab...,Spectre,2015/10/23,2015/10/29,0,1,0,0,9,0,0,0,4,2,22,1,6,10,1,0,3,0,6,6,0,0,4,0,0,0,0,0,9,4,0,1,30,1,2,2,0,1,6,0,1,0,5,1,8,6,0,1,0,1,2,3,0,2,0,0,2,0,0,6,1,1,2,0,0,0,4,0,1,0,13,3,2,0,1,1,0,3,2,1,4,1,2,10,2,1,0,1,0,0,7,2,0,6,5,0,5,0,1,4
1,1,['@ cakefacee_xo hahaha I love this! Have you ...,Avengers: Age of Ultron,2015/4/19,2015/4/25,89,0,1,0,7,0,2,0,2,1,33,0,5,18,2,1,2,4,4,0,1,0,1,0,1,0,0,1,8,1,0,0,49,6,3,4,0,0,8,1,0,0,0,0,2,9,3,0,0,7,0,2,0,0,2,1,2,3,1,4,0,1,2,0,1,0,0,0,0,0,4,6,0,14,2,1,1,0,3,1,1,0,3,5,1,2,1,0,0,1,4,0,0,0,1,0,15,0,7,2
2,2,['Camren Bicondova attends the 'Batman V Super...,Batman v Superman: Dawn of Justice,2016/3/20,2016/3/26,0,0,1,0,1,2,0,0,6,0,45,1,0,12,0,0,2,12,3,0,0,1,1,0,2,0,0,2,1,1,0,3,87,0,0,0,1,0,4,0,0,2,22,0,1,1,0,1,0,16,0,13,2,0,1,0,0,0,0,4,0,14,1,2,23,0,2,0,1,0,2,0,0,18,0,0,2,1,0,1,0,2,2,5,4,24,0,0,0,0,12,0,0,0,0,1,27,0,31,27
3,3,['THE HOBBIT THE BATTLE OF THE FIVE ARMIES (20...,The Hobbit: The Battle of the Five Armies,2014/12/7,2014/12/13,0,1,2,0,5,1,0,0,3,1,40,1,1,7,1,3,1,5,9,2,0,0,1,0,2,1,0,2,3,0,0,1,65,6,0,2,0,0,0,3,0,0,4,1,1,6,0,0,0,6,0,0,1,0,0,0,1,1,0,2,0,2,2,0,10,0,0,0,0,1,5,5,0,30,0,0,0,0,1,1,0,5,0,3,4,5,2,0,1,0,5,0,0,0,0,0,33,4,17,7
4,4,"['Cinema, # teamcap # TeamIronMan # TeamTodoMu...",Captain America: Civil War,2016/4/24,2016/4/30,1,0,1,0,2,0,1,0,3,1,45,1,2,17,2,4,8,10,3,0,1,0,0,0,0,2,0,0,2,2,0,0,58,2,1,2,0,1,8,1,0,0,2,1,1,2,0,3,0,9,2,3,0,1,1,1,2,1,0,6,0,0,2,0,7,0,1,0,0,0,4,0,0,28,1,0,0,0,4,0,1,1,3,7,0,5,0,0,0,1,8,0,0,0,0,0,35,0,6,8
5,6,['Transformers: Age of Extinction movie premie...,Transformers: Age of Extinction,2014/6/22,2014/6/28,111,2,0,1,2,0,1,0,2,3,28,1,1,1,5,0,0,9,0,0,0,0,2,0,4,2,0,4,5,1,0,1,60,0,1,2,0,0,0,2,1,0,3,1,4,2,0,0,0,23,1,4,0,0,4,2,2,0,0,6,0,1,0,0,14,0,1,3,3,1,8,2,2,3,0,0,3,3,3,3,2,1,1,7,6,1,2,0,0,3,13,1,2,2,0,0,7,0,3,3
6,7,['The Amazing Spider-Man 2â²s # Xbox One rele...,The Amazing Spider-Man 2,2014/4/13,2014/4/19,0,0,1,0,5,0,0,0,1,0,35,2,1,20,2,4,8,4,4,1,0,3,1,0,3,8,0,3,3,4,1,2,66,1,0,1,0,1,9,0,0,0,6,2,0,5,1,106,2,5,1,5,1,1,0,0,3,0,1,3,0,0,4,1,5,0,1,1,1,1,5,0,0,0,0,1,0,0,0,1,1,1,2,5,3,6,1,0,0,1,3,0,0,2,1,1,2,0,8,8
7,8,['Furious 7 gave me the feelsI wanna watch fur...,Furious 7,2015/3/29,2015/4/4,0,0,1,1,0,0,0,1,1,1,8,4,2,0,0,0,0,3,4,1,0,0,6,0,1,13,0,1,5,2,0,0,12,1,0,1,0,1,1,0,0,1,7,1,1,1,2,0,0,12,1,1,0,0,3,0,0,0,0,3,0,1,0,2,1,0,0,0,2,1,21,0,0,1,1,0,2,1,3,0,1,0,0,5,2,1,0,0,3,3,9,1,2,0,0,2,1,0,3,1
8,9,['Click here to # win a prize bundle from X-me...,X-Men: Days of Future Past,2014/5/12,2014/5/18,0,0,1,1,9,0,0,0,1,10,17,1,119,6,3,0,2,4,3,2,2,1,0,0,0,6,0,0,0,1,0,2,83,0,2,1,0,2,2,1,0,0,5,4,1,11,3,1,125,23,1,12,2,2,0,0,0,2,0,2,2,9,1,1,0,0,0,0,1,7,12,4,1,0,0,0,1,1,0,0,2,2,2,1,6,5,7,0,0,1,1,1,1,1,24,0,16,0,7,4
9,10,['âInside Outâ?Easter Eggs Give Shoutouts t...,The Good Dinosaur,2015/11/11,2015/11/17,0,0,0,0,15,1,0,0,0,0,51,1,0,3,4,0,0,5,12,0,3,0,1,0,18,3,0,18,123,0,1,1,74,1,0,1,1,1,2,1,0,0,2,6,1,16,0,0,0,41,0,12,0,3,1,0,0,1,6,9,1,2,0,0,1,0,0,0,0,7,9,0,2,0,0,1,0,0,0,1,0,10,0,23,21,0,3,0,1,5,4,0,0,0,8,0,12,0,1,0


In [63]:
# Persist the combined metadata + term-count table for downstream use.
# NOTE(review): the index is written as an extra unnamed column; the input file
# already carries 'Unnamed: 0.1' and 'Unnamed: 0', which look like leftovers of
# earlier saves. Consider index=False once downstream readers are confirmed.
df_final.to_csv('twitter_bag_final.csv')