In [1]:
import argparse
import numpy as np
import sys
from pymongo import MongoClient
import re
from __future__ import division
from sklearn.cluster import KMeans 
from numbers import Number
from pandas import DataFrame
import sys, codecs, numpy
import pandas as pd
import string
from string import digits
import datetime

In [2]:
# Load the full tweet dataset. Later cells read its 'SentimentText' and
# 'Sentiment' columns.
dataset = pd.read_csv('dataset.csv', delimiter=',')

# Filtering

In [3]:
import re, collections

def words(text):
    """Return every maximal run of the letters a-z in ``text``, lower-cased, in order."""
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)

def train(features):
    """Count how often each item of ``features`` occurs.

    Every count starts from 1, so an item seen once is stored as 2, and
    indexing the returned mapping with an unseen item yields 1 (and stores it).

    Returns:
        collections.defaultdict mapping item -> count.
    """
    counts = collections.defaultdict(lambda: 1)
    for token in features:
        counts[token] = counts[token] + 1
    return counts

# Word-frequency model used by the spelling corrector, built from the text of
# 'big.txt'. `open` replaces the Python-2-only `file` builtin, and the `with`
# block closes the handle as soon as the text has been read.
with open('big.txt') as _corpus_file:
    NWORDS = train(words(_corpus_file.read()))

# Letters tried when generating replacement and insertion edits.
alphabet = string.ascii_lowercase

def edits1(word):
    """Return the set of strings one edit away from ``word``.

    An edit is deleting one character, swapping two adjacent characters,
    replacing one character with a letter of ``alphabet``, or inserting a
    letter of ``alphabet`` at any position.
    """
    candidates = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            # deletion of the character at the cut
            candidates.add(head + tail[1:])
            if len(tail) > 1:
                # transposition of the two characters after the cut
                candidates.add(head + tail[1] + tail[0] + tail[2:])
            for letter in alphabet:
                # replacement of the character at the cut
                candidates.add(head + letter + tail[1:])
        for letter in alphabet:
            # insertion at the cut (also valid at the very end of the word)
            candidates.add(head + letter + tail)
    return candidates

def known_edits2(word):
    """Return the strings two edits away from ``word`` that are keys of NWORDS."""
    hits = set()
    for first in edits1(word):
        for second in edits1(first):
            if second in NWORDS:
                hits.add(second)
    return hits

def known(words):
    """Return the subset of ``words`` that are keys of NWORDS, as a set."""
    return {candidate for candidate in words if candidate in NWORDS}

def correct(word):
    """Return the most frequent known spelling near ``word``.

    Candidate sets are tried in order and the first non-empty one wins: the
    word itself if known, known words one edit away, known words two edits
    away, and finally the word unchanged. Later sets are only computed when
    the earlier ones are empty.
    """
    candidates = known([word])
    if not candidates:
        candidates = known(edits1(word))
    if not candidates:
        candidates = known_edits2(word)
    if not candidates:
        candidates = [word]
    return max(candidates, key=NWORDS.get)

In [4]:
# Emoticons that light_process() rewrites to the tokens ' sad ' / ' happy '.
_SAD_EMOTICONS = """:-( :( :o( :[  :c( :< =[ 8( =( :{ :^( X-( </3 :_( :'( :-/ """.split()
_HAPPY_EMOTICONS = """:-) :) :o) :] :3 :c) :> =] 8) =) :} :^)
             :D 8-D 8D 8^P x-D xD X-D XD =-D =D =-3 =3 B^D """.split()

# Patterns are compiled once here rather than rebuilt for every tweet.
_SAD_RE = re.compile("|".join(map(re.escape, _SAD_EMOTICONS)))
_HAPPY_RE = re.compile("|".join(map(re.escape, _HAPPY_EMOTICONS)))
_MENTION_RE = re.compile(r'@[^\s]+')
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_REPEAT_RE = re.compile(r"(.)\1{1,}", re.DOTALL)
_DIGIT_RUN_RE = re.compile(r'[0-9]+')
_SPACE_RE = re.compile(r'[\s]+')


def light_process(tweet):
    """Normalise a raw tweet into lower-case, space-separated tokens.

    Steps, in order:
      1. every ``@mention`` becomes ``user``;
      2. the first ``?`` / ``!`` become `` question `` / `` exclamation ``;
      3. ``&quot`` becomes a space;
      4. happy / sad emoticons become `` happy `` / `` sad ``;
      5. URLs (``www.`` or ``http(s)://``) become ``url``;
      6. remaining punctuation becomes a space and the text is lower-cased;
      7. any run of a repeated character is shortened to two;
      8. runs of ASCII digits are removed, splitting the text around them;
      9. whitespace runs collapse to a single space;
     10. every occurrence of ``oh`` becomes `` sighn ``.

    Args:
        tweet: the raw tweet text.

    Returns:
        The normalised text. It may start or end with a space.
    """
    tweet = _MENTION_RE.sub('user', tweet)
    tweet = tweet.replace('?', ' question ', 1)
    tweet = tweet.replace('!', ' exclamation ', 1)
    tweet = tweet.replace('&quot', ' ')

    tweet = _HAPPY_RE.sub(' happy ', tweet)
    tweet = _SAD_RE.sub(' sad ', tweet)

    tweet = _URL_RE.sub('url', tweet)
    tweet = _PUNCT_RE.sub(' ', tweet)
    tweet = tweet.lower()
    tweet = _REPEAT_RE.sub(r"\1\1", tweet)
    # Each digit run is replaced by a space. This used to be a split/join
    # followed by str.translate(None, digits), which only works on Python 2
    # byte strings; after the whitespace collapse below the result is the same.
    tweet = _DIGIT_RUN_RE.sub(' ', tweet)
    tweet = _SPACE_RE.sub(' ', tweet)
    # NOTE(review): this also rewrites 'oh' inside longer words and runs after
    # the whitespace collapse, so it can leave double spaces - confirm intent.
    tweet = tweet.replace('oh', ' sighn ')
    return tweet

In [5]:
def process(tweet, vocab, wordlist):
    """Clean ``tweet`` and keep only tokens that can be matched to ``vocab``.

    The tweet is first normalised with light_process(). Tokens shorter than
    three characters are dropped. A token is kept as-is when it is in
    ``vocab``; otherwise its spell-corrected form is kept if that is in
    ``vocab``; otherwise the corrected token is handed to crazy_process(),
    which tries to split it into known words using ``wordlist``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in light_process(tweet).split(' '):
        if len(token) < 3:
            continue
        if token in vocab:
            kept.append(token)
            continue
        fixed = correct(token)
        if fixed in vocab:
            kept.append(fixed)
            continue
        # Last resort: treat the token as several words run together.
        segmented = crazy_process(fixed, vocab, wordlist)
        if segmented != '':
            kept.append(segmented)
    return ' '.join(kept)

In [6]:
def InitializeWords(path='dic.txt'):
    """Load the word list used to split run-together tokens.

    Args:
        path: text file with one word per line. Defaults to 'dic.txt', the
            file the notebook has always used, so existing zero-argument
            calls behave as before.

    Returns:
        list of str: the lines of the file in order, with the trailing line
        terminator ('\\n' or '\\r\\n') removed.
    """
    with open(path) as f:
        # Strip '\r' as well so a file saved with Windows line endings still
        # yields words that can match.
        return [line.rstrip('\r\n') for line in f]


def ParseWord(term, wordlist):
    """Greedily split ``term`` into words from ``wordlist``.

    The longest prefix found by FindWord() is taken repeatedly. Splitting
    stops when the whole term is consumed or when no prefix of the remainder
    is in ``wordlist``; in the latter case the unmatched remainder is dropped.

    Returns:
        The words found, joined by single spaces ('' if none).
    """
    pieces = []
    remainder = term
    while remainder:
        piece = FindWord(remainder, wordlist)
        if piece is None:
            break
        pieces.append(piece)
        remainder = remainder[len(piece):]
    return " ".join(pieces)


def FindWord(token, wordlist):
    """Return the longest non-empty prefix of ``token`` contained in ``wordlist``.

    Returns None when no prefix matches (including when ``token`` is empty).
    """
    for end in range(len(token), 0, -1):
        prefix = token[:end]
        if prefix in wordlist:
            return prefix
    return None

In [7]:
def crazy_process(word, vocab, wordlist):
    """Split a run-together ``word`` and keep the parts found in ``vocab``.

    ``word`` is segmented with ParseWord(). Parts shorter than three
    characters are ignored; each remaining part is kept if it is in
    ``vocab``, or if its spell-corrected form is.

    Returns:
        The kept parts joined by single spaces ('' if none survive).
    """
    accepted = []
    for piece in ParseWord(word, wordlist).split(' '):
        if len(piece) < 3:
            continue
        candidate = piece if piece in vocab else correct(piece)
        if candidate in vocab:
            accepted.append(candidate)
    return ' '.join(accepted)

# Build vocabulary based on GloVe

In [8]:
class autovivify_list(dict):
    """dict subclass that behaves like collections.defaultdict(list).

    Intended by its author to be pickleable. An empty instance can also be
    added to or subtracted from a number.
    """

    def __missing__(self, key):
        """Create, store and return an empty list for a key not yet present."""
        fresh = []
        self[key] = fresh
        return fresh

    def __add__(self, x):
        """Return ``x`` when self is empty and ``x`` is a number; else raise ValueError."""
        if self or not isinstance(x, Number):
            raise ValueError
        return x

    def __sub__(self, x):
        """Return ``-1 * x`` when self is empty and ``x`` is a number; else raise ValueError."""
        if self or not isinstance(x, Number):
            raise ValueError
        return -1 * x

In [9]:
def build_word_vector_matrix(vector_file):
    """Read word vectors from a whitespace-separated text file.

    Each line is expected to hold a label followed by its numeric
    components. The file is decoded as UTF-8.

    Args:
        vector_file: path of the vector file.

    Returns:
        (vectors, labels): a numpy array built from one float vector per
        line, and the list of labels in the same order.
    """
    labels = []
    vectors = []
    with codecs.open(vector_file, 'r', 'utf-8') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # A blank line ends the read; anything after it is ignored.
                break
            labels.append(fields[0])
            vectors.append(numpy.array([float(value) for value in fields[1:]]))
    return numpy.array(vectors), labels

In [10]:
def generate(df, lables):
    """Build vocabulary lookups and scale every word vector to unit length.

    Args:
        df: 2-D numeric array with one word vector per row (in this notebook
            it is the matrix returned by build_word_vector_matrix, not a
            pandas DataFrame).
        lables: the words, in the same order as the rows of ``df``.

    Returns:
        (W_norm, vocab, ivocab): the row-normalised matrix, a dict mapping
        word -> row index, and a dict mapping row index -> word.
    """
    vocab = {w: idx for idx, w in enumerate(lables)}
    ivocab = {idx: w for idx, w in enumerate(lables)}

    # Euclidean (L2) norm of each row. A row of all zeros has norm 0 and is
    # not special-cased.
    d = np.sum(df ** 2, 1) ** 0.5
    # Transposing lets the length-n_rows vector `d` broadcast across each row.
    W_norm = (df.T / d).T
    return (W_norm, vocab, ivocab)

In [11]:
# Load the GloVe vectors: `df` is the numpy matrix of vectors (not a DataFrame)
# and `labels_array` the matching list of words.
input_vector_file = 'glove.6B.200d.txt' 
df, labels_array  = build_word_vector_matrix(input_vector_file)

In [12]:
# `vocab` (word -> row index) is the vocabulary every later filtering step checks
# tokens against; `W` holds the normalised vectors and `ivocab` the reverse lookup.
W, vocab, ivocab = generate(df, labels_array)

In [18]:
# Peek at a few raw tweets before any cleaning.
dataset['SentimentText'][10000:10005]

10000    &quot;I can taste you on my lips and smell you...
10001    &quot;I did not have sexual relations with tha...
10002    &quot;I do it my way, I shit on folks the oppo...
10003    &quot;I do not like plastic!! grr! so silly an...
10004    &quot;I don't have the strength to stay away f...
Name: SentimentText, dtype: object

In [36]:
# Hand-made example covering a mention, digits, punctuation, '&quot', an emoticon,
# a misspelling and run-together hashtags; shows the output of light_process alone.
text = '@team_№? 123HeRE7 iS the $*($# &quottweet&quot we try2filter, correct ! Is it working? :). For detailes go to github:  #deeplearningcourse #bigfridayday #finalpresentationday'
text = light_process(text)
text

'user here is the tweet we try filter correct exclamation is it working question happy for detailes go to github deeplearningcourse bigfridayday finalpresentationday'

In [37]:
# Full pipeline on the example. `text` was already passed through light_process in
# the previous cell, and process() applies light_process again internally.
wordlist = InitializeWords()
process(text, vocab, wordlist)

'user here the tweet try filter correct exclamation working question happy for details github deep learning course big friday day final presentation day'

# Filter all tweets

In [13]:
def filter_dataset(dset, voc):
    """Clean every tweet in ``dset`` with process().

    Args:
        dset: iterable of raw tweet strings.
        voc: vocabulary passed to process() for membership tests.

    Returns:
        list of cleaned tweet strings, in the same order as ``dset``.
    """
    # The word list is only used for `prefix in wordlist` tests (see FindWord),
    # once per candidate prefix of every unknown token. A frozenset answers
    # each test by hash lookup instead of scanning the whole list.
    wordlist = frozenset(InitializeWords())
    return [process(tweet, voc, wordlist) for tweet in dset]

def multi_run_wrapper(args):
    """Call filter_dataset with a single packed argument.

    Pool.map passes exactly one object to its worker function, so the
    arguments of filter_dataset travel as one sequence and are unpacked here.
    """
    cleaned = filter_dataset(*args)
    return cleaned

In [14]:
# Eight CSV shards that are cleaned in parallel below - presumably the tweets of
# dataset.csv split into pieces; verify against how the files were produced.
# NOTE(review): absolute, machine-specific directory - point DATA_DIR at your copy.
DATA_DIR = "/home/user/Desktop/ds"
(dset1, dset2, dset3, dset4,
 dset5, dset6, dset7, dset8) = [
    pd.read_csv("%s/ds%d.csv" % (DATA_DIR, shard), delimiter=',')
    for shard in range(1, 9)
]
print(len(dset1))

197300


In [15]:
from multiprocessing import Pool

# One job per shard: the (tweets, vocabulary) pair that multi_run_wrapper unpacks
# into filter_dataset.
jobs = [(shard['SentimentText'], vocab)
        for shard in (dset1, dset2, dset3, dset4, dset5, dset6, dset7, dset8)]

# One worker per job; extra workers would sit idle.
p = Pool(len(jobs))
t1 = datetime.datetime.now()
print(t1)
try:
    # map() returns the per-shard result lists in the same order as `jobs`.
    results = p.map(multi_run_wrapper, jobs)
finally:
    # Release the worker processes whether or not the run succeeded.
    p.close()
    p.join()
print(datetime.datetime.now() - t1)

2016-05-24 19:24:18.357879
1:29:45.301593


In [19]:
# Flatten the per-shard result lists, in shard order, into one list of cleaned
# tweets. Iterating over `results` itself avoids hard-coding the number of shards
# and re-copying the growing list on every step.
list__ = [cleaned for shard_result in results for cleaned in shard_result]

In [22]:
# Spot check: one raw tweet, to compare with its cleaned counterpart in list__.
dataset['SentimentText'][1500123]

'my first bday without my grandpa '

In [31]:
# The cleaned tweet at the same position as the raw tweet inspected above.
list__[1500123]

'first day without grandpa'

In [71]:
# Pair every cleaned tweet with the sentiment label of the original dataset and
# save the result.
# NOTE(review): this rebinds `df`, which earlier held the GloVe matrix passed to
# generate(); re-running that cell after this one would hand it a DataFrame.
df = pd.DataFrame({'text': list__, 'sentiment': dataset['Sentiment']})
print(len(df))
df.to_csv("cleaned_tweet.csv", index=False)

1578625


In [74]:
# Reload the saved file as a round-trip check.
# NOTE(review): info() reports fewer non-null `text` values than rows - presumably
# tweets that were cleaned down to an empty string are read back as NaN; confirm
# and handle them before using this column.
df = pd.read_csv('cleaned_tweet.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578625 entries, 0 to 1578624
Data columns (total 2 columns):
sentiment    1578625 non-null int64
text         1578564 non-null object
dtypes: int64(1), object(1)
memory usage: 24.1+ MB


In [81]:
# Ad-hoc check of the full pipeline on a single tweet (alternative input kept below).
# Relies on `wordlist` created by the earlier InitializeWords() cell.
#t = 'getthismessage ololo hello 32641try6to6process KORREkt'
t = 'billybragsters Nay worries, shame though would of been a giggle. Are there no layby\'s wide enough for an awning in Bev?'
# The function is named `process`; the previous spelling `prosess` is not defined
# anywhere in the notebook and fails on a fresh kernel.
print(process(t, vocab, wordlist))

billy brag nay worries shame though would been giggle are there layby wide enough for awning bev
