In [1]:
import pandas as pd
import re
import collections
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('data/twitter_sentiment_analysis_data_raw.csv', encoding='latin-1', names=['label', 'id', 'data', 'query', 'author', 'tweet'])

In [3]:
data.head()

Unnamed: 0,label,id,data,query,author,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
data.loc[data['label']==4, 'label'] = 1
data.label.value_counts()

0    800000
1    800000
Name: label, dtype: int64

In [5]:
data.head()

Unnamed: 0,label,id,data,query,author,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
df = data.groupby('label').head(1250).reset_index(drop=True)
df['bias'] = 1

In [7]:
df.label.value_counts()

0    1250
1    1250
Name: label, dtype: int64

In [8]:
df.head()

Unnamed: 0,label,id,data,query,author,tweet,bias
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",1
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,1
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,1
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,1
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",1


In [9]:
train, val_test = train_test_split(df, test_size=0.4, random_state=42)

In [10]:
def build_vocabularies(data, label_col, sentence_col):

    stop_words = ["0s","3a","3b","3d","6b","6o","a","a1","a2","a3","a4","ab","able","about","above","abst","ac","accordance","according","accordingly","across","act","actually","ad","added","adj","ae","af","affected","affecting","affects","after","afterwards","ag","again","against","ah","ain","ain't","aj","al","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","amoungst","amount","an","and","announce","another","any","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","ao","ap","apart","apparently","appear","appreciate","appropriate","approximately","ar","are","aren","arent","aren't","arise","around","as","a's","aside","ask","asking","associated","at","au","auth","av","available","aw","away","awfully","ax","ay","az","b","b1","b2","b3","ba","back","bc","bd","be","became","because","become","becomes","becoming","been","before","beforehand","begin","beginning","beginnings","begins","behind","being","believe","below","beside","besides","best","better","between","beyond","bi","bill","biol","bj","bk","bl","bn","both","bottom","bp","br","brief","briefly","bs","bt","bu","but","bx","by","c","c1","c2","c3","ca","call","came","can","cannot","cant","can't","cause","causes","cc","cd","ce","certain","certainly","cf","cg","ch","changes","ci","cit","cj","cl","clearly","cm","c'mon","cn","co","com","come","comes","con","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn","couldnt","couldn't","course","cp","cq","cr","cry","cs","c's","ct","cu","currently","cv","cx","cy","cz","d","d2","da","date","dc","dd","de","definitely","describe","described","despite","detail","df","di","did","didn","didn't","different","dj","dk","dl","do","does","doesn","doesn't","doing","don","done","don't","down","downwards","dp","dr","ds","dt","du","due","during","dx","dy","e","e2","e3","ea","each","ec","ed","edu","ee","ef","effect","eg","ei","eight","eighty","either","ej","el","eleven","else","elsewhere","em","empty","en","end","ending","enough","entirely","eo","ep","eq","er","es","especially","est","et","et-al","etc","eu","ev","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","ey","f","f2","fa","far","fc","few","ff","fi","fifteen","fifth","fify","fill","find","fire","first","five","fix","fj","fl","fn","fo","followed","following","follows","for","former","formerly","forth","forty","found","four","fr","from","front","fs","ft","fu","full","further","furthermore","fy","g","ga","gave","ge","get","gets","getting","gi","give","given","gives","giving","gj","gl","go","goes","going","gone","got","gotten","gr","greetings","gs","gy","h","h2","h3","had","hadn","hadn't","happens","hardly","has","hasn","hasnt","hasn't","have","haven","haven't","having","he","hed","he'd","he'll","hello","help","hence","her","here","hereafter","hereby","herein","heres","here's","hereupon","hers","herself","hes","he's","hh","hi","hid","him","himself","his","hither","hj","ho","home","hopefully","how","howbeit","however","how's","hr","hs","http","hu","hundred","hy","i","i2","i3","i4","i6","i7","i8","ia","ib","ibid","ic","id","i'd","ie","if","ig","ignored","ih","ii","ij","il","i'll","im","i'm","immediate","immediately","importance","important","in","inasmuch","inc","indeed","index","indicate","indicated","indicates","information","inner","insofar","instead","interest","into","invention","inward","io","ip","iq","ir","is","isn","isn't","it","itd","it'd","it'll","its","it's","itself","iv","i've","ix","iy","iz","j","jj","jr","js","jt","ju","just","k","ke","keep","keeps","kept","kg","kj","km","know","known","knows","ko","l","l2","la","largely","last","lately","later","latter","latterly","lb","lc","le","least","les","less","lest","let","lets","let's","lf","like","liked","likely","line","little","lj","ll","ll","ln","lo","look","looking","looks","los","lr","ls","lt","ltd","m","m2","ma","made","mainly","make","makes","many","may","maybe","me","mean","means","meantime","meanwhile","merely","mg","might","mightn","mightn't","mill","million","mine","miss","ml","mn","mo","more","moreover","most","mostly","move","mr","mrs","ms","mt","mu","much","mug","must","mustn","mustn't","my","myself","n","n2","na","name","namely","nay","nc","nd","ne","near","nearly","necessarily","necessary","need","needn","needn't","needs","neither","never","nevertheless","new","next","ng","ni","nine","ninety","nj","nl","nn","no","nobody","non","none","nonetheless","noone","nor","normally","nos","not","noted","nothing","novel","now","nowhere","nr","ns","nt","ny","o","oa","ob","obtain","obtained","obviously","oc","od","of","off","often","og","oh","oi","oj","ok","okay","ol","old","om","omitted","on","once","one","ones","only","onto","oo","op","oq","or","ord","os","ot","other","others","otherwise","ou","ought","our","ours","ourselves","out","outside","over","overall","ow","owing","own","ox","oz","p","p1","p2","p3","page","pagecount","pages","par","part","particular","particularly","pas","past","pc","pd","pe","per","perhaps","pf","ph","pi","pj","pk","pl","placed","please","plus","pm","pn","po","poorly","possible","possibly","potentially","pp","pq","pr","predominantly","present","presumably","previously","primarily","probably","promptly","proud","provides","ps","pt","pu","put","py","q","qj","qu","que","quickly","quite","qv","r","r2","ra","ran","rather","rc","rd","re","readily","really","reasonably","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","research-articl","respectively","resulted","resulting","results","rf","rh","ri","right","rj","rl","rm","rn","ro","rq","rr","rs","rt","ru","run","rv","ry","s","s2","sa","said","same","saw","say","saying","says","sc","sd","se","sec","second","secondly","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","sf","shall","shan","shan't","she","shed","she'd","she'll","shes","she's","should","shouldn","shouldn't","should've","show","showed","shown","showns","shows","si","side","significant","significantly","similar","similarly","since","sincere","six","sixty","sj","sl","slightly","sm","sn","so","some","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","sp","specifically","specified","specify","specifying","sq","sr","ss","st","still","stop","strongly","sub","substantially","successfully","such","sufficiently","suggest","sup","sure","sy","system","sz","t","t1","t2","t3","take","taken","taking","tb","tc","td","te","tell","ten","tends","tf","th","than","thank","thanks","thanx","that","that'll","thats","that's","that've","the","their","theirs","them","themselves","then","thence","there","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","there's","thereto","thereupon","there've","these","they","theyd","they'd","they'll","theyre","they're","they've","thickv","thin","think","third","this","thorough","thoroughly","those","thou","though","thoughh","thousand","three","throug","through","throughout","thru","thus","ti","til","tip","tj","tl","tm","tn","to","together","too","took","top","toward","towards","tp","tq","tr","tried","tries","truly","try","trying","ts","t's","tt","tv","twelve","twenty","twice","two","tx","u","u201d","ue","ui","uj","uk","um","un","under","unfortunately","unless","unlike","unlikely","until","unto","uo","up","upon","ups","ur","us","use","used","useful","usefully","usefulness","uses","using","usually","ut","v","va","value","various","vd","ve","ve","very","via","viz","vj","vo","vol","vols","volumtype","vq","vs","vt","vu","w","wa","want","wants","was","wasn","wasnt","wasn't","way","we","wed","we'd","welcome","well","we'll","well-b","went","were","we're","weren","werent","weren't","we've","what","whatever","what'll","whats","what's","when","whence","whenever","when's","where","whereafter","whereas","whereby","wherein","wheres","where's","whereupon","wherever","whether","which","while","whim","whither","who","whod","whoever","whole","who'll","whom","whomever","whos","who's","whose","why","why's","wi","widely","will","willing","wish","with","within","without","wo","won","wonder","wont","won't","words","world","would","wouldn","wouldnt","wouldn't","www","x","x1","x2","x3","xf","xi","xj","xk","xl","xn","xo","xs","xt","xv","xx","y","y2","yes","yet","yj","yl","you","youd","you'd","you'll","your","youre","you're","yours","yourself","yourselves","you've","yr","ys","yt","z","zero","zi","zz"]
    re_punctuations = """,|'|\.|!|\?|&|"|:|[|]|;|quot|\\/|ing|\\(|\\)"""
    
    pos_vocabulary = [
        re.sub(re_punctuations,'', word.lower())
        for tweet in df.loc[df[label_col]==1, sentence_col].values.tolist() 
        for word in tweet.split(' ') 
        if len(word) > 1 
        and '@' not in word 
        and re.sub(re_punctuations,'', word.lower()) not in stop_words 
        and 'www' not in word
        and 'http' not in word
        and '.com' not in word
    ]
    
    neg_vocabulary = [
        re.sub(re_punctuations,'', word.lower())
        for tweet in data.loc[df[label_col]==0, sentence_col].values.tolist() 
        for word in tweet.split(' ') 
        if len(word) > 1 
        and '@' not in word 
        and re.sub(re_punctuations,'', word.lower()) not in stop_words 
        and 'www' not in word
        and 'http' not in word
        and '.com' not in word
    ]

    
    return pos_vocabulary, neg_vocabulary
    
pos_vocabulary, neg_vocabulary = build_vocabularies(train, 'label', 'tweet')

In [11]:
neg_freq = collections.Counter(x for x in neg_vocabulary)
pos_freq = collections.Counter(x for x in pos_vocabulary)

In [12]:
neg_freq

Counter({'sick': 17,
         'feel': 37,
         'bit': 5,
         'medicine': 1,
         'hope': 11,
         'good': 28,
         'night': 17,
         'sleep': 33,
         'ohh': 2,
         'suppose': 1,
         'snow': 4,
         'wtf': 1,
         '_secretgarden_': 1,
         'havent': 7,
         'porn': 1,
         'spammers': 1,
         'dont': 39,
         'check': 5,
         'followers': 2,
         'tweets': 4,
         'stillllll': 1,
         'arrived': 1,
         'ordered': 2,
         'maternity': 1,
         'clothes': 2,
         'online': 3,
         'today': 27,
         'someth': 4,
         'strange': 1,
         'didnt': 14,
         'order': 3,
         'size': 1,
         'stuff': 8,
         'boredd': 2,
         'gah': 2,
         'ive': 17,
         'toooo': 1,
         'muchh': 1,
         'cooooffffeeeeeeeee': 1,
         'bahaha': 1,
         'art': 3,
         'tomorrow': 26,
         'nice': 5,
         'youve': 2,
         'joined': 1,
     

In [13]:
train.head()

Unnamed: 0,label,id,data,query,author,tweet,bias
348,0,1467899025,Mon Apr 06 22:43:06 PDT 2009,NO_QUERY,oup,"still sick. feeling a bit better, got some new...",1
1089,0,1468077685,Mon Apr 06 23:35:28 PDT 2009,NO_QUERY,ann_donnelly,_secretgarden_ I haven't gotten any porn spamm...,1
1850,1,1467934783,Mon Apr 06 22:53:09 PDT 2009,NO_QUERY,SherylBreuker,@PhoneBoy I can't wait to read that.,1
300,0,1467890079,Mon Apr 06 22:40:38 PDT 2009,NO_QUERY,BATMANNN,@jokerrrr It stillllll hasn't arrived,1
1658,1,1467898790,Mon Apr 06 22:43:02 PDT 2009,NO_QUERY,WindyPorter,@JonathanRKnight There is no way on earth that...,1


In [14]:
re_punctuations = """,|'|\.|!|\?|&|"|:|[|]|;|quot|\\/|ing|\\(|\\)"""
train['adjusted_tweets'] = train.tweet.str.lower().str.replace(re_punctuations, '', regex=True)
val_test['adjusted_tweets'] = val_test.tweet.str.lower().str.replace(re_punctuations, '', regex=True)

In [15]:
def calculate_scores(sentence, pos_frequency_dict=pos_freq, neg_frequency_dict=neg_freq):
    words_list = sentence.split(' ')
    pos_value = 0
    neg_value = 0
    for word in words_list:
        pos_value += pos_frequency_dict[word]
        neg_value += neg_frequency_dict[word]
    return pos_value, neg_value        

In [16]:
train[["pos_value", "neg_value"]] = train["adjusted_tweets"].map(calculate_scores).tolist()
val_test[["pos_value", "neg_value"]] = val_test["adjusted_tweets"].map(calculate_scores).tolist()

In [17]:
val_test.head()

Unnamed: 0,label,id,data,query,author,tweet,bias,adjusted_tweets,pos_value,neg_value
1447,1,1467861286,Mon Apr 06 22:32:57 PDT 2009,NO_QUERY,llliizzziiiee,@Westneyrhindxx hello westney what are you do...,1,@westneyrhindxx hello westney what are you do x,87,22
1114,0,1468083884,Mon Apr 06 23:37:29 PDT 2009,NO_QUERY,pstannard1,"got woken up this mornng at 7am - damn Lorry ,...",1,got woken up this mornng at 7am - damn lorry ...,145,50
1064,0,1468072530,Mon Apr 06 23:33:55 PDT 2009,NO_QUERY,SarahDeG,Why do I keep looking...I know that what I rea...,1,why do i keep looki know that what i read is g...,123,52
2287,1,1468007017,Mon Apr 06 23:13:40 PDT 2009,NO_QUERY,Hetty4Christ,Pray for my friend @growline late work...pray ...,1,pray for my friend @growline late workpray it ...,194,48
1537,1,1467864444,Mon Apr 06 22:33:48 PDT 2009,NO_QUERY,elysemize,@JonathanRKnight i do embrace the simple life...,1,@jonathanrknight i do embrace the simple life...,211,62


In [21]:
validation, test = train_test_split(val_test, test_size=0.5, random_state=42)

X_train, y_train = train[['bias', 'pos_value', 'neg_value']].to_numpy(), train['label'].to_numpy()
X_val, y_val = validation[['bias', 'pos_value', 'neg_value']].to_numpy(), validation['label'].to_numpy()
X_test, y_test = test[['bias', 'pos_value', 'neg_value']].to_numpy(), test['label'].to_numpy()