In [16]:
import re
from string import punctuation

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# function to remove user handles
def remove_pattern(input_txt, pattern):
    """Delete every substring of ``input_txt`` that matches ``pattern``.

    Args:
        input_txt: Text to clean.
        pattern: Regular expression (string or compiled pattern) describing
            the text to remove, e.g. ``r"@[\\w]*"`` for Twitter handles.

    Returns:
        ``input_txt`` with all non-overlapping matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself. Feeding each matched string back in
    # as a new regex would misinterpret metacharacters contained in the match
    # and would let a short match (e.g. "@ab") eat the prefix of a longer one
    # (e.g. "@abc").
    return re.sub(pattern, '', input_txt)

In [3]:
def handle_emojis(tweet):
    """
    Replace ASCII emoticons in a tweet with a space-padded word naming them.

    The rules are applied one after another in the order listed below
    (smile, laugh, love, wink, sad, cry); each rule works on the text
    produced by the previous one. The rewritten tweet is returned.
    """
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' smiling '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' laughing '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' love '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', ' wink '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' sad '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' cry '),
    )

    for emoticon_regex, word in emoticon_rules:
        tweet = re.sub(emoticon_regex, word, tweet)

    return tweet

In [4]:
#converting to full meaningful words
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"ab": "about",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"mkt": "market",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"yrs": "years",
"ym": "young man",
"gr8": "great"
}

In [5]:
def dict_words(text, dictionary):
    """
    Replace whole whitespace-delimited words using a lookup dictionary.

    Args:
        text: Input string.
        dictionary: Mapping whose keys are the lower-case words to replace and
            whose values are the replacement strings.

    Returns:
        ``text`` in which every token whose lower-cased form is a key of
        ``dictionary`` has been swapped for the mapped value. All other
        tokens and all whitespace are returned unchanged.
    """
    def _expand(match):
        token = match.group(0)
        return dictionary.get(token.lower(), token)

    # Substitute token by token. A plain str.replace on the whole text would
    # also rewrite the word wherever it occurs inside longer words
    # (e.g. "u" inside "about").
    return re.sub(r'\S+', _expand, text)

In [374]:
# Pre-processing function
def preprocess_text(data, col_name):
    """
    Clean the tweets stored in ``data[col_name]`` for bag-of-words modelling.

    Steps, in order: lower-case, strip @handles, convert HTML to plain text,
    map emoticons to words, replace URLs with the token ``URL``, expand chat
    abbreviations via ``short_word_dict``, drop characters outside the BMP,
    blank out non-ASCII characters, drop words of one or two characters,
    squeeze runs of a repeated character down to two, tokenize with NLTK,
    remove English stop words and punctuation tokens, and re-join the
    remaining tokens with single spaces.

    Args:
        data: DataFrame holding the text. It is modified in place.
        col_name: Name of the string column to clean. Missing values are not
            handled here, so drop or fill them before calling.

    Returns:
        The same ``data`` object, with ``col_name`` overwritten by the
        cleaned text.
    """
    # Converting string to lower case.
    # NOTE(review): because this runs before handle_emojis, emoticons written
    # with a capital letter (":D", "XD", ";D") have already become ":d"/"xd"
    # and can no longer match its patterns -- confirm whether that is intended.
    text = data[col_name].str.lower()

    # Remove user handles. Raw string: "\w" is not a valid str escape.
    text = text.apply(lambda tweet: remove_pattern(tweet, r"@[\w]*"))

    # Converting HTML markup/entities to plain text
    text = text.apply(lambda tweet: BeautifulSoup(tweet, 'lxml').get_text())

    # Converting emoticons to the word they represent
    text = text.apply(handle_emojis)

    # Replacing URLs in the tweets with a placeholder token
    text = text.apply(lambda tweet: re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet))

    # Converting abbreviations to their full forms.
    # Periods and the ellipsis character are turned into spaces first so that
    # tokens such as "omg..." are found in the dictionary. This must be a
    # regex replace on the strings: Series.replace() without regex=True only
    # matches whole cell values and therefore changed nothing.
    text = (
        text.str.replace(r'[.…]', ' ', regex=True)
        .str.split()
        .apply(lambda tokens: ' '.join(short_word_dict.get(tok, tok) for tok in tokens))
    )

    # Remove single space remaining at the front of the tweet.
    text = text.str.lstrip(' ')

    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode
    text = text.apply(lambda tweet: ''.join(ch for ch in tweet if ch <= '\uFFFF'))

    # Replace the remaining non-ASCII characters with a space
    text = text.apply(lambda tweet: ''.join(ch if ord(ch) < 128 else ' ' for ch in tweet))

    # Remove words with 2 or fewer characters
    text = text.apply(lambda tweet: re.sub(r'\b\w{1,2}\b', '', tweet))

    # Squeeze any run of the same character down to two occurrences
    # funnnnny --> funny
    text = text.apply(lambda tweet: re.sub(r'(.)\1+', r'\1\1', tweet))

    # Tokenizing data
    tokens = text.apply(nltk.word_tokenize)

    # Stop-word removal; a set gives constant-time membership checks.
    stop_words = (
        set(stopwords.words('english'))
        | set(punctuation)
        | {'``', "'s", "...", "n't", "'re", "''"}
    )
    tokens = tokens.apply(lambda row: [word for word in row if word not in stop_words])

    # NOTE: stemming (PorterStemmer) and lemmatization (WordNetLemmatizer)
    # were tried at this point and are currently disabled.

    # Write the result back once, so a failure in any step above leaves the
    # caller's column untouched.
    data[col_name] = tokens.apply(lambda row: ' '.join(row))

    return data

In [17]:
# Load the raw training tweets, discarding every row that has a missing value.
train_data = pd.read_csv('data/train.csv').dropna()
train_data.head()

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,#sxswnui #sxsw #apple defining language of tou...,1
1,1851,Learning ab Google doodles! All doodles should...,1
2,2689,one of the most in-your-face ex. of stealing t...,2
3,4525,This iPhone #SXSW app would b pretty awesome i...,0
4,3604,Line outside the Apple store in Austin waiting...,1


In [18]:
# processed_train = preprocess_text(train_data, 'tweet')

In [19]:
# Load the cleaned training tweets from disk and drop rows with missing values.
# NOTE(review): presumably this file is the saved output of
# preprocess_text(train_data, 'tweet') -- confirm.
processed_train = pd.read_csv('data/processed_data.csv').dropna()

In [20]:
# Preview the first ten rows of the processed training set.
processed_train.head(10)

Unnamed: 0,tweet_id,tweet,sentiment
0,1701,sxswnui sxsw apple defining language touch dif...,1
1,1851,learning google doodles doodles light funny in...,1
2,2689,one -your-face stealing show years sxsw apple ...,2
3,4525,iphone sxsw app would pretty awesome crash eve...,0
4,3604,line outside apple store austin waiting new ip...,1
5,966,technews one lone dude awaits ipad apple sxsw ...,1
6,1395,sxsw tips prince npr videos toy shopping zucke...,1
7,8182,user new ubersocial iphone app store includes ...,1
8,8835,free sxsw sampler itunes link freemusic,2
9,883,think might weekend without seeing ipad case t...,2


In [21]:
# Load the processed test tweets. No rows are dropped here; the tweet_id
# column of this frame is what gets written to the submission files.
test_data = pd.read_csv('data/test_processed.csv')
test_data.head()

Unnamed: 0,tweet_id,tweet
0,7506,audience prototyping tools use sketchbooks/sha...
1,7992,sxsw send best photos videos .. link citizen_j...
2,247,pic winning ipad unsix sxsw link cont link
3,7688,google marissa mayer mobile phone cursor physi...
4,3294,sxsw google maps even cooler thought


In [22]:
# Learn the vocabulary from the training tweets and encode them as a
# token-count matrix (default CountVectorizer settings).
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(processed_train['tweet'])

In [23]:
# (number of training tweets, size of the fitted vocabulary)
X_train.shape

(7272, 8332)

In [24]:
# Encode the test tweets with the vocabulary fitted on the training set
# (transform only -- the vectorizer is not refitted).
X_test = count_vectorizer.transform(test_data['tweet'])

In [25]:
# (number of test tweets, size of the fitted vocabulary)
X_test.shape

(1819, 8332)

In [29]:
# Fit a logistic-regression baseline on the count features and predict the
# test set. The training data is used exactly as loaded: no resampling is
# applied anywhere in this notebook.
# NOTE(review): max_iter=7600 is far above the library default, presumably so
# the solver converges -- confirm the value is still needed.
lr_model = LogisticRegression(max_iter=7600)
lr_model.fit(X_train, processed_train['sentiment'])
lr_pred = lr_model.predict(X_test)

In [30]:
# Pair each test tweet_id with its predicted class, write the submission file
# (the output directory must already exist), and show the predicted class counts.
to_submit = pd.DataFrame({'tweet_id': test_data['tweet_id'], 'sentiment': lr_pred})
to_submit.to_csv('probability_manipulation/lr_pred_basic.csv', index=False)
to_submit['sentiment'].value_counts()

1    1246
2     516
0      57
Name: sentiment, dtype: int64

In [49]:
# Per-class probability cut-offs as (class label, cut-off) pairs, in order of
# precedence: the first listed class whose predicted probability exceeds its
# cut-off is chosen; rows where no cut-off is exceeded keep the most probable
# class.
# NOTE(review): the precedence (0, then 3, 1, 2) is an assumption -- confirm
# which class should win when several cut-offs are exceeded for one tweet.
CLASS_THRESHOLDS = ((0, 0.2), (3, 0.3), (1, 0.75), (2, 0.4))


def apply_class_thresholds(probs, thresholds, classes=None):
    """Turn class probabilities into labels using per-class cut-offs.

    Args:
        probs: Array of shape (n_rows, n_classes) with one probability per
            class and row. It is not modified.
        thresholds: Iterable of ``(label, cutoff)`` pairs in order of
            precedence. A row is assigned the first ``label`` whose
            probability is strictly greater than ``cutoff``.
        classes: Class label of each column of ``probs``. Defaults to
            ``0 .. n_classes - 1``.

    Returns:
        1-D array with one label per row. Rows for which no cut-off is
        exceeded get the class with the highest probability (ties go to the
        lowest column index).
    """
    probs = np.asarray(probs)
    classes = np.arange(probs.shape[1]) if classes is None else np.asarray(classes)
    column_of = {label: col for col, label in enumerate(classes.tolist())}

    # Start from the plain arg-max and override it rule by rule; a row that
    # has been claimed by one rule is never touched by a later one.
    chosen_col = probs.argmax(axis=1)
    undecided = np.ones(len(probs), dtype=bool)
    for label, cutoff in thresholds:
        col = column_of[label]
        hit = undecided & (probs[:, col] > cutoff)
        chosen_col[hit] = col
        undecided &= ~hit

    return classes[chosen_col]


probs = lr_model.predict_proba(X_test)
final_class_arr = apply_class_thresholds(probs, CLASS_THRESHOLDS, lr_model.classes_)

# Write the threshold-adjusted submission (the output directory must already
# exist) and show the resulting class counts.
to_submit = pd.DataFrame({'tweet_id': test_data['tweet_id'], 'sentiment': final_class_arr})
to_submit.to_csv('probability_manipulation/pred_proba.csv', index=False)
to_submit['sentiment'].value_counts()

<IPython.core.display.Javascript object>

1    1113
2     568
0     136
3       2
Name: sentiment, dtype: int64

In [32]:
# Class distribution of the training labels.
processed_train['sentiment'].value_counts()

1    4309
2    2382
0     456
3     125
Name: sentiment, dtype: int64