# Read data

In [102]:
import nltk
import pandas as pd
import re
import numpy as np 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [80]:
# Load the training and test tweets; both files use the "Id" column as index.
tweets = pd.read_csv("data/train.csv", index_col="Id")
test_tweets = pd.read_csv("data/test.csv", index_col="Id")

In [81]:
# Peek at the first rows of the training tweets (Category label + raw Tweet text).
tweets.head()

Unnamed: 0_level_0,Category,Tweet
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
635769805279248384,negative,Not Available
635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...
635950258682523648,neutral,"Mar if you have an iOS device, you should down..."
636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...
636100906224848896,positive,Not sure how to start your publication on iOS?...


In [82]:
# Peek at the first rows of the test tweets.
test_tweets.head()

Unnamed: 0_level_0,Tweet
Id,Unnamed: 1_level_1
628949369883000832,dear @Microsoft the newOoffice for Mac is grea...
628976607420645377,@Microsoft how about you make a system that do...
629023169169518592,Not Available
629179223232479232,Not Available
629186282179153920,If I make a game as a #windows10 Universal App...


In [83]:
# Summary statistics for the training frame.
tweets.describe()

Unnamed: 0,Category,Tweet
count,5969,5969
unique,3,5417
top,positive,Not Available
freq,2888,548


In [84]:
# Class balance of the training labels: absolute counts first, then proportions.
category_labels = tweets["Category"]
print(category_labels.value_counts(), end="\n\n")
print(category_labels.value_counts(normalize=True))

positive    2888
neutral     2125
negative     956
Name: Category, dtype: int64

positive    0.483833
neutral     0.356006
negative    0.160161
Name: Category, dtype: float64


In [85]:
# Check both columns for NaN values, then count the tweets whose text is the
# literal placeholder "Not Available".
for column_name in ("Category", "Tweet"):
    print(tweets[column_name].isna().value_counts())

placeholder_mask = tweets["Tweet"] == "Not Available"
print("\nNot available", len(tweets[placeholder_mask]))

False    5969
Name: Category, dtype: int64
False    5969
Name: Tweet, dtype: int64

Not available 548


In [86]:
# Label distribution among the tweets whose text is "Not Available":
# absolute counts first, then proportions.
placeholder_labels = tweets.loc[tweets["Tweet"] == "Not Available", "Category"]
print(placeholder_labels.value_counts(), end="\n\n")
print(placeholder_labels.value_counts(normalize=True))

positive    289
neutral     172
negative     87
Name: Category, dtype: int64

positive    0.527372
neutral     0.313869
negative    0.158759
Name: Category, dtype: float64


# Data preprocessing

In [87]:
def replace_pattern(txt, pattern, replaced_text=""):
    """Replace every match of ``pattern`` in ``txt`` with ``replaced_text``.

    Args:
        txt: Text to search.
        pattern: Regular expression (string or compiled pattern) to look for.
        replaced_text: Literal replacement for each match. Defaults to the
            empty string, i.e. matches are removed.

    Returns:
        A new string in which all non-overlapping matches are replaced.
    """
    # One regex pass instead of findall + str.replace per match. The latter
    # corrupts a longer match that starts with a shorter one ("@a" inside
    # "@ab"), gives results that depend on set iteration order, and fails on
    # patterns with capture groups (findall then yields tuples, not strings).
    # The callable keeps ``replaced_text`` literal: no backslash expansion.
    return re.sub(pattern, lambda _match: replaced_text, txt)

# Try the helper on one training tweet: show the raw text, the text with the
# mention masked as "@user", and the text with every character outside a-z
# turned into a space.
sample_tweet = tweets.iloc[3]["Tweet"]

print(sample_tweet)
print(replace_pattern(sample_tweet, r"@[\w]*", "@user"))
print(replace_pattern(sample_tweet, r"[^a-z]", " "))

@jimmie_vanagon my phone does not run on latest IOS which may account for problem the other day .. time it was replaced
@user my phone does not run on latest IOS which may account for problem the other day .. time it was replaced
 jimmie vanagon my phone does not run on latest     which may account for problem the other day    time it was replaced


In [88]:
def remove_hashtag(text):
    """Strip the leading ``#`` from every hashtag and collect the tag names.

    Args:
        text: Tweet text that may contain ``#hashtag`` tokens.

    Returns:
        A ``(text, hashtags)`` tuple: the text with each ``#`` removed (the tag
        word itself stays in place) and the list of tag names, without ``#``,
        in order of appearance. A ``#`` that is not followed by a word
        character is removed too and contributes an empty string to the list.
    """
    hashtags = []

    def _strip_marker(match):
        # Record the tag and keep only the word part in the text.
        hashtags.append(match.group(1))
        return match.group(1)

    # Single pass over the text; the matched hashtag is never reused as a
    # regex pattern, and the text is not re-scanned once per hashtag.
    text = re.sub(r"#([\w]*)", _strip_marker, text)
    return text, hashtags
    
#     hashatags 
# delete_hashtag(tweets[0, 1])


In [89]:
# English stop words from the NLTK corpus, stored as a set for membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not stop words and are longer than two characters."""
    kept = []
    for word in tokens:
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(word)
    return kept

In [90]:
# Emoticons to extract from tweets. Every alternative is its own capture group,
# so findall() yields one tuple per hit with a single non-empty entry;
# "".join(...) collapses that tuple back into the matched emoticon.
# The letter emoticons (":p", ":d") are lowercase only.
# Raw string: "\)" and "\(" are regex escapes, not valid Python string escapes,
# and a non-raw literal triggers an invalid-escape warning.
RE_EMOTICONS = re.compile(r"(:-?\))|(:p)|(:d+)|(:-?\()|(:/)|(;-?\))|(<3)|(=\))|(\)-?:)|(:'\()|(8\))")

# Quick check: one tweet containing an emoticon, one without any.
text = "Look @Qualcomm I found the 1st #Snapdragon Phone in my stuff from #Toshiba and @Microsoft. Still Working :)"
print(["".join(emot) for emot in RE_EMOTICONS.findall(text)])
text = "Not Available"
print(["".join(emot) for emot in RE_EMOTICONS.findall(text)])

[':)']
[]


In [98]:
def _prepare_tweet(tweet):
    """Clean one raw tweet and return ``(text, hashtags, emoticons, tokens)``."""
    prepared_tweet = tweet.lower().strip()
    # URLs are dropped before emoticon extraction, which keeps the "://" of a
    # link from matching the ":/" emoticon.
    prepared_tweet = replace_pattern(prepared_tweet, r"http[\w:/.]*")
    # Emoticons are collected here, before punctuation is blanked out below.
    emots = ["".join(emot) for emot in RE_EMOTICONS.findall(prepared_tweet)]

    prepared_tweet = replace_pattern(prepared_tweet, r"@[\w]*")
    prepared_tweet, hashtags = remove_hashtag(prepared_tweet)
    # Everything that is not a lowercase ASCII letter becomes a space.
    prepared_tweet = replace_pattern(prepared_tweet, r"[^a-z]", " ")

    tokens = remove_stopwords(nltk.word_tokenize(prepared_tweet))
    return prepared_tweet, hashtags, emots, tokens + emots


def prepare_data(data):
    """Add the preprocessing columns to a frame of tweets.

    For every value in ``data["Tweet"]`` the text is lowercased and stripped,
    URLs and @mentions are removed, the ``#`` of hashtags is dropped, all
    characters outside ``a-z`` are replaced by spaces, and the result is
    tokenised and filtered with ``remove_stopwords``.

    Args:
        data: DataFrame with a string column ``"Tweet"``.

    Returns:
        The same ``data`` object (it is modified in place) with four new
        columns: ``"PreprocessedTweet"`` (cleaned text), ``"Hashtags"`` (list
        of tag names), ``"Emoticons"`` (list of emoticons found) and
        ``"Tokens"`` (filtered word tokens followed by the emoticons).
    """
    prepared = [_prepare_tweet(tweet) for tweet in data["Tweet"]]

    # Assign whole columns instead of writing into the rows yielded by
    # iterrows(): those rows may be copies, in which case the writes never
    # reach the frame. dtype=object keeps each list as a single cell value.
    column_names = ("PreprocessedTweet", "Hashtags", "Emoticons", "Tokens")
    for position, column_name in enumerate(column_names):
        data[column_name] = pd.Series(
            [record[position] for record in prepared],
            index=data.index,
            dtype=object,
        )
    return data

In [99]:
# Add the PreprocessedTweet, Hashtags, Emoticons and Tokens columns to the
# training tweets and inspect the first ten rows.
tweets = prepare_data(tweets)
tweets.head(10)

Unnamed: 0_level_0,Category,Tweet,PreprocessedTweet,Hashtags,Emoticons,Tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
635769805279248384,negative,Not Available,not available,[],[],[available]
635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...,ios app transport security mm need to check...,[],[],"[ios, app, transport, security, need, check, p..."
635950258682523648,neutral,"Mar if you have an iOS device, you should down...",mar if you have an ios device you should down...,[],[],"[mar, ios, device, download, app]"
636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...,my phone does not run on latest ios which may...,[],[],"[phone, run, latest, ios, may, account, proble..."
636100906224848896,positive,Not sure how to start your publication on iOS?...,not sure how to start your publication on ios ...,[],[],"[sure, start, publication, ios, live, helping,..."
636176272947744772,neutral,"Two Dollar Tuesday is here with Forklift 2, Qu...",two dollar tuesday is here with forklift qu...,[],[],"[two, dollar, tuesday, forklift, quickkey, ios..."
636276311560859648,neutral,If you're not already signed up to test my iOS...,if you re not already signed up to test my ios...,[],[],"[already, signed, test, ios, game, nows, chance]"
636302400546975744,neutral,"YouTube Gaming Officially Launches On Web, And...",youtube gaming officially launches on web and...,[webseries],[],"[youtube, gaming, officially, launches, web, a..."
636356154151575552,neutral,YouTube Gaming Launches Tomorrow with iOS and ...,youtube gaming launches tomorrow with ios and ...,"[ios, game]",[],"[youtube, gaming, launches, tomorrow, ios, and..."
636360240921972736,neutral,@astrill Yashan from BBC @bbcchinese the VPN a...,yashan from bbc the vpn access on ios may be...,[],[],"[yashan, bbc, vpn, access, ios, may, limited, ..."


In [101]:
# Run the same preprocessing on the test tweets and inspect the first five rows.
test_tweets = prepare_data(test_tweets)
test_tweets.head(5)

Unnamed: 0_level_0,Tweet,PreprocessedTweet,Hashtags,Emoticons,Tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
628949369883000832,dear @Microsoft the newOoffice for Mac is grea...,dear the newooffice for mac is great and all ...,[],[],"[dear, newooffice, mac, great, lync, update, mon]"
628976607420645377,@Microsoft how about you make a system that do...,how about you make a system that doesn t eat ...,[],[],"[make, system, eat, friggin, discs, time, happ..."
629023169169518592,Not Available,not available,[],[],[available]
629179223232479232,Not Available,not available,[],[],[available]
629186282179153920,If I make a game as a #windows10 Universal App...,if i make a game as a windows universal app ...,"[windows10, xboxone]",[],"[make, game, windows, universal, app, xboxone,..."


## Bag of words 

In [94]:
def dummy(doc):
    """Identity function: return the document unchanged."""
    return doc

# The "Tokens" column already holds token lists, so the vectoriser's
# preprocessing and tokenising steps are both replaced by the identity
# function. token_pattern=None states that the default pattern is not used,
# which avoids scikit-learn's unused-parameter warning when a custom
# tokenizer is supplied.
bow_vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, token_pattern=None)

X_train_counts = bow_vectorizer.fit_transform(tweets['Tokens'])
bow_vectorizer.vocabulary_

{'available': 619,
 'ios': 4762,
 'app': 403,
 'transport': 9614,
 'security': 8164,
 'need': 6198,
 'check': 1562,
 'party': 6697,
 'network': 6224,
 'pod': 6956,
 'supports': 9093,
 'mar': 5629,
 'device': 2479,
 'download': 2709,
 'phone': 6835,
 'run': 7952,
 'latest': 5186,
 'may': 5727,
 'account': 55,
 'problem': 7163,
 'day': 2286,
 'time': 9461,
 'replaced': 7651,
 'sure': 9099,
 'start': 8829,
 'publication': 7279,
 'live': 5379,
 'helping': 4237,
 'ask': 509,
 'anything': 386,
 'sessions': 8247,
 'today': 9491,
 'friday': 3626,
 'two': 9761,
 'dollar': 2658,
 'tuesday': 9713,
 'forklift': 3552,
 'quickkey': 7348,
 'suite': 9040,
 'pages': 6631,
 'already': 259,
 'signed': 8424,
 'test': 9326,
 'game': 3724,
 'nows': 6357,
 'chance': 1515,
 'youtube': 10475,
 'gaming': 3734,
 'officially': 6442,
 'launches': 5195,
 'web': 10168,
 'android': 320,
 'august': 589,
 'finally': 3389,
 'going': 3892,
 'webseries': 10174,
 'tomorrow': 9519,
 'apps': 440,
 'head': 4180,
 'twitch': 97

# Models

## Multinomial Naive Bayes 

In [95]:
# Fit a multinomial Naive Bayes classifier on the bag-of-words counts.
clf = MultinomialNB()
clf = clf.fit(X_train_counts, tweets['Category'])

In [96]:
# Accuracy on the same data the classifier was fitted on, so this is an
# in-sample figure rather than a held-out estimate.
predicted = clf.predict(X_train_counts)
(predicted == tweets['Category']).mean()

0.8004690902998828

In [97]:
# Encode the test tokens with the vocabulary fitted on the training tweets,
# predict a category for each one and write the Id/Category pairs to disk.
X_test_counts = bow_vectorizer.transform(test_tweets['Tokens'])
test_predicted = clf.predict(X_test_counts)

submisson = pd.DataFrame({'Id': test_tweets.index, 'Category': test_predicted})
submisson.to_csv("submission.csv", index=False)