# Read data

In [1]:
import nltk
import pandas as pd
import re
import numpy as np 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
tweets_submisson = pd.read_csv("data/test.csv", index_col="Id")
tweets = pd.read_csv("data/train.csv", index_col="Id")

In [3]:
tweets.head()

Unnamed: 0_level_0,Category,Tweet
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
635769805279248384,negative,Not Available
635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...
635950258682523648,neutral,"Mar if you have an iOS device, you should down..."
636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...
636100906224848896,positive,Not sure how to start your publication on iOS?...


In [4]:
tweets_submisson.head()

Unnamed: 0_level_0,Tweet
Id,Unnamed: 1_level_1
628949369883000832,dear @Microsoft the newOoffice for Mac is grea...
628976607420645377,@Microsoft how about you make a system that do...
629023169169518592,Not Available
629179223232479232,Not Available
629186282179153920,If I make a game as a #windows10 Universal App...


In [5]:
tweets.describe()

Unnamed: 0,Category,Tweet
count,5969,5969
unique,3,5417
top,positive,Not Available
freq,2888,548


In [6]:
print(tweets["Category"].value_counts())
print()
print(tweets["Category"].value_counts(normalize=True))

positive    2888
neutral     2125
negative     956
Name: Category, dtype: int64

positive    0.483833
neutral     0.356006
negative    0.160161
Name: Category, dtype: float64


In [7]:
print(tweets["Category"].isna().value_counts())
print(tweets["Tweet"].isna().value_counts())
print("\nNot available", len(tweets[tweets["Tweet"] == "Not Available"]))

False    5969
Name: Category, dtype: int64
False    5969
Name: Tweet, dtype: int64

Not available 548


In [8]:
print(tweets[tweets["Tweet"] == "Not Available"]["Category"].value_counts())
print()
print(tweets[tweets["Tweet"] == "Not Available"]["Category"].value_counts(normalize=True))

positive    289
neutral     172
negative     87
Name: Category, dtype: int64

positive    0.527372
neutral     0.313869
negative    0.158759
Name: Category, dtype: float64


# Data preprocessing

In [9]:
def replace_pattern(txt, pattern, replaced_text=""):
    r = re.findall(pattern, txt)
    for i in set(r):
        txt = txt.replace(i, replaced_text)
    return txt    

tweets.iloc[3]["Tweet"]

print(tweets.iloc[3]["Tweet"])
print(replace_pattern(tweets.iloc[3]["Tweet"], r"@[\w]*", "@user"))
print(replace_pattern(tweets.iloc[3]["Tweet"], r"[^a-z]", " "))

@jimmie_vanagon my phone does not run on latest IOS which may account for problem the other day .. time it was replaced
@user my phone does not run on latest IOS which may account for problem the other day .. time it was replaced
 jimmie vanagon my phone does not run on latest     which may account for problem the other day    time it was replaced


In [10]:
def remove_hashtag(text):
    pattern = r"#[\w]*"
    r = re.findall(pattern, text)
    hashtags = []
    for i in r:
        hashtags.append(i[1:])
        text = re.sub(i, i[1:], text)
    return text, hashtags
    
#     hashatags 
# delete_hashtag(tweets[0, 1])


In [11]:
stop_words = set(nltk.corpus.stopwords.words('english')) 

def remove_stopwords(tokens):
    return [token for token in tokens if (not token in stop_words and len(token) > 2)]

In [12]:
RE_EMOTICONS = re.compile("(:-?\))|(:p)|(:d+)|(:-?\()|(:/)|(;-?\))|(<3)|(=\))|(\)-?:)|(:'\()|(8\))")
text = "Look @Qualcomm I found the 1st #Snapdragon Phone in my stuff from #Toshiba and @Microsoft. Still Working :)"
print(["".join(emot) for emot in RE_EMOTICONS.findall(text)])
text = "Not Available"
print(["".join(emot) for emot in RE_EMOTICONS.findall(text)])

[':)']
[]


In [13]:
def prepare_data(data):
    data["PreprocessedTweet"] = None
    data["Hashtags"] = None
    data["Emoticons"] = None
    data["Tokens"] = None

    for index, row in data.iterrows():
        tweet = row["Tweet"]
        prepared_tweet = tweet.lower().strip()
        prepared_tweet = replace_pattern(prepared_tweet, r"http[\w:/.]*")
        emots = ["".join(emot) for emot in RE_EMOTICONS.findall(prepared_tweet)]

        prepared_tweet = replace_pattern(prepared_tweet, r"@[\w]*")
        prepared_tweet, hashtags = remove_hashtag(prepared_tweet)
        prepared_tweet = replace_pattern(prepared_tweet, r"[^a-z]", " ")

        tokens = nltk.word_tokenize(prepared_tweet)
        tokens = remove_stopwords(tokens)

        row["PreprocessedTweet"] = prepared_tweet
        row["Hashtags"] = hashtags
        row["Emoticons"] = emots
        row["Tokens"] = tokens + emots
    return data

In [14]:
tweets = prepare_data(tweets)
tweets.head(10)

Unnamed: 0_level_0,Category,Tweet,PreprocessedTweet,Hashtags,Emoticons,Tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
635769805279248384,negative,Not Available,not available,[],[],[available]
635930169241374720,neutral,IOS 9 App Transport Security. Mm need to check...,ios app transport security mm need to check...,[],[],"[ios, app, transport, security, need, check, p..."
635950258682523648,neutral,"Mar if you have an iOS device, you should down...",mar if you have an ios device you should down...,[],[],"[mar, ios, device, download, app]"
636030803433009153,negative,@jimmie_vanagon my phone does not run on lates...,my phone does not run on latest ios which may...,[],[],"[phone, run, latest, ios, may, account, proble..."
636100906224848896,positive,Not sure how to start your publication on iOS?...,not sure how to start your publication on ios ...,[],[],"[sure, start, publication, ios, live, helping,..."
636176272947744772,neutral,"Two Dollar Tuesday is here with Forklift 2, Qu...",two dollar tuesday is here with forklift qu...,[],[],"[two, dollar, tuesday, forklift, quickkey, ios..."
636276311560859648,neutral,If you're not already signed up to test my iOS...,if you re not already signed up to test my ios...,[],[],"[already, signed, test, ios, game, nows, chance]"
636302400546975744,neutral,"YouTube Gaming Officially Launches On Web, And...",youtube gaming officially launches on web and...,[webseries],[],"[youtube, gaming, officially, launches, web, a..."
636356154151575552,neutral,YouTube Gaming Launches Tomorrow with iOS and ...,youtube gaming launches tomorrow with ios and ...,"[ios, game]",[],"[youtube, gaming, launches, tomorrow, ios, and..."
636360240921972736,neutral,@astrill Yashan from BBC @bbcchinese the VPN a...,yashan from bbc the vpn access on ios may be...,[],[],"[yashan, bbc, vpn, access, ios, may, limited, ..."


In [15]:
tweets_submisson = prepare_data(tweets_submisson)
tweets_submisson.head(5)

Unnamed: 0_level_0,Tweet,PreprocessedTweet,Hashtags,Emoticons,Tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
628949369883000832,dear @Microsoft the newOoffice for Mac is grea...,dear the newooffice for mac is great and all ...,[],[],"[dear, newooffice, mac, great, lync, update, mon]"
628976607420645377,@Microsoft how about you make a system that do...,how about you make a system that doesn t eat ...,[],[],"[make, system, eat, friggin, discs, time, happ..."
629023169169518592,Not Available,not available,[],[],[available]
629179223232479232,Not Available,not available,[],[],[available]
629186282179153920,If I make a game as a #windows10 Universal App...,if i make a game as a windows universal app ...,"[windows10, xboxone]",[],"[make, game, windows, universal, app, xboxone,..."


## Bag of words 

In [16]:
def dummy(doc):
    return doc

bow_vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy)

X_train_counts = bow_vectorizer.fit_transform(tweets['Tokens'])
# print(vector.shape)
bow_vectorizer.vocabulary_

{'available': 619,
 'ios': 4763,
 'app': 403,
 'transport': 9610,
 'security': 8161,
 'need': 6196,
 'check': 1562,
 'party': 6694,
 'network': 6222,
 'pod': 6953,
 'supports': 9089,
 'mar': 5627,
 'device': 2479,
 'download': 2709,
 'phone': 6832,
 'run': 7949,
 'latest': 5184,
 'may': 5725,
 'account': 55,
 'problem': 7160,
 'day': 2286,
 'time': 9457,
 'replaced': 7648,
 'sure': 9095,
 'start': 8825,
 'publication': 7276,
 'live': 5377,
 'helping': 4237,
 'ask': 509,
 'anything': 386,
 'sessions': 8244,
 'today': 9487,
 'friday': 3626,
 'two': 9757,
 'dollar': 2658,
 'tuesday': 9709,
 'forklift': 3552,
 'quickkey': 7345,
 'suite': 9036,
 'pages': 6628,
 'already': 259,
 'signed': 8421,
 'test': 9322,
 'game': 3724,
 'nows': 6355,
 'chance': 1515,
 'youtube': 10471,
 'gaming': 3734,
 'officially': 6440,
 'launches': 5193,
 'web': 10164,
 'android': 320,
 'august': 589,
 'finally': 3389,
 'going': 3892,
 'webseries': 10170,
 'tomorrow': 9515,
 'apps': 440,
 'head': 4180,
 'twitch': 97

# Models

In [17]:
def train_clf_and_save_submission(clf, sub_file_name):
    clf = clf.fit(X_train_counts, tweets['Category'])
    
    predicted = clf.predict(X_train_counts)
    print(np.mean(predicted == tweets['Category']))
    
    X_test_counts = bow_vectorizer.transform(tweets_submisson['Tokens'])
    test_predicted = clf.predict(X_test_counts)
    test_predicted

    submisson = pd.DataFrame(data={
        'Id': tweets_submisson.index,
        'Category': test_predicted,
    })
    # submisson
    print(sub_file_name)
    submisson.to_csv(sub_file_name, index=False)

## Multinomial Naive Bayes 

In [18]:
# clf = MultinomialNB().fit(X_train_counts, tweets['Category'])

train_clf_and_save_submission(MultinomialNB(), "submission_nb.csv")

0.8003015580499246
submission_nb.csv


## SVC

In [19]:
from sklearn.svm import SVC

train_clf_and_save_submission(SVC(gamma='auto', random_state=42), "submission_svc.csv")

0.4838331378790417
submission_svc.csv


## SGD

In [20]:
from sklearn.linear_model import SGDClassifier

train_clf_and_save_submission(SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None), "submission_sgd.csv")

0.8364885240408779
submission_sgd.csv


# word2vec (overfitting)

In [24]:
train_set_len = int(len(tweets) * 0.1)
tweets = tweets.sample(frac=1, random_state=42).reset_index(drop=True)
# print(train_set_len)

X_train = tweets.loc[train_set_len:, "Tweet"].values
y_train = tweets.loc[train_set_len:, "Category"].values
X_test = tweets.loc[:train_set_len, "Tweet"].values
y_test = tweets.loc[:train_set_len, "Category"].values
# X_train = tweets.loc[train_set_len:, "Tokens"].values
# X_test = tweets.loc[:train_set_len, "Tokens"].values

type(X_train)
# [print(d + '\n') for d in X_train[:5]]

# def prepare_data_word2vec(data):
#     processed_data = []
#     for tweet in data:
#         processed_tweet = replace_pattern(tweet, r"@[\w]*", "user")
#         processed_tweet = replace_pattern(processed_tweet, r"http[\w:/.]*")
#         processed_data.append(processed_tweet)
#     return processed_data
        
# X_train = prepare_data_word2vec(X_train)
# X_test = prepare_data_word2vec(X_test)

# print("-" * 120)    
# [print(d + '\n') for d in X_train[:5]]


numpy.ndarray

In [25]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def prepare_y(y):
    LabelEncoder()
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_y = encoder.transform(y)
    return to_categorical(encoded_y)

y_train = prepare_y(y_train)
y_test = prepare_y(y_test)


tokenizer_obj = Tokenizer()
X = X_train + X_test
tokenizer_obj.fit_on_texts(X)

max_length = max([len(s.split()) for s in X])
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding="post")
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding="post")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Embedding, Softmax

EMBEDDING_DIM = 10
UNITS = 16

model = Sequential([
    Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length),
    GRU(units=UNITS, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [84]:
model.fit(X_train_pad, y_train, batch_size=16, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train on 5373 samples, validate on 597 samples
Epoch 1/25
5373/5373 - 30s - loss: 1.0212 - accuracy: 0.4774 - val_loss: 1.0205 - val_accuracy: 0.4774
Epoch 2/25
5373/5373 - 23s - loss: 1.0143 - accuracy: 0.4846 - val_loss: 1.0221 - val_accuracy: 0.4774
Epoch 3/25
5373/5373 - 22s - loss: 1.0137 - accuracy: 0.4846 - val_loss: 1.0205 - val_accuracy: 0.4774
Epoch 4/25
5373/5373 - 22s - loss: 1.0001 - accuracy: 0.4856 - val_loss: 1.0183 - val_accuracy: 0.4774
Epoch 5/25
5373/5373 - 22s - loss: 0.9071 - accuracy: 0.5317 - val_loss: 1.0143 - val_accuracy: 0.4774
Epoch 6/25
5373/5373 - 24s - loss: 0.8237 - accuracy: 0.5680 - val_loss: 1.0176 - val_accuracy: 0.4774
Epoch 7/25
5373/5373 - 24s - loss: 0.7690 - accuracy: 0.5896 - val_loss: 1.0137 - val_accuracy: 0.4774
Epoch 8/25
5373/5373 - 23s - loss: 0.7162 - accuracy: 0.6159 - val_loss: 1.0147 - val_accuracy: 0.4774
Epoch 9/25
5373/5373 - 25s - loss: 0.6670 - accuracy: 0.6564 - val_loss: 1.0172 - val_accuracy: 0.4791
Epoch 10/25
5373/5373 - 25

KeyboardInterrupt: 