# Angry tweets
## Piotr Przybyłowski
Przedstawione kolejne kroki wykonane w celu osiągnięcia przedstawionych wyników

+ Import wykorzystywanych bibliotek

In [1]:
import re
import pandas as pd
import nltk
from nltk.stem.porter import *
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier

+ Wczytanie treningowego zbioru danych

In [2]:
tweets = pd.read_csv("train.csv", sep=",", header=0).iloc[:,2]
print(tweets.head(10))

0                                        Not Available
1    IOS 9 App Transport Security. Mm need to check...
2    Mar if you have an iOS device, you should down...
3    @jimmie_vanagon my phone does not run on lates...
4    Not sure how to start your publication on iOS?...
5    Two Dollar Tuesday is here with Forklift 2, Qu...
6    If you're not already signed up to test my iOS...
7    YouTube Gaming Officially Launches On Web, And...
8    YouTube Gaming Launches Tomorrow with iOS and ...
9    @astrill Yashan from BBC @bbcchinese the VPN a...
Name: Tweet, dtype: object


+ Tokenizacja

In [6]:
RE_EMOTICONS = r"""
    (?:
        [:=;] #oczy
        [oO\-]? #nos
        [D\)\]\(\]/\\OpP] #usta
    )"""
 
RE_TOKENS = [
    RE_EMOTICONS,
    r'<[^>]+>', #tagi html
    r'(?:@[\w_]+)', #"wzmianki" z @ na początkju
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", #hasztagi
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', #adresy html
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', #liczby
    r"(?:[a-z][a-z'\-_]+[a-z])", #wyrazy z rozdzielone myslnikiem lub apostrofem
    r'(?:[\w_]+)', # inne słowa
    r'(?:\S)' # wszyzstko co pozostało
]
    
tokens_re = re.compile(r'('+'|'.join(RE_TOKENS)+')', re.VERBOSE | re.IGNORECASE)

class BeforeTokenizationNormalizer():
    @staticmethod
    def normalize(text):
        text = text.strip().lower()
        text = text.replace('&nbsp;', ' ')
        text = text.replace('&lt;', '<')
        text = text.replace('&gt;', '>')
        text = text.replace('&amp;', '&')
        text = text.replace('&pound;', u'£')
        text = text.replace('&euro;', u'€')
        text = text.replace('&copy;', u'©')
        text = text.replace('&reg;', u'®')
        return text

def tokenize(s):
    return tokens_re.findall(s)

for i in tweets.index:
     tweet = BeforeTokenizationNormalizer.normalize(tweets.iat[i])
     print(tokenize(tweet))

[u'not', u'available']
[u'ios', u'9', u'app', u'transport', u'security', u'.', u'mm', u'need', u'to', u'check', u'if', u'my', u'3', u'rd', u'party', u'network', u'pod', u'supports', u'it', u'http://t.co/fmtcfuadgj']
[u'mar', u'if', u'you', u'have', u'an', u'ios', u'device', u',', u'you', u'should', u'download', u'our', u'app', u'too', u':', u'http://t.co/gl3tn2udnd']
[u'@jimmie_vanagon', u'my', u'phone', u'does', u'not', u'run', u'on', u'latest', u'ios', u'which', u'may', u'account', u'for', u'problem', u'the', u'other', u'day', u'.', u'.', u'time', u'it', u'was', u'replaced']
[u'not', u'sure', u'how', u'to', u'start', u'your', u'publication', u'on', u'ios', u'?', u"we'll", u'be', u'live', u'helping', u'with', u'ask', u'me', u'anything', u'sessions', u'today', u'and', u'friday', u'http://t.co/kpqqgjjh3x']
[u'two', u'dollar', u'tuesday', u'is', u'here', u'with', u'forklift', u'2,', u'quickkey', u'for', u'ios', u'and', u'suite', u'for', u'pages', u'for', u'just', u'$', u'1.99', u'today',

+ Stemming

In [7]:
stemmer = PorterStemmer()
for i in tweets.index:
     tweet = BeforeTokenizationNormalizer.normalize(tweets.iat[i])
     tweettok = tokenize(tweet)
     single =[stemmer.stem(temptweet) for temptweet in tweettok]
     print(single)

[u'not', u'avail']
[u'io', u'9', u'app', u'transport', u'secur', u'.', u'mm', u'need', u'to', u'check', u'if', u'my', u'3', u'rd', u'parti', u'network', u'pod', u'support', u'it', u'http://t.co/fmtcfuadgj']
[u'mar', u'if', u'you', u'have', u'an', u'io', u'devic', u',', u'you', u'should', u'download', u'our', u'app', u'too', u':', u'http://t.co/gl3tn2udnd']
[u'@jimmie_vanagon', u'my', u'phone', u'doe', u'not', u'run', u'on', u'latest', u'io', u'which', u'may', u'account', u'for', u'problem', u'the', u'other', u'day', u'.', u'.', u'time', u'it', u'wa', u'replac']
[u'not', u'sure', u'how', u'to', u'start', u'your', u'public', u'on', u'io', u'?', u"we'll", u'be', u'live', u'help', u'with', u'ask', u'me', u'anyth', u'session', u'today', u'and', u'friday', u'http://t.co/kpqqgjjh3x']
[u'two', u'dollar', u'tuesday', u'is', u'here', u'with', u'forklift', u'2,', u'quickkey', u'for', u'io', u'and', u'suit', u'for', u'page', u'for', u'just', u'$', u'1.99', u'today', u':', u'http://t.co/bnmfoeacw5'

+ Usunięcie stopwords

In [9]:
words = Counter()

for i in tweets.index:
     tweet = BeforeTokenizationNormalizer.normalize(tweets.iat[i])
     tweettok = tokenize(tweet)
     single =[stemmer.stem(temptweet) for temptweet in tweettok]
     words.update(single)
        
punctuation = ['.',',',';',':']
stopwords = ["a", "about", "after", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
            "before", "being", "between", "both", "by", "could", "did", "do", "does", "doing", "during", "each",
            "for", "from", "further", "had", "has", "have", "having", "he", "her", "here", "hers", "herself", "him",
            "himself", "his", "how", "i", "in", "into", "is", "it", "its", "itself", "let", "me", "more", "most", "my",
            "myself", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "own", "sha",
            "she", "should", "so", "some", "such", "than", "that", "the", "their", "theirs", "them", "themselves",
            "then", "there", "there's", "these", "they", "this", "those", "through", "to", "until", "up", "very",
            "was", "we", "were", "what", "when", "where", "which", "while", "who","whom", "with", "would", "you",
            "your", "yours", "yourself", "yourselves",
            "n't", "'s", "'ll", "'re", "'d", "'m", "'ve",
            "above", "again", "against", "below", "but", "cannot", "down", "few", "if", "no", "nor", "not", "off",
            "out", "over", "same", "too", "under", "why"]

for word in punctuation:
    if word in words:
        del words[word]

for word in stopwords:
    if word in words:
        del words[word]
print(words)

Counter({u'!': 1530, u'may': 1037, u'?': 868, u'"': 798, u'tomorrow': 756, u'th': 680, u'-': 579, u'avail': 576, u'day': 531, u'just': 507, u'thi': 505, u'go': 504, u'1': 502, u'wa': 489, u'will': 462, u'2': 458, u'st': 389, u'see': 376, u'time': 362, u'like': 349, u'get': 348, u"i'm": 347, u'3': 343, u'&': 343, u"it'": 318, u'(': 318, u')': 308, u'one': 286, u'watch': 286, u"'": 275, u'new': 273, u'friday': 272, u'sunday': 269, u'make': 253, u'want': 249, u'come': 247, u'ha': 237, u'hi': 231, u'rd': 224, u'night': 224, u'now': 223, u'/': 221, u'can': 217, u'nd': 217, u'say': 197, u'jurass': 196, u'think': 193, u'10': 193, u'know': 186, u'world': 185, u'monday': 175, u'4': 173, u"don't": 172, u'obama': 171, u'5': 170, u'look': 169, u'trump': 167, u'love': 163, u'back': 163, u'today': 162, u'show': 160, u'saturday': 159, u'good': 158, u'us': 154, u'play': 153, u'still': 153, u'need': 147, u'game': 147, u'sun': 147, u'plan': 143, u'6': 140, u"valentine'": 136, u'first': 136, u'last': 132

+ Stworzenie bag-of-words

In [12]:
def create_bow(documents, features):
    row = []
    col = []
    data = []

    labels = []

    for i in documents.index:
        tweet = BeforeTokenizationNormalizer.normalize(documents.iloc[i, 2])
        label = documents.iloc[i, 1]      
        
        tweettok = tokenize(tweet)
        tweet_tokens =[stemmer.stem(temptweet) for temptweet in tweettok]

        labels.append(label)
        for token in set(tweet_tokens):
            if token not in features:
                continue
            row.append(i)
            col.append(features[token])
            data.append(1)
    return csr_matrix((data, (row, col)), shape=(len(documents), len(features))), labels

def create_bow2(documents, features):
    row = []
    col = []
    data = []
    
    for i in documents.index:
        
        tweet = BeforeTokenizationNormalizer.normalize(documents.iloc[i, 1])
        tweettok = tokenize(tweet)
        tweet_tokens =[stemmer.stem(temptweet) for temptweet in tweettok]

        for token in set(tweet_tokens):
            if token not in features:
                continue
            row.append(i)
            col.append(features[token])
            data.append(1)
    return csr_matrix((data, (row, col)), shape=(len(documents), len(features)))

+ Klasyfikacja

In [13]:
min_word_count =0

train_tweets = pd.read_csv("train.csv", sep=",", header=0)
test_tweets = pd.read_csv("test.csv", sep=",", header=0)
common_words = list([k for k, v in words.most_common() if v > min_word_count])
feature_dict = {}
for word in common_words:
    feature_dict[word] = len(feature_dict)

print("Training classifier...")
X_train, y_train = create_bow(train_tweets, feature_dict)
list_of_labels = list(set(y_train))
classifier = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=23)
classifier.fit(X_train, y_train)

X_test = create_bow2(test_tweets, feature_dict)

predicted = classifier.predict(X_test)

print(predicted)

Training classifier...
['positive' 'positive' 'positive' ..., 'positive' 'positive' 'neutral']


+ Zapis predykcji do pliku

In [15]:
f1=open('./wyniki.csv', 'w+')
f1.write("Id,Category\n")
for i in range(predicted.size):
    f1.write(','.join((str(test_tweets.iloc[i, 0]),predicted[i])))
    f1.write('\n')