# Tweet topic classification alternative approach

In [233]:
import os
import re
import nltk
import glob
import random
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

Read and preprocess data. Distribute data sets.

In [234]:
filenames = glob.glob("data/*.txt")
print(filenames)

['data\\business.txt', 'data\\entertainment.txt', 'data\\health.txt', 'data\\politics.txt', 'data\\sports.txt', 'data\\technology.txt']


Extract tweets combined with their labeled target and shuffle them

In [235]:
labled_tweets = []
stopwords = stopwords.words('english')

for filename in filenames:
    file = open(filename, encoding='utf-8').read()
    for tweet in file.split('\n'):
        tweet = re.sub(r'[^\w\s]','', tweet)
        tweet = re.sub(" \d+", " ", tweet)
        tweet = [i.lower() for i in list(set(nltk.word_tokenize(tweet)) - set(stopwords))]
        labled_tweets.append((tweet, filename[5:-4]))  # extract target names from filename

random.shuffle(labled_tweets)
print(labled_tweets[0:5])

[(['going', 'sports', 'let', 'win', 'team', 'individual', 'blame', 'championship', 'national', 'tennis', 'golf', 'rest'], 'sports'), (['mark', 'wired', 'spaces', 'world', 'is', 'facebook', 'why', 'bought', 'exactly', 'bizarre', 'oculus', 'zuckerberg', 'the', 'glorious', 'vr'], 'technology'), (['indias', 'champions', 'trophy', 'answered', 'faqs', 'participation'], 'sports'), (['looking', 'music', 'videoclip', 'particular'], 'entertainment'), (['winner', 'pick', 'stock', 'month', 'return', 'results', 'short', 'april', 'market', 'the', 'ehs4290', 'gbt', 'contest'], 'business')]


Break it down into a list of tweets and a list of numerical targets

In [236]:
data = [entry[0] for entry in labled_tweets]

targets = []
for entry in labled_tweets:
    t = entry[1]
    if t == 'business': 
        targets.append(1)
    if t == 'entertainment':
        targets.append(2)
    if t == 'health':
        targets.append(3)
    if t == 'politics':
        targets.append(4)
    if t == 'sports':
        targets.append(5)
    if t == 'technology':
        targets.append(6)

print(data[0])
print(targets[0])

['going', 'sports', 'let', 'win', 'team', 'individual', 'blame', 'championship', 'national', 'tennis', 'golf', 'rest']
5


Split into train and test datasets

In [237]:
n = 400
train_data = data[n:]
test_data = data[:n]
train_targets = targets[n:] 
test_targets = targets[:n]

Train classifiers

In [238]:
count_vect = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
train_vect = count_vect.fit_transform(train_data)
test_vect = count_vect.transform(test_data)
print(train_vect.shape)

(2733, 8301)


In [239]:
count_vect.vocabulary_.get(u'algorithm')

252

In [240]:
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_vect)
test_tfidf = tfidf_transformer.transform(test_vect)
print(train_tfidf.shape)

(2733, 8301)


In [241]:
MNB_classifier = MultinomialNB().fit(train_tfidf, train_targets)

In [242]:
SGD_classifier = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                               random_state=42, max_iter=5, tol=None).fit(train_tfidf, train_targets)

Predict

In [243]:
predicted = MNB_classifier.predict(test_tfidf)
print(np.mean(predicted == test_targets) * 100)

72.25


In [244]:
predicted = SGD_classifier.predict(test_tfidf)
print(np.mean(predicted == test_targets) * 100)

76.5
