In [4]:
from os.path import join
from tqdm import tqdm
import json

from nltk.stem.snowball import SnowballStemmer
import numpy as np

from sklearn import svm
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve
import sklearn.metrics as smet
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

import re
import string
import random

import matplotlib.pyplot as plt

def in_washington(location):
    """Return True when a location string appears to refer to Washington state.

    The string is broken into tokens on whitespace and commas. It counts as
    Washington state when at least one token is on the allow-list and no
    token is on the deny-list (which rules out Washington, D.C.).

    Matching is exact and case-sensitive against lower-case keywords, so
    callers are expected to pass lower-cased text.

    Args:
        location: Free-form location text. ``None`` or an empty string is
            treated as "no match".

    Returns:
        bool: True if the location matches Washington state, else False.
    """
    if not location:
        return False

    white_list = ("seattle", "washington", "wa", "kirkland")
    black_list = ("dc", "d.c.")

    # Commas are separators too, so "seattle, wa" and "seattle,wa" both yield
    # bare tokens instead of "seattle," which could never match a keyword.
    tokens = set(location.replace(",", " ").split())

    if tokens.isdisjoint(white_list):
        return False
    # A deny-list hit overrides any allow-list hit.
    return tokens.isdisjoint(black_list)

def in_mas(location):
    """Return True when a location string appears to refer to Massachusetts.

    The string is broken into tokens on whitespace and commas. It counts as
    Massachusetts when at least one token is on the allow-list (state name,
    abbreviation, or one of several city names) and no token is on the
    deny-list.

    Matching is exact and case-sensitive against lower-case keywords, so
    callers are expected to pass lower-cased text.

    Args:
        location: Free-form location text. ``None`` or an empty string is
            treated as "no match".

    Returns:
        bool: True if the location matches Massachusetts, else False.
    """
    if not location:
        return False

    white_list = (
        "ma",
        "massachusetts",
        "boston",
        "worcester",
        "salem",
        "plymouth",
        "springfield",
        "arlington",
        "scituate",
        "northampton",
    )
    black_list = ("ohio",)

    # Commas are separators too, so "boston, ma" and "boston,ma" both yield
    # bare tokens instead of "boston," which could never match a keyword.
    tokens = set(location.replace(",", " ").split())

    if tokens.isdisjoint(white_list):
        return False
    # A deny-list hit overrides any allow-list hit.
    return tokens.isdisjoint(black_list)

class StemTokenizer(object):
    """Callable tokenizer that cleans a document and returns stemmed words.

    Calling an instance on a string:
      1. replaces a fixed set of separator punctuation with spaces,
      2. deletes any remaining ``string.punctuation`` characters,
      3. deletes non-ASCII characters,
      4. lower-cases and splits on whitespace,
      5. drops scikit-learn's English stop words,
      6. stems each remaining word with the English Snowball stemmer.
    """

    # The hyphen is escaped on purpose: written as ".-:" it forms a character
    # *range* (0x2E-0x3A) that matches "/" and every digit but not "-" itself,
    # which silently stripped all numbers from the text.
    _SEPARATORS = re.compile(r'[,.\-:/()?{}*$#&]')

    def __init__(self):
        self.snowball_stemmer = SnowballStemmer("english")

    def __call__(self, doc):
        """Tokenize ``doc`` (a string) and return a list of stemmed tokens."""
        # Separators become spaces so that "a,b" or "a-b" split into two words
        # rather than being fused when punctuation is deleted below.
        doc = self._SEPARATORS.sub(' ', doc)
        doc = ''.join(ch for ch in doc if ch not in string.punctuation)
        doc = ''.join(ch for ch in doc if ord(ch) < 128)
        doc = doc.lower()
        words = doc.split()
        words = [word for word in words if word not in text.ENGLISH_STOP_WORDS]

        return [
            self.snowball_stemmer.stem(word) for word in words
        ]

def get_vectorizer():
    """Build the bag-of-words stage of the pipeline.

    Returns:
        A ``CountVectorizer`` that tokenizes with a fresh ``StemTokenizer``,
        lower-cases its input, and is configured with ``min_df=2`` and
        ``max_df=0.99``.
    """
    options = {
        'tokenizer': StemTokenizer(),
        'lowercase': True,
        'min_df': 2,
        'max_df': 0.99,
    }
    return CountVectorizer(**options)

def get_tfid_transformer():
    """Build the TF-IDF weighting stage of the pipeline.

    Returns:
        A ``TfidfTransformer`` configured with L2 normalisation and
        sublinear term-frequency scaling.
    """
    transformer = TfidfTransformer(norm='l2', sublinear_tf=True)
    return transformer

def get_svd():
    """Build the dimensionality-reduction stage: a 100-component TruncatedSVD."""
    component_count = 100
    return TruncatedSVD(n_components=component_count)

def print_statistics(actual, predicted):
    """Print accuracy, precision, recall and the confusion matrix.

    Accuracy, macro-averaged precision and macro-averaged recall are
    multiplied by 100 before printing; the confusion matrix is printed as
    returned by scikit-learn.

    Args:
        actual: Ground-truth labels.
        predicted: Labels predicted by the classifier.
    """
    # Each metric is computed lazily, right before it is printed, so the
    # compute/print order is the same as writing the four prints out by hand.
    reports = (
        ("Accuracy is ",
         lambda: smet.accuracy_score(actual, predicted) * 100),
        ("Precision is ",
         lambda: smet.precision_score(actual, predicted, average='macro') * 100),
        ("Recall is ",
         lambda: smet.recall_score(actual, predicted, average='macro') * 100),
        ("Confusion Matrix is ",
         lambda: smet.confusion_matrix(actual, predicted)),
    )
    for label, compute in reports:
        print (label, compute())

def plot_roc(actual, predicted, classifier_name):
    """Plot, save and display the ROC curve for one classifier.

    The figure is written to ``plots/<classifier_name>.png`` (the directory
    is created if it does not exist), shown, and then cleared so the next
    call starts from an empty figure.

    Args:
        actual: Ground-truth labels, passed to ``roc_curve`` as the first
            argument.
        predicted: Scores for the positive class, passed to ``roc_curve``
            as the second argument.
        classifier_name: Name used in the plot title and as the file stem of
            the saved PNG.
    """
    import os  # local import: the module itself only imports os.path.join

    fpr, tpr, _ = roc_curve(actual, predicted)
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.plot([0, 1], [0, 1])  # diagonal reference line

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.2])

    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')

    # Avoid titles such as "SVMClassifier" (missing space) or
    # "... ClassifierClassifier" (name already ends with the word).
    title_name = classifier_name
    if not title_name.endswith('Classifier'):
        title_name += ' Classifier'
    plt.title('ROC Curves for ' + title_name)
    plt.legend(loc="best")

    # savefig does not create missing directories.
    if not os.path.isdir('plots'):
        os.makedirs('plots')
    plt.savefig('plots/' + classifier_name + '.png', format='png')
    plt.show()
    plt.clf()


def classify(X, Y, classifier, cname):
    """Fit a classifier on the first 85% of the rows and evaluate on the rest.

    The split is positional (no shuffling happens here), so callers should
    pass data that is already in random order. After fitting, the function
    prints summary statistics for the held-out rows and plots their ROC
    curve.

    Args:
        X: 2-D feature matrix supporting ``X.shape`` and ``X[a:b, :]``.
        Y: Label array aligned with the rows of ``X``.
        classifier: Estimator exposing ``fit``, ``predict`` and
            ``predict_proba``.
        cname: Classifier name forwarded to ``plot_roc``.
    """
    # Slice bounds must be integers; 0.85 * n is a float and would raise.
    split = int(0.85 * X.shape[0])

    X_train, X_test = X[:split, :], X[split:, :]
    Y_train, Y_test = Y[:split], Y[split:]

    classifier.fit(X_train, Y_train)
    predicted = classifier.predict(X_test)
    predicted_probs = classifier.predict_proba(X_test)

    print_statistics(Y_test, predicted)
    # Column 1 of predict_proba holds the score for the second class.
    plot_roc(Y_test, predicted_probs[:, 1], cname)

# Driver: label #superbowl tweets by the author's home state (0 = Washington,
# 1 = Massachusetts), reduce the tweet text to a 100-dimensional LSI
# representation, and compare several classifiers on it.
print ("Loading superbowl tweets")
lcount = 1348767  # used only as the progress-bar total

X = []
Y = []
with open(join('tweet_data', 'tweets_#superbowl.txt'), 'r') as f:
    for line in tqdm(f, total=lcount):
        tweet_data = json.loads(line)

        # Any of tweet / user / location may be missing or null; treat that
        # as an empty location, which matches neither state and is skipped.
        user = (tweet_data.get("tweet") or {}).get("user") or {}
        location = (user.get("location") or "").lower()

        if in_washington(location):
            X.append(tweet_data.get("title"))
            Y.append(0)
        elif in_mas(location):
            X.append(tweet_data.get("title"))
            Y.append(1)

# The file is no longer needed from here on, so everything below runs after
# the ``with`` block has closed it.
pipeline = Pipeline(
    [
        ('vectorize', get_vectorizer()),
        ('tf-idf', get_tfid_transformer()),
        ('svd', get_svd())
    ]
)

print ("Computing the LSI representation of the dataset")
X = pipeline.fit_transform(X)
Y = np.array(Y)

# Randomly shuffle data. random.shuffle works in place and needs a mutable
# sequence, so the range is materialised as a list first.
indexes = list(range(X.shape[0]))
random.shuffle(indexes)
X_ = X[indexes, :]
Y_ = Y[indexes]

print ("Statistics of SVM classifier:")
classify(X_, Y_, svm.SVC(kernel='linear', probability=True), "SVM")

print ("Statistics of AdaBoost Classifier are")
classify(X_, Y_, AdaBoostClassifier(), "AdaBoost")

print ("Statistics of Random Forest Classifier are")
classify(X_, Y_, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), "RandomForestClassifier")

print ("Statistics of Neural Network Classifier are")
classify(X_, Y_, MLPClassifier(alpha=1), "Neural Network Classifier")


  0%|          | 0/188136 [00:00<?, ?it/s]

Training Models First
###
# gohawks:
###


100%|██████████| 188136/188136 [00:18<00:00, 10281.61it/s]
  0%|          | 0/259024 [00:00<?, ?it/s]

###
# nfl:
###


100%|██████████| 259024/259024 [00:23<00:00, 11085.26it/s]


###
# superbowl:
###


100%|██████████| 1348767/1348767 [03:38<00:00, 6175.61it/s]
100%|██████████| 365/365 [00:00<00:00, 8301.72it/s]
100%|██████████| 730/730 [00:00<00:00, 9763.56it/s]

Predicting test data
Predicting number of tweets for file sample10_period3.txt
Predictions for the 5 hour window are: [ 15.6202574    7.09296663  10.63803572   0.79901979  -6.0463673 ]
Prediction errors are: 54.9792175506
Predicting number of tweets for file sample1_period1.txt
Predictions for the 5 hour window are: [-31.43287436  -9.19346291  36.87217555  12.6683037   32.93661493]
Prediction errors are: 110.229848617
Predicting number of tweets for file sample2_period2.txt



100%|██████████| 212273/212273 [00:16<00:00, 12761.06it/s]
 19%|█▉        | 704/3638 [00:00<00:00, 7037.76it/s]

Predictions for the 5 hour window are: [-413164.28422589 -371910.05692859 -494440.30252183 -395258.06171855
 -480072.10706794]
Prediction errors are: 471905.362493
Predicting number of tweets for file sample3_period3.txt


100%|█████████▉| 3628/3638 [00:00<00:00, 9681.17it/s]
100%|██████████| 1646/1646 [00:00<00:00, 11570.86it/s]
  0%|          | 0/2059 [00:00<?, ?it/s]

Predictions for the 5 hour window are: [-109.52969612    0.94110755  -65.25657106 -279.49045782    1.83490791]
Prediction errors are: 727.700141908
Predicting number of tweets for file sample4_period1.txt
Predictions for the 5 hour window are: [ 77.01307327 -17.2248619  -40.10264019 -82.78064712 -46.42164035]
Prediction errors are: 267.303343257
Predicting number of tweets for file sample5_period1.txt


100%|██████████| 2059/2059 [00:00<00:00, 13206.84it/s]
  0%|          | 558/205554 [00:00<00:36, 5575.94it/s]

Predictions for the 5 hour window are: [-113.62915959 -314.13982053 -120.38835974  -97.20123714 -116.96064572]
Prediction errors are: 495.863844546
Predicting number of tweets for file sample6_period2.txt


100%|██████████| 205554/205554 [00:17<00:00, 11844.79it/s]
100%|██████████| 528/528 [00:00<00:00, 5753.13it/s]
100%|██████████| 229/229 [00:00<00:00, 6324.42it/s]
  0%|          | 0/11311 [00:00<?, ?it/s]

Predictions for the 5 hour window are: [ -335277.14869096   136885.32360838  3391851.59924414  2849580.62612228
  1910258.7574437 ]
Prediction errors are: 1689028.09102
Predicting number of tweets for file sample7_period3.txt
Predictions for the 5 hour window are: [-81.0689205  -63.00961061 -13.97155776   8.62436914 -22.5929161 ]
Prediction errors are: 115.003727166
Predicting number of tweets for file sample8_period1.txt
Predictions for the 5 hour window are: [-38.76508767 -62.50521337 -50.66003519 -34.24180419]
Prediction errors are: 91.5430351041
Predicting number of tweets for file sample9_period2.txt


100%|██████████| 11311/11311 [00:00<00:00, 12963.27it/s]

Predictions for the 5 hour window are: [-352981.70702158 -402228.97930408 -457532.89555471 -506424.51255364
 -458654.59951995]
Prediction errors are: 437480.938791



