# Swiss Tweet Votations

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

## Data retrieving
Since we want to focus on one particular event, we don't need to work on the complete dataset. We download the tweets from the cluster so that we can work on them locally, without staying connected to it.

In [None]:
# Download the dataset from Hadoop
#
# Lists the files exposed by the WebHDFS endpoint and downloads every one that is
# not already present in the local `data/` directory.

import glob
import os
from os.path import basename, splitext
import requests as rx
from urllib.request import urlretrieve
from time import sleep

swiss_tweet_url = 'http://iccluster060.iccluster.epfl.ch:50070/webhdfs/v1/datasets/swiss-tweet'
operation_list = '?op=LISTSTATUS'
operation_open = '?op=OPEN'

req = rx.get(swiss_tweet_url + operation_list)
if req.status_code != 200:
    raise Exception("Failed to load list of files")

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs('data', exist_ok=True)

remote_swiss_tweet_files = {f['pathSuffix'] for f in req.json()['FileStatuses']['FileStatus']}
local_swiss_tweet_files = {basename(f) for f in glob.glob('data/harvest3r_twitter_data_*.json')}

# Sorted so that the download order (and the progress log) is stable between runs.
missing_swiss_tweet_files = sorted(remote_swiss_tweet_files - local_swiss_tweet_files)
missing_index, missing_count = 0, len(missing_swiss_tweet_files)

if missing_count == 0:
    print("Your dataset is complete, nothing to download!")

for missing_index, swiss_tweet_file in enumerate(missing_swiss_tweet_files, start=1):
    print("Downloading {} ({} of {} files)".format(swiss_tweet_file, missing_index, missing_count))

    frm = swiss_tweet_url + '/' + swiss_tweet_file + operation_open
    to = 'data/' + swiss_tweet_file
    # Download under a temporary name and rename once complete: an interrupted
    # transfer then never leaves a truncated file that the next run would
    # mistake for an already-downloaded one.
    partial = to + '.part'

    urlretrieve(frm, partial)
    os.replace(partial, to)
    # Pause between requests to avoid hammering the cluster.
    sleep(1)

## Preprocessing: Filtering

As we only want tweets containing hashtags, we keep only those and save them so that there are fewer tweets to process afterwards.

In [None]:
# Keep only the tweets that carry hashtags and save them, with lower-cased tags,
# in a `tagged_` copy of each harvested file.
datasets = glob.glob('data/harvest3r_twitter_data_*.json')
for i, datapath in enumerate(datasets, start=1):
    print("Working on file {} of {}: {}".format(i, len(datasets), datapath))
    # The tweet fields are taken from the `_source` entry of each record.
    tweets = pd.DataFrame(pd.read_json(datapath)._source.tolist())
    # `assign` builds a new frame instead of writing into the result of `dropna`,
    # which is the pattern that triggers pandas' SettingWithCopyWarning.
    tagged_tweets = (
        tweets
        .dropna(subset=['tags'])
        .assign(tags=lambda frame: frame.tags.apply(lambda ts: {t.lower() for t in ts}))
    )
    print("Amount of tagged tweets found: {}".format(tagged_tweets.main.size))
    taggedpath = 'data/tagged_' + splitext(basename(datapath))[0] + '.json'
    print("Tagged tweets written to: {}".format(taggedpath))
    tagged_tweets.to_json(taggedpath)

## Preprocessing: Exploration

First of all, we need to discover the hashtags used for the votations. We begin our search by finding all tweets tagged #chvote, a popular hashtag when votations are around the corner. We then count all the other hashtags associated with these tweets – in general, the subject of the votation is also among the hashtags.

In [None]:
# Gather every tweet tagged #chvote across the pre-filtered files.
chvote_frames = []
datasets = glob.glob('data/tagged_harvest3r_twitter_data_*.json')
for i, datapath in enumerate(datasets, start=1):
    print("Working on file {} of {}: {}".format(i, len(datasets), datapath))
    tweets = pd.read_json(datapath)
    # One pass over the `tags` column builds a boolean mask. This replaces
    # `DataFrame.select`, which performed a `.loc` lookup per row and no longer
    # exists in pandas >= 1.0.
    has_chvote = tweets.tags.apply(lambda ts: 'chvote' in ts).astype(bool)
    tweets_found = tweets[has_chvote]
    print("Amount of #chvote found: {} / {}".format(tweets_found.main.size, tweets.main.size))
    chvote_frames.append(tweets_found)

# Concatenate once at the end instead of growing a DataFrame file after file
# (quadratic copying, and `DataFrame.append` was removed in pandas 2.0).
chvote_tweets = pd.concat(chvote_frames, ignore_index=True) if chvote_frames else pd.DataFrame()

chvotepath = 'data/tweets-chvote.json'
print("Tweets #chvote written to: ", chvotepath)
chvote_tweets.to_json(chvotepath)

In [None]:
# Flatten the per-tweet tag collections into a single Series of hashtags
chvote_tags = pd.Series([
    hashtag
    for tweet_tags in chvote_tweets.tags
    for hashtag in tweet_tags
])

# Ten most frequent hashtags among the #chvote tweets
chvote_tags.value_counts().head(10)

## Preprocessing: selecting tweets with given tags

We found a list of popular tags associated with the votations (more info on these in the README): "Abst16", "AVSplus", "LRens", "RBI". For each of those, we create a separate json file containing only those, once again to ease distribution and reuse – each generated file is less than 10MB.

In [None]:
# Split the tagged tweets into one collection per hashtag of interest and save
# each collection to its own JSON file.
tags = ['abst16', 'avsplus', 'lrens', 'rbi']
frames_by_tag = {tag: [] for tag in tags}
counts_by_tag = {tag: 0 for tag in tags}

# Collect tweets by tags
datasets = glob.glob('data/tagged_harvest3r_twitter_data_*.json')
for i, datapath in enumerate(datasets, start=1):
    print("Working on file {} of {}: {}".format(i, len(datasets), datapath))
    tweets = pd.read_json(datapath)
    for tag in tags:
        # Boolean mask instead of `DataFrame.select` (removed in pandas 1.0,
        # and one `.loc` lookup per row).
        has_tag = tweets.tags.apply(lambda ts: tag in ts).astype(bool)
        tweets_found = tweets[has_tag]
        frames_by_tag[tag].append(tweets_found)
        counts_by_tag[tag] += len(tweets_found)
        print("Amount of #{} found: {} / {}".format(tag, tweets_found.main.size, tweets.main.size))
        print("Current amount of #{}: {}".format(tag, counts_by_tag[tag]))

# Concatenate each tag's pieces once, rather than appending to a DataFrame
# inside the loop (`DataFrame.append` was removed in pandas 2.0).
tweets_by_tag = {
    tag: pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    for tag, frames in frames_by_tag.items()
}

# Write tweets for future uses
for tag, tweets_tag in tweets_by_tag.items():
    tagpath = "data/tweets-{}.json".format(tag)
    # `len` also works when nothing was collected and the frame has no `main` column.
    print("Writting {} tweets to {}".format(len(tweets_tag), tagpath))
    tweets_tag.to_json(tagpath)

In [None]:
# Report, for every collected tag, how many saved tweets actually have a text
tags = ['abst16', 'avsplus', 'lrens', 'rbi']
stats_line = "Tweets for {}: {}"
for tag_name in tags:
    saved_tweets = pd.read_json('data/tweets-{}.json'.format(tag_name))
    with_text = saved_tweets.dropna(subset=['main'])
    print(stats_line.format(tag_name, with_text.main.size))

The number of tweets being pretty low, we need to find more of them. The issue is that a votation is referred to with different hashtags, for instance the name of the law in different languages. Looking for only one of them means we miss part of the discussion. Looking at the other tags associated with our first selection can tell us which other hashtags have been used. We also did some manual research on Twitter to find further hashtags. We decided to drop #avsplus and came up with the following list for each votation:

In [None]:
# For each votation, collect every tweet that uses at least one of the hashtags
# associated with it, then save one `-full` JSON file per votation.
votations = {
    'abst16': {'abst16', 'abstimmung', 'abstimmungen', 'atomausstieg', 'atomausstiegja', 'atomausstiegsinitiative', 'ausstiegsinitiative', 'energiewende', 'sortirdunucleaire', 'atome', 'nucleaire', 'nucléaire'},
    'lrens': {'lrens', 'lrensnon', 'lrensoui', 'loirenseignement', 'lscpt', 'etatfouineur', 'saveprivacy', 'ndg', 'ndgnein', 'stopndg', 'ndgja', 'ndb', 'stopndb', 'src', 'bupf', 'surveillance', 'privacy'},
    'rbi': {'rbi', 'rbioui', 'rentabásicauniversal', 'revenuuniversel', 'ubi', 'basicincome', 'grundeinkommen', 'rbi16', 'rbi2016', 'revenudebase', 'allocationuniverselle'}
}
frames_by_votation = {votation: [] for votation in votations}
counts_by_votation = {votation: 0 for votation in votations}

# Collect tweets by tags
datasets = glob.glob('data/tagged_harvest3r_twitter_data_*.json')
for i, datapath in enumerate(datasets, start=1):
    print("Working on file {} of {}: {}".format(i, len(datasets), datapath))
    tweets = pd.read_json(datapath)
    for votation, tags_set in votations.items():
        # A tweet belongs to the votation as soon as it shares one hashtag with
        # the votation's set (previously only the votation key itself was
        # looked up, so the extended hashtag lists were never used).
        uses_votation_tag = tweets.tags.apply(lambda ts: not tags_set.isdisjoint(ts)).astype(bool)
        tweets_found = tweets[uses_votation_tag]
        frames_by_votation[votation].append(tweets_found)
        counts_by_votation[votation] += len(tweets_found)
        print("Amount of #{} found: {} / {}".format(votation, tweets_found.main.size, tweets.main.size))
        print("Current amount of #{}: {}".format(votation, counts_by_votation[votation]))

# Results are stored per votation (previously they were appended to a leftover
# variable of the per-tag cell and `tweets_by_votation` stayed empty).
tweets_by_votation = {
    votation: pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    for votation, frames in frames_by_votation.items()
}

# Write tweets for future uses
for votation, tweets_votation in tweets_by_votation.items():
    tagpath = "data/tweets-{}-full.json".format(votation)
    print("Writting {} tweets to {}".format(len(tweets_votation), tagpath))
    tweets_votation.to_json(tagpath)

## Data exploration

Here, we look at one example votation (LRens) to get a grip on the kind of data we have and do some basic tests/exploration around it.

In [None]:
# Load the tweets saved for #LRens and list the fields available on each tweet
lrens_path = 'data/tweets-lrens.json'
tweets = pd.read_json(lrens_path)
tweets.columns

In [None]:
# Time plot of the number of tweets mentioning #LRens per day (votation of the 25 Sept 2016)

from datetime import datetime

# Only the leading "YYYY-MM-DD" part of the value is parsed.
date_parser = lambda d: datetime.strptime(d[:10], "%Y-%m-%d").date()
# Parse into a separate Series instead of overwriting `tweets['date_found']`:
# the raw values stay available and the cell can be re-run without the parser
# failing on already-converted dates.
dates_found = tweets['date_found'].apply(date_parser)
counts = pd.DataFrame({"count": tweets.groupby(dates_found).size()})
# Draw on an explicit axes. Calling `plt.figure()` beforehand left an empty
# extra figure behind, because the pandas plot opened a figure of its own.
fig, ax = plt.subplots(figsize=(15, 7))
counts.plot(kind='bar', ax=ax)

In [None]:
# Dumb test to see the relevance of a dirt simple classifier (to classify tweets into "will vote YES/NO")
# A tweet is reported as NO / YES when one of its whitespace-separated,
# lower-cased words is exactly one of the keywords below; it can be both.

noes = ["non", "nein", "no"]
# 'ja' added: the German "yes" was missing although the German "no" ("nein") is listed.
yess = ['oui', 'ya', 'ja', 'yes']

no_tweets = []
yes_tweets = []

# Iterate over the loaded `tweets` frame (the loop used to reference `df`,
# which is never defined in this notebook).
for i, c in tweets.iterrows():
    text = c['main']
    if not isinstance(text, str):
        # Tweets without text (missing `main`) cannot be classified.
        continue
    comparable = set(text.lower().split())
    if comparable.intersection(noes):
        print("NO:  ", text, c['tags'])
        no_tweets.append(text)
    if comparable.intersection(yess):
        print("YES: ", text, c['tags'])
        yes_tweets.append(text)

### The exploration that made us a bit skeptical of basing our analysis on the sentiment score... 

First we can see that fewer than 5% of the tweets aren't classified as "NEUTRAL", which is quite a low number to base our observations on (especially if there isn't much data to begin with). Then, by looking at the tweets we have, it's easy to see – as humans – that the subject is polarizing and that there are subtleties and references to History that are hard for a computer to pick up. Example:
  "@benoitgaillard ouais, on a vu, le sujet des écoutes allemandes a été fort bien traité avant la votation sur #LRens -.-"

In [None]:
# Distribution of the sentiment labels shipped with the dataset
sentiment_counts = tweets['sentiment'].value_counts()
print(sentiment_counts)

# Print the text of every tweet labelled NEUTRAL so it can be read by hand
is_neutral = tweets['sentiment'] == 'NEUTRAL'
for text in tweets.loc[is_neutral, 'main']:
    print(text)

## Computing the sentiments

In [None]:
# Load the positive / negative tweet files and assemble the training set
# (files of 30-10 and 31-10) and the test set (files of 01-09).
# All sizes are reported as numbers of tweets (`main.size`). Some lines used
# `DataFrame.size`, which is rows x columns, so the "expected" and "final"
# figures could not be compared.

print("# Positive tweets set")
pos_train_tweets_30 = pd.read_json('data/pos_harvest3r_twitter_data_30-10_0.json')
pos_train_tweets_31 = pd.read_json('data/pos_harvest3r_twitter_data_31-10_0.json')
print("Sizes of 30 ({}) and 31 ({})".format(pos_train_tweets_30.main.size, pos_train_tweets_31.main.size))
print("Expected size: {}".format(pos_train_tweets_30.main.size + pos_train_tweets_31.main.size))
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
pos_train_tweets = pd.concat([pos_train_tweets_30, pos_train_tweets_31], ignore_index=True)
print("Final size: {}".format(pos_train_tweets.main.size))

print("# Negative tweets set")
neg_train_tweets_30 = pd.read_json('data/neg_harvest3r_twitter_data_30-10_0.json')
neg_train_tweets_31 = pd.read_json('data/neg_harvest3r_twitter_data_31-10_0.json')
print("Sizes of 30 ({}) and 31 ({})".format(neg_train_tweets_30.main.size, neg_train_tweets_31.main.size))
print("Expected size: {}".format(neg_train_tweets_30.main.size + neg_train_tweets_31.main.size))
neg_train_tweets = pd.concat([neg_train_tweets_30, neg_train_tweets_31], ignore_index=True)
print("Final size: {}".format(neg_train_tweets.main.size))

print("# Full tweets set")
train_tweets = pd.concat([pos_train_tweets, neg_train_tweets], ignore_index=True)
print("Final size: {}".format(train_tweets.main.size))

print("# Test tweets set")
pos_test_tweets = pd.read_json('data/pos_harvest3r_twitter_data_01-09_0.json')
neg_test_tweets = pd.read_json('data/neg_harvest3r_twitter_data_01-09_0.json')
test_tweets = pd.concat([pos_test_tweets, neg_test_tweets], ignore_index=True)
print("Final size: {}".format(test_tweets.main.size))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

# TF-IDF features; the vocabulary is learned on the training tweets only.
vectorizer = TfidfVectorizer()
vectorized_train_tweets = vectorizer.fit_transform(train_tweets.main)

# SGD shuffles the training samples, so without a fixed seed the scores below
# change from one run to the next. `random_state` makes them reproducible.
classifier = SGDClassifier(loss='modified_huber', penalty='l2', alpha=2e-06, max_iter=100, n_jobs=-1, random_state=0)
model = classifier.fit(vectorized_train_tweets, train_tweets.sentiment)
# NOTE(review): the vectorizer was fitted on the whole training set before
# cross-validating, so the validation folds leak into the features and this
# score is probably slightly optimistic.
print("Cross validation: ", np.mean(cross_val_score(classifier, vectorized_train_tweets, train_tweets.sentiment, cv=5, n_jobs=-1)))

# Evaluate on the held-out test tweets, reusing the vocabulary fitted above.
vectorized_test_tweets = vectorizer.transform(test_tweets.main)
test_tweets['our_sentiment'] = model.predict(vectorized_test_tweets)
accuracy_score(test_tweets.sentiment, test_tweets.our_sentiment)

In [None]:
# Predict a sentiment for the tweets of each votation with the model trained
# above and keep the resulting frames in `tweets_by_tags`.
import datetime as dt

# Week number of a date as given by strftime('%W').
date2week = lambda y, m, d: int(dt.date(y, m, d).strftime('%W'))
# Only the leading "YYYY-MM-DD" part of the value is parsed.
date_parser = lambda d: dt.datetime.strptime(d[:10], "%Y-%m-%d").date()

tags = ['abst16', 'lrens', 'rbi']
tweets_by_tags = dict()
votation_dates = { 'abst16': date2week(2016, 11, 27), 'lrens': date2week(2016, 9, 25), 'rbi': date2week(2016, 6, 5) }

for tag in tags:
    # Drop tweets without text, as the stats cell does: the vectorizer cannot
    # transform missing values. `.copy()` gives an independent frame so the
    # column assignments below do not raise SettingWithCopyWarning.
    tweets = pd.read_json('data/tweets-{}.json'.format(tag)).dropna(subset=['main']).copy()
    vectorized_tweets = vectorizer.transform(tweets.main)
    tweets['date_found'] = tweets.date_found.apply(date_parser)
    tweets['our_sentiment'] = model.predict(vectorized_tweets)
    # Stored as a plain attribute on the frame (not a column); the plotting
    # cell reads it back as `tweets.votation_week`.
    tweets.votation_week = votation_dates[tag]
    tweets_by_tags[tag] = tweets
    # Show a few predictions as a sanity check (`row_id` avoids shadowing the builtin `id`).
    for row_id, t in tweets[['main', 'our_sentiment']].head(10).iterrows():
        print("{}: {}".format(t.our_sentiment, t.main))

In [None]:
# Weekly histogram of predicted sentiments for each votation, with a red line
# at the end of the votation week.

# strftime('%W') yields week numbers 0..53, so one bin per possible week makes
# the bar position equal to the week number. The previous bins covered weeks
# 1..52 only: bars were shifted by one position relative to their week, which
# put the votation marker one week too late, and weeks 0 and 53 were dropped.
week_bins = 54

for tag, tweets in tweets_by_tags.items():
    print("#{}: {}".format(tag, tweets.main.size))
    hists = {}
    tweets['week_found'] = tweets.date_found.apply(lambda d: int(d.strftime('%W')))
    for sentiment, data in tweets.groupby('our_sentiment'):
        week_counts, _ = np.histogram(data.week_found, bins=week_bins, range=(0, week_bins))
        hists[sentiment] = week_counts

    ax = pd.DataFrame(hists, columns=['POSITIVE', 'NEGATIVE']).plot.bar(figsize = (12, 6), width = .9)
    # Bars are centred on the week number and 0.9 wide, so +0.45 is the right
    # edge of the votation week's bars.
    ax.axvline(tweets.votation_week + 0.45, color='red', linewidth=1)

    ax.set_title('#' + tag)
    ax.set_xlabel('Weeks of 2016')
    ax.set_ylabel('Tweets count')