In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import vincent
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
import unicodedata
import pickle
import folium

In [2]:
# Match: Portugal vs France.

# Get all the tweets from the .csv files into pandas DataFrame.
t_por = pd.read_csv('/home/tiago/Desktop/Tiago/Proyectos/Twitter_data/euro_copa/7_por_fra/por_fra_por.csv', encoding='utf-8')
t_fra = pd.read_csv('/home/tiago/Desktop/Tiago/Proyectos/Twitter_data/euro_copa/7_por_fra/por_fra_fra.csv', encoding='utf-8')

# Make all the text lowercase.
t_por['text'] = t_por['text'].str.lower()
t_fra['text'] = t_fra['text'].str.lower()

# Concatenate both dataframes and remove duplicate tweets.
t_por_fra = pd.concat([t_por, t_fra])
t_por_fra.drop_duplicates(inplace=True)

# Transforme the 'created_at' field into a datetime.
t_por_fra['created_at'] = pd.to_datetime(t_por_fra['created_at'])
t_por_fra.head()

Unnamed: 0,text,created_at,geo,source,retweet_count,tweet_id,coordinates,favorite_count,in_reply_to_status_id,in_reply_to_user_id,lang
0,rt @bbcsport: ronaldo looks up for this...\n\n...,2016-07-10 18:46:11,,RoundTeam,0,752212336736428033,,0,,,en
1,rt @luisfigo: is the moment lets go my portuga...,2016-07-10 18:46:11,,Twitter for Android,0,752212336568438784,,0,,,en
2,portugal can do this man cmon #por,2016-07-10 18:46:11,,Twitter for iPhone,0,752212337046777856,,0,,,en
3,"rt @andrewdasnyt: i'm totally biased, but i th...",2016-07-10 18:46:11,,Twitter for Android,0,752212336971100161,,0,,,en
4,rt @uefaeuro: the teams are in! william carval...,2016-07-10 18:46:11,,Twitter for Android,0,752212337029820416,,0,,,en


In [4]:
# Remove the tweets made before and after the match.

mask1 = (t_por_fra['created_at'] > '2016-07-10 10:30:00') & (t_por_fra['created_at'] <= '2016-07-10 18:55:00')
t_por_fra.loc[mask1]
t_por_fra.drop(t_por_fra.loc[mask1].index, inplace=True)

mask2 = (t_por_fra['created_at'] > '2016-07-10 23:50:00') & (t_por_fra['created_at'] <= '2016-07-10 23:59:59')
t_por_fra.loc[mask2]
t_por_fra.drop(t_por_fra.loc[mask2].index, inplace=True)

In [5]:
# Print some information.

print "Total tweets: " + str(len(t_por_fra.index))

t_rt = t_por_fra[t_por_fra['text'].str.contains('rt')]
print "Count of retweets: " + str(len(t_rt))

from __future__ import division

porc = len(t_rt)*100/len(t_por_fra.index)
print "Percentage of retweets in dataset: " + str(porc)

Total tweets: 789702
Count of retweets: 622538
Percentage of retweets in dataset: 78.8320151146


In [6]:
# Plot tweets vs retweets in dataset.

pre_data = []
for x in range(0,167164):
    pre_data.append('tweet')
for x in range(0,622538):
    pre_data.append('re-tweet')
    
s = pd.Series(pre_data)
s.value_counts(normalize=True)

df_s = pd.DataFrame(s.value_counts(normalize=True))
vincent.core.initialize_notebook()

donut = vincent.Pie(df_s, inner_radius=80, outer_radius=100)
donut.colors(brew="Set2")
donut.legend('Categories')
donut.display()

In [7]:
print "Percentage of languages"

print t_por_fra['lang'].value_counts(normalize=True)[0:9]

Percentage of languages
en     0.366476
fr     0.196498
pt     0.144828
es     0.135403
de     0.041768
in     0.025468
und    0.020465
nl     0.017895
it     0.009573
Name: lang, dtype: float64


In [8]:
# Plot languages:
df_lang = pd.DataFrame(t_por_fra['lang'].value_counts(normalize=True)[0:8])

vincent.core.initialize_notebook()

donut = vincent.Pie(df_lang, inner_radius=80, outer_radius=100)
donut.colors(brew="Set2")
donut.legend('Languages')
donut.display()

In [9]:
print "Sources percentage"

print t_por_fra['source'].value_counts(normalize=True)[0:9]

Sources percentage
Twitter for iPhone           0.391036
Twitter for Android          0.380603
Twitter Web Client           0.079406
Twitter for iPad             0.022243
TweetDeck                    0.016623
Facebook                     0.014169
Twitter for Windows Phone    0.011745
Mobile Web (M2)              0.011114
Mobile Web (M5)              0.009595
Name: source, dtype: float64


In [10]:
# Plot the sources:
df_source = pd.DataFrame(t_por_fra['source'].value_counts(normalize=True)[0:5])

vincent.core.initialize_notebook()

donut = vincent.Pie(df_source, inner_radius=80, outer_radius=100)
donut.colors(brew="Set2")
donut.legend('Sources')
donut.display()

In [11]:
# Group the tweets in 1 minute intervals.

tweets2 = t_por_fra.set_index(t_por_fra['created_at'])
tweets2.index.name = None
tweets3 = tweets2.resample('1min').count()

# Plot the amount of tweets per minute during the match.

vincent.core.initialize_notebook()

area = vincent.Area(tweets3['created_at'])
area.colors(brew='Spectral')
area.display()

In [12]:
# Print the mean of tweets per minute made during the match.

print tweets3['created_at'].mean()

4645.30588235


In [13]:
# Analyze the most frecuent words made during the match.

local_stopwords = ['#porfra', 'portugal', 'france', 'rt', "c'est", u'\xe9', 'va']
stop_words = stopwords.words('english') + stopwords.words('french') + stopwords.words('portuguese') + stopwords.words('spanish') + local_stopwords
text = t_por_fra['text']

tokens = []
for txt in text.values:
    tokens.extend([t.lower().strip(":,.-!") for t in txt.split()])


filtered_tokens = [w for w in tokens if not w in stop_words]

freq_dist = nltk.FreqDist(filtered_tokens)
freq_dist.pop('')
lmc = freq_dist.most_common(20)
for elem in lmc:
    print elem

(u'ronaldo', 116335)
(u'#euro2016', 107128)
(u'#euro2016final', 85687)
(u'#por', 66802)
(u'stade', 52648)
(u'final', 45352)
(u'#frapor', 39942)
(u'win', 35489)
(u'2016', 32850)
(u'cristiano', 30523)
(u'#fra', 29981)
(u'euro', 27184)
(u'si', 24821)
(u'vs', 23092)
(u'@uefaeuro', 22044)
(u'vamos', 18644)
(u'@cristiano', 17579)
(u'time', 16694)
(u'go', 16464)
(u'#portugal', 15989)


In [14]:
# Analyze the most used bi-grams.
bgs = nltk.bigrams(filtered_tokens)

fdist = nltk.FreqDist(bgs)
for elem in fdist.most_common(10):
    print elem

((u'cristiano', u'ronaldo'), 17703)
((u'euro', u'2016'), 13225)
((u'2016', u'stade'), 12949)
((u'1998', u'stade'), 12901)
((u'stade', u'ronaldo'), 12227)
((u'#euro2016', u'#por'), 12194)
((u'ronaldo', u'2016'), 10796)
((u'ronaldo', u'1998'), 10219)
((u'#por', u'#euro2016final'), 10007)
((u'#euro2016final', u'#euro2016'), 8524)


In [15]:
# Analyze the most used tri-grams.
tgs = nltk.trigrams(filtered_tokens)

fdist = nltk.FreqDist(tgs)
for elem in fdist.most_common(10):
    print elem

((u'1998', u'stade', u'ronaldo'), 12024)
((u'ronaldo', u'2016', u'stade'), 9975)
((u'stade', u'ronaldo', u'2016'), 9941)
((u'ronaldo', u'1998', u'stade'), 9928)
((u'#euro2016', u'#por', u'#euro2016final'), 6574)
((u'2016', u'stade', u'spooky'), 6192)
((u'euro', u'2016', u'final'), 5579)
((u"can't", u'believe', u'ronaldo'), 5569)
((u'believe', u'ronaldo', u'cmon'), 5552)
((u'honestly', u'shook', u"can't"), 5552)


In [16]:
# Find the most retweeted tweets
mrt = pd.DataFrame(data=t_por_fra['text'].value_counts())
mrt[:10]

Unnamed: 0,text
rt @shawnmendes: honestly shook up. can't believe ronaldo is out. cmon portugal ❤️🇵🇹,5549
rt @footbalifights: ronaldo 1998. stade de france. ronaldo 2016. stade de france. spooky https://t.co/u2b4lc30tc,3773
rt @trollfootball: ronaldos do not like the stade de france... https://t.co/mpaeyeioim,2616
rt @shawnmendes: okay portugal. you got this. 🇵🇹🇵🇹,2055
rt @delpieroale: forza @cristiano ! #porfra,1944
"rt @9gag: ronaldo '98/ronaldo '16 both in finals, against france, same stadium #euro2016final #ronaldo\nhttps://t.co/ipyckmcqaz https://t.co…",1887
rt @sarasampaio: vamos é ganhar esta merda!!!!! portugal,1821
rt @foxsports_br: adversário: frança\nlocal: stade de france\ncoincidência ou maldição? https://t.co/ob8vpcjmqa,1371
rt @footyfiashbacks: ronaldo 1998. stade de france. ronaldo 2016. stade de france... https://t.co/iw215yqr1c,1363
watching portugal vs france,1143


In [16]:
# Los jugadores mas nombrados fueron:
# 1- (u'cristiano', 30523),(u'ronaldo', 116335)
# 2- (u'payet', 11508)
# 3- (u'sissoko', 10138)
# 4- (u'eder', 7528)
# 5- (u'quaresma', 5636)
# 6- (u'griezmann', 5549)
# 7- (u'rui', 5521)
# 8- (u'giroud', 4854)
# 9- (u'pepe', 4498)
# 10-(u'gignac', 3909)

In [17]:
# Plot the most named players of the match (and when where they named).

t1 = t_por_fra[t_por_fra['text'].str.contains('ronaldo')]
t1 = t1.set_index(t1['created_at'])
t1.index.name = None
t1 = t1.resample('1min').count()

t2 = t_por_fra[t_por_fra['text'].str.contains('payet')]
t2 = t2.set_index(t2['created_at'])
t2.index.name = None
t2 = t2.resample('1min').count()

t3 = t_por_fra[t_por_fra['text'].str.contains('sissoko')]
t3 = t3.set_index(t3['created_at'])
t3.index.name = None
t3 = t3.resample('1min').count()

t4 = t_por_fra[t_por_fra['text'].str.contains('eder')]
t4 = t4.set_index(t4['created_at'])
t4.index.name = None
t4 = t4.resample('1min').count()

t5 = t_por_fra[t_por_fra['text'].str.contains('quaresma')]
t5 = t5.set_index(t5['created_at'])
t5.index.name = None
t5 = t5.resample('1min').count()

vincent.core.initialize_notebook()

t1.drop(['created_at', 'geo', 'source', 'retweet_count', 'tweet_id', 'coordinates', 'favorite_count', 'in_reply_to_status_id', 'in_reply_to_user_id', 'lang'],axis=1, inplace=True)
t1.columns = ['Ronaldo']
t2.drop(['created_at', 'geo', 'source', 'retweet_count', 'tweet_id', 'coordinates', 'favorite_count', 'in_reply_to_status_id', 'in_reply_to_user_id', 'lang'],axis=1, inplace=True)
t2.columns = ['Payet']
t3.drop(['created_at', 'geo', 'source', 'retweet_count', 'tweet_id', 'coordinates', 'favorite_count', 'in_reply_to_status_id', 'in_reply_to_user_id', 'lang'],axis=1, inplace=True)
t3.columns = ['Sissoko']
t4.drop(['created_at', 'geo', 'source', 'retweet_count', 'tweet_id', 'coordinates', 'favorite_count', 'in_reply_to_status_id', 'in_reply_to_user_id', 'lang'],axis=1, inplace=True)
t4.columns = ['Eder']
t5.drop(['created_at', 'geo', 'source', 'retweet_count', 'tweet_id', 'coordinates', 'favorite_count', 'in_reply_to_status_id', 'in_reply_to_user_id', 'lang'],axis=1, inplace=True)
t5.columns = ['Quaresma']

t_fin = t1.join(t2)
t_fin = t_fin.join(t3)
t_fin = t_fin.join(t4)
t_fin = t_fin.join(t5)

lines = vincent.Line(t_fin)
lines.legend(title='Players')
lines.display()

In [18]:
# Analyze Quaresma's spike.

t_ram = t_por_fra[t_por_fra['text'].str.contains('quaresma')]

tweets2 = t_ram.set_index(t_ram['created_at'])
tweets2.index.name = None
tweets3 = tweets2.resample('1min').count()

# Find the spike.
print tweets3['created_at'].idxmax()

# Isolate the spike.
mask3 = (t_ram['created_at'] < '2016-07-10 19:25:00') | (t_ram['created_at'] > '2016-07-10 19:27:00')
tweets4 = t_ram.copy()
tweets4.loc[mask3]
tweets4.drop(tweets4.loc[mask3].index, inplace=True)
tweets4.count()


# Analyze the most frequent words during the spike.
local_stopwords = ['rt', 'vs']
stop_words = stopwords.words('english') + stopwords.words('french') + stopwords.words('portuguese') + local_stopwords
text = tweets4['text']


tokens = []
for txt in text.values:
    tokens.extend([t.lower().strip(":,.-") for t in txt.split()])


filtered_tokens = [w for w in tokens if not w in stop_words]

# Print the most common words during the spike.
freq_dist = nltk.FreqDist(filtered_tokens)
freq_dist.pop('')
lmc = freq_dist.most_common(20)
print lmc

# Print the most retweeted tweets during the spike
mrt = pd.DataFrame(data=tweets4['text'].value_counts())
print mrt[:10]


2016-07-10 19:26:00
[(u'quaresma', 303), (u'ronaldo', 190), (u'#porfra', 189), (u'portugal', 115), (u'ricardo', 103), (u'#euro2016', 97), (u'cristiano', 84), (u'#euro2016final', 79), (u'@uefaeuro', 62), (u'#por', 62), (u'stretcher', 62), (u'pitch', 61), (u'f\xfcr', 61), (u'24', 61), (u'leaves', 60), (u'replaced', 53), (u'go', 52), (u"can't", 52), (u'heartbreaking', 51), (u"he's", 51)]
                                                    text
rt @uefaeuro: 24 - quaresma is on for ronaldo, ...    59
rt @espnfc: cristiano ronaldo can't go on. he's...    51
rt @fcbayern: rückschlag für #por: @cristiano m...    28
rt @5livesport: cristiano's night is over.\n\nq...    12
rt @bola24pt: franceses e portugueses aplaudem ...    11
rt @catarinaliz: bora, tragam o quaresma, calma...     7
rt @plantaofutebol2: não deu! cristiano ronaldo...     6
rt @haberkartali: quaresma, 23. dakikada sakatl...     5
rt @catarinaliz: o teu máximo quaresma, precisa...     4
rt @footballitalia: cristiano ronaldo stre

In [19]:
# Popular hashtags.
hashtags = [x for x in filtered_tokens if x.startswith('#')]

hashtag_fd = nltk.FreqDist(hashtags)
hmc = hashtag_fd.most_common(20)
print hmc[0:10]

[(u'#porfra', 189), (u'#euro2016', 97), (u'#euro2016final', 79), (u'#por', 62), (u'#quaresma', 10), (u'#portugal', 4), (u'#fra', 3), (u'#cr7', 2), (u'#eurocopa', 2), (u'#frapor', 2)]


In [20]:
# Popular mentions ('@username')
mentions = [x for x in filtered_tokens if x.startswith('@')]

mentions_fd = nltk.FreqDist(mentions)
mmc = mentions_fd.most_common(20)
print mmc[0:10]

[(u'@uefaeuro', 62), (u'@espnfc', 51), (u'@cristiano', 33), (u'@fcbayern', 28), (u'@5livesport', 12), (u'@catarinaliz', 11), (u'@bola24pt', 11), (u'@plantaofutebol2', 6), (u'@haberkartali', 5), (u'@footballitalia', 2)]


In [21]:
# Amount of geographical data.
print len(t_por_fra['coordinates'].dropna())

1740


In [23]:
# Map the geographical data.

# Get the geographical data.
res = t_por_fra['coordinates'].dropna()
res2 = res.value_counts()
aux = res2.to_dict()

import json

l = []
for k,v in aux.iteritems():
    kj = json.loads(k)
    l.append((kj['coordinates'],v))
    

map_por_fra = folium.Map()

for elem in l:
    if elem[1] < 5:
        folium.CircleMarker(location=[elem[0][1], elem[0][0]], radius=500, color='#3186cc',fill_color='#3186cc').add_to(map_por_fra)
    elif elem[1] < 10:
        folium.CircleMarker(location=[elem[0][1], elem[0][0]], radius=500, color='yellow',fill_color='yellow').add_to(map_por_fra)
    elif elem[1] < 15:
        folium.CircleMarker(location=[elem[0][1], elem[0][0]], radius=500, color='orange',fill_color='orange').add_to(map_por_fra)
    else:
        folium.CircleMarker(location=[elem[0][1], elem[0][0]], radius=500, color='red',fill_color='red').add_to(map_por_fra)
        
map_por_fra