# PROJECT TITLE: FINDING THE MOST INTERESTING TWEETS AND FAN SENTIMENT DURING A GAME TELECAST IN REALTIME

# PART 4: PREPROCESSING OF TWEET DATA:

In [1]:
import numpy as np
import pandas as pd
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings; consider
# narrowing the filter to specific categories if that becomes a problem.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw tweet dump; the path is relative to the notebook's working directory.
df = pd.read_csv('FIFA.csv')
# Print column names, non-null counts and dtypes to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530000 entries, 0 to 529999
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   ID                530000 non-null  int64 
 1   lang              530000 non-null  object
 2   Date              530000 non-null  object
 3   Source            530000 non-null  object
 4   len               530000 non-null  int64 
 5   Orig_Tweet        530000 non-null  object
 6   Tweet             529449 non-null  object
 7   Likes             530000 non-null  int64 
 8   RTs               530000 non-null  int64 
 9   Hashtags          468457 non-null  object
 10  UserMentionNames  455841 non-null  object
 11  UserMentionID     455841 non-null  object
 12  Name              529945 non-null  object
 13  Place             390710 non-null  object
 14  Followers         530000 non-null  int64 
 15  Friends           530000 non-null  int64 
dtypes: int64(6), object(10)
memory usage: 

The following columns are important for sentiment analysis:
1. Date - Maps events in matches to tweets
2. Tweet - Actual tweet content
3. Likes - Weight of sentiment
4. RTs - Weight of sentiment
5. Hashtags - Hashtags often reference a player / team
6. UserMentionNames / UserMentionID - Mentioned accounts often reference a player / team

In [3]:
# Discard the columns that the sentiment analysis does not use. The frame is
# modified in place, so re-running this cell on the same `df` raises KeyError.
unused_columns = [
    'ID', 'lang', 'Source', 'len', 'Orig_Tweet',
    'Name', 'Place', 'Followers', 'Friends',
]
df.drop(columns=unused_columns, inplace=True)
df

Unnamed: 0,Date,Tweet,Likes,RTs,Hashtags,UserMentionNames,UserMentionID
0,2018-07-02 01:35:45,Only two goalkeepers have saved three penaltie...,0,477,"WorldCup,POR,ENG",Squawka Football,Squawka
1,2018-07-02 01:35:44,scores the winning penalty to send into the qu...,0,1031,WorldCup,"FC Barcelona,Ivan Rakitic,HNS | CFF","FCBarcelona,ivanrakitic,HNS_CFF"
2,2018-07-02 01:35:42,Tonight we have big game,0,488,worldcup,"Javier Fernandez,Evgeni Plushenko","javierfernandez,EvgeniPlushenko"
3,2018-07-02 01:35:41,We get stronger Turn the music up now We got t...,0,0,"PowerByEXO,WorldCup,FIFAStadiumDJ,XiuminLeague","EXO,FIFA World Cup ?","weareoneEXO,FIFAWorldCup"
4,2018-07-02 01:35:40,Only two goalkeepers have saved three penaltie...,0,477,"WorldCup,POR,ENG",Squawka Football,Squawka
...,...,...,...,...,...,...,...
529995,2018-07-15 22:49:12,France have won the FIFA in Moscow,0,63163,"FRA,WorldCup,FRACRO,WorldCupFinal",FIFA World Cup,FIFAWorldCup
529996,2018-07-15 22:49:12,Beyonc JAY performed in blue jersey to celebra...,0,687,"WorldCup,OTRII,Paris,Round2",BEYONCÉ LEGION,BeyLegion
529997,2018-07-15 22:49:12,They don say immigrants are ruining France whe...,0,119,WorldCup,Khaled Beydoun,KhaledBeydoun
529998,2018-07-15 22:49:12,starts for in todays final,0,1013,"FRA,WorldCup","Manchester United,Paul Pogba","ManUtd,paulpogba"


In [4]:
# Split the 'YYYY-MM-DD HH:MM:SS' timestamp string on the space into separate
# 'date' and 'time' string columns, then drop the combined 'Date' column.
# The split is vectorised with the .str accessor instead of a Python loop over
# every row; piece 0 is the date and piece 1 is the time.
date_time = df['Date'].str.split(' ', expand=True)
# Fail loudly if any timestamp lacks a date or time part instead of letting
# NaN flow silently into the new columns.
assert date_time[[0, 1]].notna().all().all(), "malformed 'Date' value"
df = df.assign(date=date_time[0], time=date_time[1]).drop(columns=['Date'])
df

Unnamed: 0,Tweet,Likes,RTs,Hashtags,UserMentionNames,UserMentionID,date,time
0,Only two goalkeepers have saved three penaltie...,0,477,"WorldCup,POR,ENG",Squawka Football,Squawka,2018-07-02,01:35:45
1,scores the winning penalty to send into the qu...,0,1031,WorldCup,"FC Barcelona,Ivan Rakitic,HNS | CFF","FCBarcelona,ivanrakitic,HNS_CFF",2018-07-02,01:35:44
2,Tonight we have big game,0,488,worldcup,"Javier Fernandez,Evgeni Plushenko","javierfernandez,EvgeniPlushenko",2018-07-02,01:35:42
3,We get stronger Turn the music up now We got t...,0,0,"PowerByEXO,WorldCup,FIFAStadiumDJ,XiuminLeague","EXO,FIFA World Cup ?","weareoneEXO,FIFAWorldCup",2018-07-02,01:35:41
4,Only two goalkeepers have saved three penaltie...,0,477,"WorldCup,POR,ENG",Squawka Football,Squawka,2018-07-02,01:35:40
...,...,...,...,...,...,...,...,...
529995,France have won the FIFA in Moscow,0,63163,"FRA,WorldCup,FRACRO,WorldCupFinal",FIFA World Cup,FIFAWorldCup,2018-07-15,22:49:12
529996,Beyonc JAY performed in blue jersey to celebra...,0,687,"WorldCup,OTRII,Paris,Round2",BEYONCÉ LEGION,BeyLegion,2018-07-15,22:49:12
529997,They don say immigrants are ruining France whe...,0,119,WorldCup,Khaled Beydoun,KhaledBeydoun,2018-07-15,22:49:12
529998,starts for in todays final,0,1013,"FRA,WorldCup","Manchester United,Paul Pogba","ManUtd,paulpogba",2018-07-15,22:49:12


In [5]:
# Collapse the two engagement counters into a single sentiment weight:
#     weight = Likes + RTs
# Column arithmetic gives the same integer sums as adding the values one row
# at a time, without materialising three intermediate Python lists.
df = df.assign(weight=df['Likes'] + df['RTs']).drop(columns=['Likes', 'RTs'])
df

Unnamed: 0,Tweet,Hashtags,UserMentionNames,UserMentionID,date,time,weight
0,Only two goalkeepers have saved three penaltie...,"WorldCup,POR,ENG",Squawka Football,Squawka,2018-07-02,01:35:45,477
1,scores the winning penalty to send into the qu...,WorldCup,"FC Barcelona,Ivan Rakitic,HNS | CFF","FCBarcelona,ivanrakitic,HNS_CFF",2018-07-02,01:35:44,1031
2,Tonight we have big game,worldcup,"Javier Fernandez,Evgeni Plushenko","javierfernandez,EvgeniPlushenko",2018-07-02,01:35:42,488
3,We get stronger Turn the music up now We got t...,"PowerByEXO,WorldCup,FIFAStadiumDJ,XiuminLeague","EXO,FIFA World Cup ?","weareoneEXO,FIFAWorldCup",2018-07-02,01:35:41,0
4,Only two goalkeepers have saved three penaltie...,"WorldCup,POR,ENG",Squawka Football,Squawka,2018-07-02,01:35:40,477
...,...,...,...,...,...,...,...
529995,France have won the FIFA in Moscow,"FRA,WorldCup,FRACRO,WorldCupFinal",FIFA World Cup,FIFAWorldCup,2018-07-15,22:49:12,63163
529996,Beyonc JAY performed in blue jersey to celebra...,"WorldCup,OTRII,Paris,Round2",BEYONCÉ LEGION,BeyLegion,2018-07-15,22:49:12,687
529997,They don say immigrants are ruining France whe...,WorldCup,Khaled Beydoun,KhaledBeydoun,2018-07-15,22:49:12,119
529998,starts for in todays final,"FRA,WorldCup","Manchester United,Paul Pogba","ManUtd,paulpogba",2018-07-15,22:49:12,1013


In [6]:
# Turn the comma-separated 'Hashtags' string into a Python list per tweet,
# stored in 'HTs', and drop the original column.
# Tweets without hashtags (NaN) get a genuinely empty list. Note that
# ''.split(',') would yield [''] - a one-element list holding an empty
# string - which pandas renders as "[]" and is easy to mistake for empty.
hashtag_lists = [
    tags.split(',') if tags else []
    for tags in df['Hashtags'].fillna('')
]
df = df.assign(HTs=hashtag_lists).drop(columns=['Hashtags'])
df

Unnamed: 0,Tweet,UserMentionNames,UserMentionID,date,time,weight,HTs
0,Only two goalkeepers have saved three penaltie...,Squawka Football,Squawka,2018-07-02,01:35:45,477,"[WorldCup, POR, ENG]"
1,scores the winning penalty to send into the qu...,"FC Barcelona,Ivan Rakitic,HNS | CFF","FCBarcelona,ivanrakitic,HNS_CFF",2018-07-02,01:35:44,1031,[WorldCup]
2,Tonight we have big game,"Javier Fernandez,Evgeni Plushenko","javierfernandez,EvgeniPlushenko",2018-07-02,01:35:42,488,[worldcup]
3,We get stronger Turn the music up now We got t...,"EXO,FIFA World Cup ?","weareoneEXO,FIFAWorldCup",2018-07-02,01:35:41,0,"[PowerByEXO, WorldCup, FIFAStadiumDJ, XiuminLe..."
4,Only two goalkeepers have saved three penaltie...,Squawka Football,Squawka,2018-07-02,01:35:40,477,"[WorldCup, POR, ENG]"
...,...,...,...,...,...,...,...
529995,France have won the FIFA in Moscow,FIFA World Cup,FIFAWorldCup,2018-07-15,22:49:12,63163,"[FRA, WorldCup, FRACRO, WorldCupFinal]"
529996,Beyonc JAY performed in blue jersey to celebra...,BEYONCÉ LEGION,BeyLegion,2018-07-15,22:49:12,687,"[WorldCup, OTRII, Paris, Round2]"
529997,They don say immigrants are ruining France whe...,Khaled Beydoun,KhaledBeydoun,2018-07-15,22:49:12,119,[WorldCup]
529998,starts for in todays final,"Manchester United,Paul Pogba","ManUtd,paulpogba",2018-07-15,22:49:12,1013,"[FRA, WorldCup]"


In [7]:
# Turn the comma-separated 'UserMentionNames' string into a Python list per
# tweet, stored in 'UMN', and drop the original column.
# Tweets that mention nobody (NaN) get a genuinely empty list. Note that
# ''.split(',') would yield [''] - a one-element list holding an empty
# string - which pandas renders as "[]" and is easy to mistake for empty.
# NOTE(review): a display name that itself contains a comma would be split
# into two entries - confirm against the raw data.
mention_name_lists = [
    names.split(',') if names else []
    for names in df['UserMentionNames'].fillna('')
]
df = df.assign(UMN=mention_name_lists).drop(columns=['UserMentionNames'])
df

Unnamed: 0,Tweet,UserMentionID,date,time,weight,HTs,UMN
0,Only two goalkeepers have saved three penaltie...,Squawka,2018-07-02,01:35:45,477,"[WorldCup, POR, ENG]",[Squawka Football]
1,scores the winning penalty to send into the qu...,"FCBarcelona,ivanrakitic,HNS_CFF",2018-07-02,01:35:44,1031,[WorldCup],"[FC Barcelona, Ivan Rakitic, HNS | CFF]"
2,Tonight we have big game,"javierfernandez,EvgeniPlushenko",2018-07-02,01:35:42,488,[worldcup],"[Javier Fernandez, Evgeni Plushenko]"
3,We get stronger Turn the music up now We got t...,"weareoneEXO,FIFAWorldCup",2018-07-02,01:35:41,0,"[PowerByEXO, WorldCup, FIFAStadiumDJ, XiuminLe...","[EXO, FIFA World Cup ?]"
4,Only two goalkeepers have saved three penaltie...,Squawka,2018-07-02,01:35:40,477,"[WorldCup, POR, ENG]",[Squawka Football]
...,...,...,...,...,...,...,...
529995,France have won the FIFA in Moscow,FIFAWorldCup,2018-07-15,22:49:12,63163,"[FRA, WorldCup, FRACRO, WorldCupFinal]",[FIFA World Cup]
529996,Beyonc JAY performed in blue jersey to celebra...,BeyLegion,2018-07-15,22:49:12,687,"[WorldCup, OTRII, Paris, Round2]",[BEYONCÉ LEGION]
529997,They don say immigrants are ruining France whe...,KhaledBeydoun,2018-07-15,22:49:12,119,[WorldCup],[Khaled Beydoun]
529998,starts for in todays final,"ManUtd,paulpogba",2018-07-15,22:49:12,1013,"[FRA, WorldCup]","[Manchester United, Paul Pogba]"


In [8]:
# Turn the comma-separated 'UserMentionID' string into a Python list per
# tweet, stored in 'UMID', and drop the original column.
# Tweets that mention nobody (NaN) get a genuinely empty list. Note that
# ''.split(',') would yield [''] - a one-element list holding an empty
# string - which pandas renders as "[]" and is easy to mistake for empty.
mention_id_lists = [
    ids.split(',') if ids else []
    for ids in df['UserMentionID'].fillna('')
]
df = df.assign(UMID=mention_id_lists).drop(columns=['UserMentionID'])
df

Unnamed: 0,Tweet,date,time,weight,HTs,UMN,UMID
0,Only two goalkeepers have saved three penaltie...,2018-07-02,01:35:45,477,"[WorldCup, POR, ENG]",[Squawka Football],[Squawka]
1,scores the winning penalty to send into the qu...,2018-07-02,01:35:44,1031,[WorldCup],"[FC Barcelona, Ivan Rakitic, HNS | CFF]","[FCBarcelona, ivanrakitic, HNS_CFF]"
2,Tonight we have big game,2018-07-02,01:35:42,488,[worldcup],"[Javier Fernandez, Evgeni Plushenko]","[javierfernandez, EvgeniPlushenko]"
3,We get stronger Turn the music up now We got t...,2018-07-02,01:35:41,0,"[PowerByEXO, WorldCup, FIFAStadiumDJ, XiuminLe...","[EXO, FIFA World Cup ?]","[weareoneEXO, FIFAWorldCup]"
4,Only two goalkeepers have saved three penaltie...,2018-07-02,01:35:40,477,"[WorldCup, POR, ENG]",[Squawka Football],[Squawka]
...,...,...,...,...,...,...,...
529995,France have won the FIFA in Moscow,2018-07-15,22:49:12,63163,"[FRA, WorldCup, FRACRO, WorldCupFinal]",[FIFA World Cup],[FIFAWorldCup]
529996,Beyonc JAY performed in blue jersey to celebra...,2018-07-15,22:49:12,687,"[WorldCup, OTRII, Paris, Round2]",[BEYONCÉ LEGION],[BeyLegion]
529997,They don say immigrants are ruining France whe...,2018-07-15,22:49:12,119,[WorldCup],[Khaled Beydoun],[KhaledBeydoun]
529998,starts for in todays final,2018-07-15,22:49:12,1013,"[FRA, WorldCup]","[Manchester United, Paul Pogba]","[ManUtd, paulpogba]"


In [9]:
# Break the 'YYYY-MM-DD' date string into integer 'day' and 'mon' columns and
# drop the string column. The year (piece 0) is not kept.
# The split is vectorised; the cast is pinned to int64 so the dtype does not
# depend on the platform's default integer width.
date_parts = df['date'].str.split('-', expand=True)
df = df.assign(
    day=date_parts[2].astype('int64'),
    mon=date_parts[1].astype('int64'),
).drop(columns=['date'])
df

Unnamed: 0,Tweet,time,weight,HTs,UMN,UMID,day,mon
0,Only two goalkeepers have saved three penaltie...,01:35:45,477,"[WorldCup, POR, ENG]",[Squawka Football],[Squawka],2,7
1,scores the winning penalty to send into the qu...,01:35:44,1031,[WorldCup],"[FC Barcelona, Ivan Rakitic, HNS | CFF]","[FCBarcelona, ivanrakitic, HNS_CFF]",2,7
2,Tonight we have big game,01:35:42,488,[worldcup],"[Javier Fernandez, Evgeni Plushenko]","[javierfernandez, EvgeniPlushenko]",2,7
3,We get stronger Turn the music up now We got t...,01:35:41,0,"[PowerByEXO, WorldCup, FIFAStadiumDJ, XiuminLe...","[EXO, FIFA World Cup ?]","[weareoneEXO, FIFAWorldCup]",2,7
4,Only two goalkeepers have saved three penaltie...,01:35:40,477,"[WorldCup, POR, ENG]",[Squawka Football],[Squawka],2,7
...,...,...,...,...,...,...,...,...
529995,France have won the FIFA in Moscow,22:49:12,63163,"[FRA, WorldCup, FRACRO, WorldCupFinal]",[FIFA World Cup],[FIFAWorldCup],15,7
529996,Beyonc JAY performed in blue jersey to celebra...,22:49:12,687,"[WorldCup, OTRII, Paris, Round2]",[BEYONCÉ LEGION],[BeyLegion],15,7
529997,They don say immigrants are ruining France whe...,22:49:12,119,[WorldCup],[Khaled Beydoun],[KhaledBeydoun],15,7
529998,starts for in todays final,22:49:12,1013,"[FRA, WorldCup]","[Manchester United, Paul Pogba]","[ManUtd, paulpogba]",15,7


In [10]:
# Break the 'HH:MM:SS' time string into integer 'hr' and 'mnt' columns and
# drop the string column. Seconds (piece 2) are not kept, so from here on the
# data has one-minute resolution.
# The split is vectorised; the cast is pinned to int64 so the dtype does not
# depend on the platform's default integer width.
time_parts = df['time'].str.split(':', expand=True)
df = df.assign(
    hr=time_parts[0].astype('int64'),
    mnt=time_parts[1].astype('int64'),
).drop(columns=['time'])
df

Unnamed: 0,Tweet,weight,HTs,UMN,UMID,day,mon,hr,mnt
0,Only two goalkeepers have saved three penaltie...,477,"[WorldCup, POR, ENG]",[Squawka Football],[Squawka],2,7,1,35
1,scores the winning penalty to send into the qu...,1031,[WorldCup],"[FC Barcelona, Ivan Rakitic, HNS | CFF]","[FCBarcelona, ivanrakitic, HNS_CFF]",2,7,1,35
2,Tonight we have big game,488,[worldcup],"[Javier Fernandez, Evgeni Plushenko]","[javierfernandez, EvgeniPlushenko]",2,7,1,35
3,We get stronger Turn the music up now We got t...,0,"[PowerByEXO, WorldCup, FIFAStadiumDJ, XiuminLe...","[EXO, FIFA World Cup ?]","[weareoneEXO, FIFAWorldCup]",2,7,1,35
4,Only two goalkeepers have saved three penaltie...,477,"[WorldCup, POR, ENG]",[Squawka Football],[Squawka],2,7,1,35
...,...,...,...,...,...,...,...,...,...
529995,France have won the FIFA in Moscow,63163,"[FRA, WorldCup, FRACRO, WorldCupFinal]",[FIFA World Cup],[FIFAWorldCup],15,7,22,49
529996,Beyonc JAY performed in blue jersey to celebra...,687,"[WorldCup, OTRII, Paris, Round2]",[BEYONCÉ LEGION],[BeyLegion],15,7,22,49
529997,They don say immigrants are ruining France whe...,119,[WorldCup],[Khaled Beydoun],[KhaledBeydoun],15,7,22,49
529998,starts for in todays final,1013,"[FRA, WorldCup]","[Manchester United, Paul Pogba]","[ManUtd, paulpogba]",15,7,22,49


In [11]:
# Order the tweets by (month, day, hour, minute) and renumber the rows from 0.
# The sorted copy is bound to `df1`; `df` itself keeps its original order.
chronological_keys = ['mon', 'day', 'hr', 'mnt']
df1 = df.sort_values(by=chronological_keys).reset_index(drop=True)
df1

Unnamed: 0,Tweet,weight,HTs,UMN,UMID,day,mon,hr,mnt
0,Want to make this summer football that little ...,543,"[Win, WorldCup]",[Bargain Booze],[BargainBooze],29,6,23,56
1,Didier Deschamps has confirmed Benjamin Mendy ...,23,"[FRA, ARG]",[City Watch],[City_Watch],29,6,23,56
2,Please play Power by EXO EXO also performed du...,1300,[],"[Aeri Days, FIFA World Cup ?]","[aeridays, FIFAWorldCup]",29,6,23,56
3,weeks teams games goals The group stage was ev...,1259,[WorldCup],[B/R Football],[brfootball],29,6,23,56
4,Please play Power by EXO EXO also performed du...,1300,[],"[Aeri Days, FIFA World Cup ?]","[aeridays, FIFAWorldCup]",29,6,23,56
...,...,...,...,...,...,...,...,...,...
529995,Please guys please please help me ouuttt Pleas...,87,[],"[ShakaZuluOnTheBeats, #RecognizeMeEp - 10 Augu...","[K_SUPREME_ZA, TBanSA]",15,7,23,14
529996,Dear France Congratulations on winning the of ...,96217,[WorldCup],[Khaled Beydoun],[KhaledBeydoun],15,7,23,14
529997,France have won the FIFA in Moscow,63116,"[FRA, WorldCup, FRACRO, WorldCupFinal]",[FIFA World Cup],[FIFAWorldCup],15,7,23,14
529998,Islamophobia Xenophobia and racism are global ...,1,[],[Alex Florack],[crimsonsith720],15,7,23,14


In [12]:
# Write the cleaned, time-ordered tweets to disk (path relative to the working directory).
# The row index is written as an unnamed first column (to_csv default index=True).
# NOTE(review): the list-valued columns (HTs, UMN, UMID) are stored as their
# string representation, so a reader has to parse them back into lists.
df1.to_csv('tweets_proc.csv')