# PROJECT TITLE: FINDING THE MOST INTERESTING TWEETS AND FAN SENTIMENT DURING A GAME TELECAST IN REAL TIME

# PART 2: ANALYSIS OF SCHEDULE & EVENT DATA:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## DATASET CONTENT:

knockouts.csv has the following columns:
1. Category - Match Category in Knockout Tournament
2. DateTime - Start of Match Date and Time (Russia)
3. Team1 - First Team
4. Team2 - Second Team
5. Score1 - Number of Goals Scored by First Team
6. Score2 - Number of Goals Scored by Second Team
7. PenScore1 - Number of Goals Scored by First Team in Penalties
8. PenScore2 - Number of Goals Scored by Second Team in Penalties
9. Winner - Team that won

In [2]:
# Load the knockout schedule and replace missing shoot-out scores with 0
# (presumably the matches that were decided without penalties).
df = pd.read_csv('knockouts.csv').fillna({'PenScore1': 0, 'PenScore2': 0})
df

Unnamed: 0,Category,DateTime,Team1,Team2,Score1,Score2,PenScore1,PenScore2,Winner
0,ROUND OF 16,30-06-2018 17:00,FRANCE,ARGENTINA,4,3,0.0,0.0,FRANCE
1,ROUND OF 16,30-06-2018 21:00,URUGUAY,PORTUGAL,2,1,0.0,0.0,URUGUAY
2,ROUND OF 16,01-07-2018 17:00,SPAIN,RUSSIA,1,1,3.0,4.0,RUSSIA
3,ROUND OF 16,01-07-2018 21:00,CROATIA,DENMARK,1,1,3.0,2.0,CROATIA
4,ROUND OF 16,02-07-2018 18:00,BRAZIL,MEXICO,2,0,0.0,0.0,BRAZIL
5,ROUND OF 16,02-07-2018 21:00,BELGIUM,JAPAN,3,2,0.0,0.0,BELGIUM
6,ROUND OF 16,03-07-2018 17:00,SWEDEN,SWITZERLAND,1,0,0.0,0.0,SWEDEN
7,ROUND OF 16,03-07-2018 21:00,COLOMBIA,ENGLAND,1,1,3.0,4.0,ENGLAND
8,QUARTER FINALS,06-07-2018 17:00,URUGUAY,FRANCE,0,2,0.0,0.0,FRANCE
9,QUARTER FINALS,06-07-2018 21:00,BRAZIL,BELGIUM,1,2,0.0,0.0,BELGIUM


## 1. Get Total Count of Goals:

In [3]:
# Total goals across all fixtures: regulation goals plus shoot-out goals.
# Summing whole columns covers every row of `df`, so the figure stays correct
# if knockouts.csv gains or loses fixtures (no hard-coded row count).
goal_columns = ['Score1', 'Score2', 'PenScore1', 'PenScore2']
gc1 = df[goal_columns].sum().sum()
print('Total Goals in Knockouts : ', gc1)

Total Goals in Knockouts :  73.0


## 2. Check if Winners are Valid:

In [4]:
# Cross-check the recorded winner of every fixture against its scoreline.
# Shoot-out goals are added to regulation goals so that drawn matches decided
# on penalties resolve to the side that won the shoot-out.
# Nothing is printed when every fixture is consistent.
for match in df.itertuples(index=False):
    total1 = match.Score1 + match.PenScore1
    total2 = match.Score2 + match.PenScore2
    if total1 > total2:
        expected_winner = match.Team1
    elif total2 > total1:
        expected_winner = match.Team2
    else:
        # Level totals cannot decide a knockout fixture, so the score data is
        # inconsistent no matter which team the Winner column names.
        expected_winner = None
    if match.Winner != expected_winner:
        print('Error : ', match.Team1, 'vs', match.Team2,
              '- recorded winner', match.Winner,
              '- expected', expected_winner)

## 3. Check Distribution of Matches Played by Teams:

In [5]:
# Flatten Team1/Team2 row by row (Team1 of fixture 0, Team2 of fixture 0, ...)
# so teams keep their order of first appearance. Reading the columns directly
# covers every fixture in `df` instead of a hard-coded 16 rows.
team_list = df[['Team1', 'Team2']].to_numpy().ravel().tolist()

# Tally appearances in one pass; dict keys preserve insertion order.
match_counts = {}
for team in team_list:
    match_counts[team] = match_counts.get(team, 0) + 1
team_set = list(match_counts)

for team, count in match_counts.items():
    print("Team : ", team)
    print("Match Count : ", count)
    # Sanity bound: a team is expected to appear in between 1 and 4 fixtures.
    if (count<1 or count>4):
        print("Error")
    print("\n")

Team :  FRANCE
Match Count :  4


Team :  ARGENTINA
Match Count :  1


Team :  URUGUAY
Match Count :  2


Team :  PORTUGAL
Match Count :  1


Team :  SPAIN
Match Count :  1


Team :  RUSSIA
Match Count :  2


Team :  CROATIA
Match Count :  4


Team :  DENMARK
Match Count :  1


Team :  BRAZIL
Match Count :  2


Team :  MEXICO
Match Count :  1


Team :  BELGIUM
Match Count :  4


Team :  JAPAN
Match Count :  1


Team :  SWEDEN
Match Count :  2


Team :  SWITZERLAND
Match Count :  1


Team :  COLOMBIA
Match Count :  1


Team :  ENGLAND
Match Count :  4




## DATASET CONTENT:

events.csv has the following columns:
1. Timestamp - Indicates Time in Match
2. ExtraMins - Indicates Overtime Given in Half or Extra Time
3. Team1 - First Team
4. Team2 - Second Team
5. Player - Player Name
6. Team - Team Name of Player
7. Event - Event That Occurred at the Given Timestamp, Attributed to the Given Player

In [6]:
# Load the per-match event log into `df1`; the later event-distribution cells
# all read from this frame.
df1 = pd.read_csv('events.csv')
# Bare expression: render the frame as the cell's output.
df1

Unnamed: 0,Timestamp,ExtraMins,Team1,Team2,Player,Team,Event
0,0,0,FRANCE,ARGENTINA,,,START
1,1,0,FRANCE,ARGENTINA,GIROUD,FRANCE,FOUL
2,3,0,FRANCE,ARGENTINA,POGBA,FRANCE,FOUL
3,3,0,FRANCE,ARGENTINA,MATUIDI,FRANCE,FOUL
4,8,0,FRANCE,ARGENTINA,MASCHERANO,ARGENTINA,FOUL
...,...,...,...,...,...,...,...
1461,90,1,FRANCE,CROATIA,VRSALJKO,CROATIA,FOUL
1462,90,2,FRANCE,CROATIA,VRSALJKO,CROATIA,FOUL
1463,90,3,FRANCE,CROATIA,VRSALJKO,CROATIA,YELLOW CARD
1464,90,4,FRANCE,CROATIA,FEKIR,FRANCE,FOUL


## 4. Check Distribution of Events of Teams:

In [7]:
# Team attached to every event row, in file order. Rows with no team (such as
# the START marker) are NaN and are kept here but excluded from the tally.
# Reading the column directly removes the hard-coded 1465-row assumption.
event_list = df1['Team'].tolist()

# Count events per team in one pass; dict keys keep first-seen order.
event_counts = {}
for team in df1['Team'].dropna():
    event_counts[team] = event_counts.get(team, 0) + 1
event_set = list(event_counts)

for team, count in event_counts.items():
    print("Team : ", team)
    print("Event Count : ", count)
    print("\n")

Team :  FRANCE
Event Count :  129


Team :  ARGENTINA
Event Count :  37


Team :  URUGUAY
Event Count :  57


Team :  PORTUGAL
Event Count :  48


Team :  RUSSIA
Event Count :  87


Team :  SPAIN
Event Count :  43


Team :  DENMARK
Event Count :  43


Team :  CROATIA
Event Count :  207


Team :  MEXICO
Event Count :  42


Team :  BRAZIL
Event Count :  89


Team :  JAPAN
Event Count :  28


Team :  BELGIUM
Event Count :  155


Team :  SWITZERLAND
Event Count :  46


Team :  SWEDEN
Event Count :  66


Team :  COLUMBIA
Event Count :  72


Team :  ENGLAND
Event Count :  225




## 5. Check Distribution of Goals:

In [8]:
# Every GOAL event as a [player, team] pair, selected with a boolean mask so
# the cell no longer depends on events.csv having exactly 1465 rows.
goal_list = df1.loc[df1['Event'] == 'GOAL', ['Player', 'Team']].values.tolist()

# Tally goals per (player, team) in one pass; dict keys keep first-seen order.
goal_counts = {}
for player, team in goal_list:
    key = (player, team)
    goal_counts[key] = goal_counts.get(key, 0) + 1
goal_set = [list(key) for key in goal_counts]

# gc2 is the total number of GOAL events; it is compared with gc1 later on.
gc2 = 0
for (player, team), count in goal_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Goal Count : ", count)
    gc2 += count
    print("\n")

Player :  GRIEZMANN
Team :  FRANCE
Goal Count :  3


Player :  DI MARIA
Team :  ARGENTINA
Goal Count :  1


Player :  MERCADO
Team :  ARGENTINA
Goal Count :  1


Player :  PAVARD
Team :  FRANCE
Goal Count :  1


Player :  MBAPPE
Team :  FRANCE
Goal Count :  3


Player :  KUN AGUERO
Team :  ARGENTINA
Goal Count :  1


Player :  CAVANI
Team :  URUGUAY
Goal Count :  2


Player :  PEPE
Team :  PORTUGAL
Goal Count :  1


Player :  IGNASHEVICH
Team :  RUSSIA
Goal Count :  3


Player :  DZYUBA
Team :  RUSSIA
Goal Count :  1


Player :  INIESTA
Team :  SPAIN
Goal Count :  1


Player :  SMOLOV
Team :  RUSSIA
Goal Count :  1


Player :  PIQUE
Team :  SPAIN
Goal Count :  1


Player :  GOLOVIN
Team :  RUSSIA
Goal Count :  1


Player :  RAMOS
Team :  SPAIN
Goal Count :  1


Player :  CHERYSHEV
Team :  RUSSIA
Goal Count :  2


Player :  JORGENSEN
Team :  DENMARK
Goal Count :  1


Player :  MANDZUKIC
Team :  CROATIA
Goal Count :  4


Player :  KJAER
Team :  DENMARK
Goal Count :  1


Player :  KRAMARI

## 6. Check Distribution of Fouls:

In [9]:
# Every FOUL event as a [player, team] pair, selected with a boolean mask so
# the cell no longer depends on events.csv having exactly 1465 rows.
foul_list = df1.loc[df1['Event'] == 'FOUL', ['Player', 'Team']].values.tolist()

# Tally fouls per (player, team) in one pass; dict keys keep first-seen order.
foul_counts = {}
for player, team in foul_list:
    key = (player, team)
    foul_counts[key] = foul_counts.get(key, 0) + 1
foul_set = [list(key) for key in foul_counts]

for (player, team), count in foul_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Foul Count : ", count)
    print("\n")

Player :  GIROUD
Team :  FRANCE
Foul Count :  11


Player :  POGBA
Team :  FRANCE
Foul Count :  11


Player :  MATUIDI
Team :  FRANCE
Foul Count :  7


Player :  MASCHERANO
Team :  ARGENTINA
Foul Count :  4


Player :  MARCOS ROJO
Team :  ARGENTINA
Foul Count :  1


Player :  PEREZ
Team :  ARGENTINA
Foul Count :  2


Player :  TAGLIAFICO
Team :  ARGENTINA
Foul Count :  1


Player :  PAVARD
Team :  FRANCE
Foul Count :  3


Player :  DI MARIA
Team :  ARGENTINA
Foul Count :  1


Player :  GRIEZMANN
Team :  FRANCE
Foul Count :  3


Player :  MERCADO
Team :  ARGENTINA
Foul Count :  5


Player :  OTAMENDI
Team :  ARGENTINA
Foul Count :  1


Player :  KANTE
Team :  FRANCE
Foul Count :  7


Player :  FEKIR
Team :  FRANCE
Foul Count :  3


Player :  HERNANDEZ
Team :  FRANCE
Foul Count :  6


Player :  TOLISSO
Team :  FRANCE
Foul Count :  1


Player :  CAVANI
Team :  URUGUAY
Foul Count :  1


Player :  TORREIRA
Team :  URUGUAY
Foul Count :  3


Player :  NANDEZ
Team :  URUGUAY
Foul Count :  4




## 7. Check Distribution of Free Kicks:

In [10]:
# Every FREE KICK event as a [player, team] pair, selected with a boolean mask
# so the cell no longer depends on events.csv having exactly 1465 rows.
fk_list = df1.loc[df1['Event'] == 'FREE KICK', ['Player', 'Team']].values.tolist()

# Tally free kicks per (player, team) in one pass; dict keys keep first-seen order.
fk_counts = {}
for player, team in fk_list:
    key = (player, team)
    fk_counts[key] = fk_counts.get(key, 0) + 1
fk_set = [list(key) for key in fk_counts]

for (player, team), count in fk_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Free Kick Count : ", count)
    print("\n")

Player :  GRIEZMANN
Team :  FRANCE
Free Kick Count :  3


Player :  POGBA
Team :  FRANCE
Free Kick Count :  1


Player :  MESSI
Team :  ARGENTINA
Free Kick Count :  1


Player :  SUAREZ
Team :  URUGUAY
Free Kick Count :  1


Player :  RONALDO
Team :  PORTUGAL
Free Kick Count :  1


Player :  PERISIC
Team :  CROATIA
Free Kick Count :  1


Player :  NEYMAR JR
Team :  BRAZIL
Free Kick Count :  1


Player :  HONDA
Team :  JAPAN
Free Kick Count :  1


Player :  RODRIGUEZ
Team :  SWITZERLAND
Free Kick Count :  1


Player :  FORSBERG
Team :  SWEDEN
Free Kick Count :  2


Player :  TOIVONEN
Team :  SWEDEN
Free Kick Count :  1


Player :  KANE
Team :  ENGLAND
Free Kick Count :  2


Player :  OSPINA
Team :  COLUMBIA
Free Kick Count :  8


Player :  TRIPPER
Team :  ENGLAND
Free Kick Count :  8


Player :  YOUNG
Team :  ENGLAND
Free Kick Count :  8


Player :  PICKFORD
Team :  ENGLAND
Free Kick Count :  18


Player :  SANCHEZ
Team :  COLUMBIA
Free Kick Count :  2


Player :  WALKER
Team :  ENGLAND

## 8. Check Distribution of Attempts:

In [11]:
# Every ATTEMPT GOAL event as a [player, team] pair, selected with a boolean
# mask so the cell no longer depends on events.csv having exactly 1465 rows.
attempt_list = df1.loc[df1['Event'] == 'ATTEMPT GOAL', ['Player', 'Team']].values.tolist()

# Tally attempts per (player, team) in one pass; dict keys keep first-seen order.
attempt_counts = {}
for player, team in attempt_list:
    key = (player, team)
    attempt_counts[key] = attempt_counts.get(key, 0) + 1
attempt_set = [list(key) for key in attempt_counts]

for (player, team), count in attempt_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Attempt Count : ", count)
    print("\n")

Player :  GRIEZMANN
Team :  FRANCE
Attempt Count :  10


Player :  POGBA
Team :  FRANCE
Attempt Count :  4


Player :  BANEGA
Team :  ARGENTINA
Attempt Count :  1


Player :  MESSI
Team :  ARGENTINA
Attempt Count :  4


Player :  MATUIDI
Team :  FRANCE
Attempt Count :  3


Player :  GIROUD
Team :  FRANCE
Attempt Count :  9


Player :  KUN AGUERO
Team :  ARGENTINA
Attempt Count :  1


Player :  DI MARIA
Team :  ARGENTINA
Attempt Count :  2


Player :  BERNARDO
Team :  PORTUGAL
Attempt Count :  2


Player :  RONALDO
Team :  PORTUGAL
Attempt Count :  6


Player :  FONTE
Team :  PORTUGAL
Attempt Count :  1


Player :  SUAREZ
Team :  URUGUAY
Attempt Count :  3


Player :  GUEDES
Team :  PORTUGAL
Attempt Count :  3


Player :  WILLIAM
Team :  PORTUGAL
Attempt Count :  1


Player :  CAVANI
Team :  URUGUAY
Attempt Count :  1


Player :  RAPHAEL
Team :  PORTUGAL
Attempt Count :  2


Player :  ADRIEN
Team :  PORTUGAL
Attempt Count :  1


Player :  FERNANDES
Team :  PORTUGAL
Attempt Count :  2




## 9. Check Distribution of Blocks:

In [12]:
# Every BLOCK event as a [player, team] pair, selected with a boolean mask so
# the cell no longer depends on events.csv having exactly 1465 rows.
block_list = df1.loc[df1['Event'] == 'BLOCK', ['Player', 'Team']].values.tolist()

# Tally blocks per (player, team) in one pass; dict keys keep first-seen order.
block_counts = {}
for player, team in block_list:
    key = (player, team)
    block_counts[key] = block_counts.get(key, 0) + 1
block_set = [list(key) for key in block_counts]

for (player, team), count in block_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Block Count : ", count)
    print("\n")

Player :  OSPINA
Team :  COLUMBIA
Block Count :  1


Player :  MINA
Team :  COLUMBIA
Block Count :  2


Player :  SANCHEZ
Team :  COLUMBIA
Block Count :  1


Player :  MAGUIRE
Team :  ENGLAND
Block Count :  2


Player :  PICKFORD
Team :  ENGLAND
Block Count :  13


Player :  WALKER
Team :  ENGLAND
Block Count :  2


Player :  ARIAS
Team :  COLUMBIA
Block Count :  1


Player :  BARRIOS
Team :  COLUMBIA
Block Count :  1


Player :  GRANQVIST
Team :  SWEDEN
Block Count :  5


Player :  KRAFTH
Team :  SWEDEN
Block Count :  1


Player :  HENDERSON
Team :  ENGLAND
Block Count :  1


Player :  STONES
Team :  ENGLAND
Block Count :  5


Player :  VIDA
Team :  CROATIA
Block Count :  1


Player :  DELE
Team :  ENGLAND
Block Count :  1


Player :  LOVREN
Team :  CROATIA
Block Count :  1


Player :  PERISIC
Team :  CROATIA
Block Count :  1


Player :  VRSALJKO
Team :  CROATIA
Block Count :  1


Player :  DIER
Team :  ENGLAND
Block Count :  1


Player :  VERTONGHEN
Team :  BELGIUM
Block Count :  1



## 10. Check Distribution of Corners:

In [13]:
# Every CORNER event as a [player, team] pair, selected with a boolean mask so
# the cell no longer depends on events.csv having exactly 1465 rows.
corner_list = df1.loc[df1['Event'] == 'CORNER', ['Player', 'Team']].values.tolist()

# Tally corners per (player, team) in one pass; dict keys keep first-seen order.
corner_counts = {}
for player, team in corner_list:
    key = (player, team)
    corner_counts[key] = corner_counts.get(key, 0) + 1
corner_set = [list(key) for key in corner_counts]

for (player, team), count in corner_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Corner Count : ", count)
    print("\n")

Player :  DI MARIA
Team :  ARGENTINA
Corner Count :  2


Player :  BANEGA
Team :  ARGENTINA
Corner Count :  1


Player :  MESSI
Team :  ARGENTINA
Corner Count :  1


Player :  TORREIRA
Team :  URUGUAY
Corner Count :  6


Player :  RAPHAEL
Team :  PORTUGAL
Corner Count :  1


Player :  BERNARDO
Team :  PORTUGAL
Corner Count :  2


Player :  MARIO
Team :  PORTUGAL
Corner Count :  3


Player :  QUARESMA
Team :  PORTUGAL
Corner Count :  3


Player :  FERNANDES
Team :  PORTUGAL
Corner Count :  1


Player :  SAMEDOV
Team :  RUSSIA
Corner Count :  3


Player :  ZHIRKOV
Team :  RUSSIA
Corner Count :  1


Player :  ISCO
Team :  SPAIN
Corner Count :  3


Player :  ASENSIO
Team :  SPAIN
Corner Count :  2


Player :  KOKE
Team :  SPAIN
Corner Count :  1


Player :  GOLOVIN
Team :  RUSSIA
Corner Count :  4


Player :  MODRIC
Team :  CROATIA
Corner Count :  21


Player :  BRAITHWAITE
Team :  DENMARK
Corner Count :  1


Player :  ERIKSEN
Team :  DENMARK
Corner Count :  3


Player :  GUARDADO
Team :  

## 11. Check Distribution of Offsides:

In [14]:
# Every OFFSIDE event as a [player, team] pair, selected with a boolean mask so
# the cell no longer depends on events.csv having exactly 1465 rows.
offside_list = df1.loc[df1['Event'] == 'OFFSIDE', ['Player', 'Team']].values.tolist()

# Tally offsides per (player, team) in one pass; dict keys keep first-seen order.
offside_counts = {}
for player, team in offside_list:
    key = (player, team)
    offside_counts[key] = offside_counts.get(key, 0) + 1
offside_set = [list(key) for key in offside_counts]

for (player, team), count in offside_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Offside Count : ", count)
    print("\n")

Player :  DI MARIA
Team :  ARGENTINA
Offside Count :  1


Player :  GUEDES
Team :  PORTUGAL
Offside Count :  1


Player :  DZYUBA
Team :  RUSSIA
Offside Count :  1


Player :  COSTA
Team :  SPAIN
Offside Count :  1


Player :  MANDZUKIC
Team :  CROATIA
Offside Count :  2


Player :  KRAMARIC
Team :  CROATIA
Offside Count :  1


Player :  HERNANDEZ
Team :  MEXICO
Offside Count :  1


Player :  CARLOS
Team :  MEXICO
Offside Count :  1


Player :  OSAKO
Team :  JAPAN
Offside Count :  1


Player :  FELLAINI
Team :  BELGIUM
Offside Count :  1


Player :  TOIVONEN
Team :  SWEDEN
Offside Count :  1


Player :  FALCAO
Team :  COLUMBIA
Offside Count :  1


Player :  KANE
Team :  ENGLAND
Offside Count :  3


Player :  VARDY
Team :  ENGLAND
Offside Count :  1


Player :  NEYMAR JR
Team :  BRAZIL
Offside Count :  1


Player :  STERLING
Team :  ENGLAND
Offside Count :  1


Player :  BERG
Team :  SWEDEN
Offside Count :  1


Player :  SMOLOV
Team :  RUSSIA
Offside Count :  1


Player :  LUKAKU
Team :

## 12. Check Distribution of Cards:

In [15]:
# Every YELLOW CARD event as a [player, team] pair, selected with a boolean
# mask so the cell no longer depends on events.csv having exactly 1465 rows.
yc_list = df1.loc[df1['Event'] == 'YELLOW CARD', ['Player', 'Team']].values.tolist()

# Tally yellow cards per (player, team) in one pass; dict keys keep first-seen order.
yc_counts = {}
for player, team in yc_list:
    key = (player, team)
    yc_counts[key] = yc_counts.get(key, 0) + 1
yc_set = [list(key) for key in yc_counts]

for (player, team), count in yc_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Yellow Card Count : ", count)
    print("\n")

Player :  MARCOS ROJO
Team :  ARGENTINA
Yellow Card Count :  1


Player :  TAGLIAFICO
Team :  ARGENTINA
Yellow Card Count :  1


Player :  MASCHERANO
Team :  ARGENTINA
Yellow Card Count :  1


Player :  BANEGA
Team :  ARGENTINA
Yellow Card Count :  1


Player :  MATUIDI
Team :  FRANCE
Yellow Card Count :  1


Player :  PAVARD
Team :  FRANCE
Yellow Card Count :  1


Player :  OTAMENDI
Team :  ARGENTINA
Yellow Card Count :  1


Player :  GIROUD
Team :  FRANCE
Yellow Card Count :  1


Player :  RONALDO
Team :  PORTUGAL
Yellow Card Count :  1


Player :  PIQUE
Team :  SPAIN
Yellow Card Count :  1


Player :  KUTEPOV
Team :  RUSSIA
Yellow Card Count :  1


Player :  ZOBNIN
Team :  RUSSIA
Yellow Card Count :  1


Player :  JORGENSEN
Team :  DENMARK
Yellow Card Count :  1


Player :  ALVAREZ
Team :  MEXICO
Yellow Card Count :  1


Player :  LUIS
Team :  BRAZIL
Yellow Card Count :  1


Player :  HERRERA
Team :  MEXICO
Yellow Card Count :  1


Player :  CASEMIRO
Team :  BRAZIL
Yellow Card Count

In [16]:
# Every RED CARD event as a [player, team] pair, selected with a boolean mask
# so the cell no longer depends on events.csv having exactly 1465 rows.
rc_list = df1.loc[df1['Event'] == 'RED CARD', ['Player', 'Team']].values.tolist()

# Tally red cards per (player, team) in one pass; dict keys keep first-seen order.
rc_counts = {}
for player, team in rc_list:
    key = (player, team)
    rc_counts[key] = rc_counts.get(key, 0) + 1
rc_set = [list(key) for key in rc_counts]

for (player, team), count in rc_counts.items():
    print("Player : ", player)
    print("Team : ", team)
    print("Red Card Count : ", count)
    print("\n")

Player :  LANG
Team :  SWITZERLAND
Red Card Count :  1




## 13. Check Match List:

In [17]:
# Fixture ([Team1, Team2]) attached to every event row, in file order. Reading
# the columns directly removes the hard-coded 1465-row assumption.
fixture_columns = df1[['Team1', 'Team2']]
match_list = fixture_columns.values.tolist()

# Distinct fixtures in order of first appearance (drop_duplicates keeps the
# first occurrence of each pair).
# NOTE(review): events.csv spells the team "COLUMBIA" while knockouts.csv uses
# "COLOMBIA"; any join between the two files on team name will miss that side.
match_set = fixture_columns.drop_duplicates().values.tolist()

match_set

[['FRANCE', 'ARGENTINA'],
 ['URUGUAY', 'PORTUGAL'],
 ['SPAIN', 'RUSSIA'],
 ['CROATIA', 'DENMARK'],
 ['BRAZIL', 'MEXICO'],
 ['BELGIUM', 'JAPAN'],
 ['SWEDEN', 'SWITZERLAND'],
 ['COLUMBIA', 'ENGLAND'],
 ['URUGUAY', 'FRANCE'],
 ['BRAZIL', 'BELGIUM'],
 ['SWEDEN', 'ENGLAND'],
 ['RUSSIA', 'CROATIA'],
 ['FRANCE', 'BELGIUM'],
 ['CROATIA', 'ENGLAND'],
 ['BELGIUM', 'ENGLAND'],
 ['FRANCE', 'CROATIA']]

## 14. Validate Goal Count:

In [18]:
# gc1 is the goal total derived from the knockouts.csv scorelines (including
# shoot-out goals); gc2 is the number of GOAL events found in events.csv.
# Report a mismatch explicitly instead of printing nothing.
if gc1 == gc2:
    print('Valid Goal Count')
else:
    print('Invalid Goal Count : schedule total', gc1, '!= event total', gc2)

Valid Goal Count
