In [1]:
##### Import relevant libraries #####
import pandas as pd
import plotly.express as px
import sklearn 
import pendulum
import numpy as np
#####                           #####

In [2]:
# Load the games table from CSV into a DataFrame and preview its first rows.
# The path is relative, so it resolves against the notebook's working directory.
games = pd.read_csv("../data/games.csv")
games.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,9/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,9/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,9/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,9/11/2022,13:00:00,CIN,PIT,20,23


In [3]:
# Size of the table as a (rows, columns) tuple
games.shape

(136, 9)

In [4]:
# Per-column dtype and non-null count, plus overall memory usage
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   gameId             136 non-null    int64 
 1   season             136 non-null    int64 
 2   week               136 non-null    int64 
 3   gameDate           136 non-null    object
 4   gameTimeEastern    136 non-null    object
 5   homeTeamAbbr       136 non-null    object
 6   visitorTeamAbbr    136 non-null    object
 7   homeFinalScore     136 non-null    int64 
 8   visitorFinalScore  136 non-null    int64 
dtypes: int64(5), object(4)
memory usage: 9.7+ KB


In [5]:
# Join the date and kick-off time strings and parse them in one step into a
# naive datetime column (no timezone attached yet).
games["gameTimeNY"] = pd.to_datetime(
    games["gameDate"] + " " + games["gameTimeEastern"],
    format="%m/%d/%Y %H:%M:%S",
)


def convert_to_paris(row):
    """Translate one row's New York kick-off time into Paris local time.

    Reads ``row["gameTimeNY"]``, attaches the ``America/New_York`` timezone to
    it through pendulum, shifts the result to ``Europe/Paris`` and returns it
    as text formatted ``"%d-%m-%Y %H:%M:%S"``.
    """
    ny_kickoff = pendulum.instance(row["gameTimeNY"], "America/New_York")
    paris_kickoff = ny_kickoff.in_tz("Europe/Paris")
    return paris_kickoff.strftime("%d-%m-%Y %H:%M:%S")

# Express each kick-off in Paris local time using pandas' vectorised timezone
# accessors, instead of a row-wise apply() that formats every value to text and
# then parses that text back into a datetime:
#   1. tz_localize("America/New_York") - read the naive values as New York time
#   2. tz_convert("Europe/Paris")      - move the same instants to Paris time
#   3. tz_localize(None)               - drop the tz info so the column stays
#                                        naive, like gameTimeNY
# Note: tz_localize raises by default on wall-clock times that are ambiguous or
# non-existent around a daylight-saving change, rather than guessing.
games["gameTimeParis"] = (
    games["gameTimeNY"]
    .dt.tz_localize("America/New_York")
    .dt.tz_convert("Europe/Paris")
    .dt.tz_localize(None)
)

games.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore,gameTimeNY,gameTimeParis
0,2022090800,2022,1,9/8/2022,20:20:00,LA,BUF,10,31,2022-09-08 20:20:00,2022-09-09 02:20:00
1,2022091100,2022,1,9/11/2022,13:00:00,ATL,NO,26,27,2022-09-11 13:00:00,2022-09-11 19:00:00
2,2022091101,2022,1,9/11/2022,13:00:00,CAR,CLE,24,26,2022-09-11 13:00:00,2022-09-11 19:00:00
3,2022091102,2022,1,9/11/2022,13:00:00,CHI,SF,19,10,2022-09-11 13:00:00,2022-09-11 19:00:00
4,2022091103,2022,1,9/11/2022,13:00:00,CIN,PIT,20,23,2022-09-11 13:00:00,2022-09-11 19:00:00


### Information of the dataset:
* **gameId**: Game identifier, unique (numeric)
* **season**: Season of game (numeric)
* **week**: Week of game (numeric)
* **gameDate**: Game Date (time, mm/dd/yyyy)
* **gameTimeEastern**: Start time of game (time, HH:MM:SS, US Eastern Time)
* **homeTeamAbbr**: Home team abbreviation, two or three letters, e.g. `LA`, `BUF` (text)
* **visitorTeamAbbr**: Visiting team abbreviation, two or three letters, e.g. `NO`, `CLE` (text)
* **homeFinalScore**: The total amount of points scored by the home team in the game (numeric)
* **visitorFinalScore**: The total amount of points scored by the visiting team in the game (numeric)

In [6]:
# Count missing (NaN/NaT) values in each column
games.isna().sum()

gameId               0
season               0
week                 0
gameDate             0
gameTimeEastern      0
homeTeamAbbr         0
visitorTeamAbbr      0
homeFinalScore       0
visitorFinalScore    0
gameTimeNY           0
gameTimeParis        0
dtype: int64

In [7]:
# Count rows that repeat an earlier row across every column
games.duplicated().sum()

0

In [8]:
# Number of distinct values in each column
games.nunique()

gameId               136
season                 1
week                   9
gameDate              27
gameTimeEastern        8
homeTeamAbbr          32
visitorTeamAbbr       32
homeFinalScore        38
visitorFinalScore     35
gameTimeNY            57
gameTimeParis         57
dtype: int64

### Unique values insights
* **Number of teams**: All 32 teams appear both as home team and as visiting team
* **Season data**: The data covers a single season, 2022 (weeks 1 to 9)
* **Number of games covered**: The data covers 136 distinct games

In [9]:
# Summary statistics (count, mean, min, quartiles, max, std) for the numeric and datetime columns
games.describe()

Unnamed: 0,gameId,season,week,homeFinalScore,visitorFinalScore,gameTimeNY,gameTimeParis
count,136.0,136.0,136.0,136.0,136.0,136,136
mean,2022099000.0,2022.0,4.845588,22.669118,20.948529,2022-10-08 10:15:06.617647104,2022-10-08 16:08:29.558823424
min,2022091000.0,2022.0,1.0,3.0,0.0,2022-09-08 20:20:00,2022-09-09 02:20:00
25%,2022093000.0,2022.0,3.0,17.0,14.75,2022-09-25 13:00:00,2022-09-25 19:00:00
50%,2022101000.0,2022.0,5.0,22.5,20.0,2022-10-09 13:00:00,2022-10-09 19:00:00
75%,2022102000.0,2022.0,7.0,27.0,27.0,2022-10-23 13:46:15,2022-10-23 19:46:15
max,2022111000.0,2022.0,9.0,49.0,48.0,2022-11-07 20:15:00,2022-11-08 02:15:00
std,5996.614,0.0,2.567254,8.659874,9.731446,,


### Insights
* **Relevant columns**: The columns relevant to this analysis are `homeFinalScore` and `visitorFinalScore`
* **Standard deviation**: The two standard deviations are close: 8.66 for home scores and 9.73 for visitor scores
* **Quartiles**: The quartiles of the two columns are close, with a slight scoring advantage for home teams (Q1: 17 vs 14.75, median: 22.5 vs 20, Q3: 27 for both)



### Question
* Does home advantage influence the result of a game?

In [17]:
# Label each game "Home" or "Visitor" according to which side scored more.
# Games with equal final scores are labelled "Tie": a single home > visitor
# test would otherwise count every drawn game as a visitor win.
games["homeOrVisitorWin"] = np.select(
    [
        games["homeFinalScore"] > games["visitorFinalScore"],
        games["homeFinalScore"] < games["visitorFinalScore"],
    ],
    ["Home", "Visitor"],
    default="Tie",
)

In [18]:
# Preview the first rows, now including the homeOrVisitorWin column
games.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore,gameTimeNY,gameTimeParis,homeOrVisitorWin
0,2022090800,2022,1,9/8/2022,20:20:00,LA,BUF,10,31,2022-09-08 20:20:00,2022-09-09 02:20:00,Visitor
1,2022091100,2022,1,9/11/2022,13:00:00,ATL,NO,26,27,2022-09-11 13:00:00,2022-09-11 19:00:00,Visitor
2,2022091101,2022,1,9/11/2022,13:00:00,CAR,CLE,24,26,2022-09-11 13:00:00,2022-09-11 19:00:00,Visitor
3,2022091102,2022,1,9/11/2022,13:00:00,CHI,SF,19,10,2022-09-11 13:00:00,2022-09-11 19:00:00,Home
4,2022091103,2022,1,9/11/2022,13:00:00,CIN,PIT,20,23,2022-09-11 13:00:00,2022-09-11 19:00:00,Visitor


In [37]:
# plotly.express is already imported as `px` in the notebook's import cell, so
# it is not imported again here.

# Tally how many games fall under each outcome label.
count_wins = games["homeOrVisitorWin"].value_counts()
homeOrVisitor = count_wins.index   # outcome labels, most frequent first
countWins = count_wins.values      # matching number of games

palette_colors = {
    "Midnight green": "#095256",
    "Teal": "#087f8c",
}

# One label -> colour mapping shared by both charts. Colours are looked up by
# label, so a given outcome keeps the same colour in the bar and the pie chart
# regardless of the order value_counts() returns.
location_colors = {
    "Home": palette_colors["Midnight green"],
    "Visitor": palette_colors["Teal"],
}

# Bar chart: number of games per outcome label.
fig = px.bar(
    x=homeOrVisitor,
    y=countWins,
    color=homeOrVisitor,
    color_discrete_map=location_colors,
    labels={"x": "Team location", "y": "Number of Wins", "color": "Team location"},
    title="Home vs Visitor Wins",
)

# Set white background for the bar chart
fig.update_layout(
    plot_bgcolor='white',
)

fig.show()

# Pie chart: share of games per outcome label, coloured through the same
# mapping as the bar chart instead of a positional colour sequence.
fig_pie = px.pie(
    values=countWins,
    names=homeOrVisitor,
    color=homeOrVisitor,
    color_discrete_map=location_colors,
    title='Wins per Team Location',
)

fig_pie.show()
