In [1]:
##### Import relevant libraries #####
import pandas as pd
import plotly as px
import sklearn 
import pendulum
#####                           #####

In [2]:
# Load the games table from the CSV file (the path is relative to the
# notebook's working directory) and preview the first rows.
games = pd.read_csv("../data/games.csv")
games.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore
0,2022090800,2022,1,9/8/2022,20:20:00,LA,BUF,10,31
1,2022091100,2022,1,9/11/2022,13:00:00,ATL,NO,26,27
2,2022091101,2022,1,9/11/2022,13:00:00,CAR,CLE,24,26
3,2022091102,2022,1,9/11/2022,13:00:00,CHI,SF,19,10
4,2022091103,2022,1,9/11/2022,13:00:00,CIN,PIT,20,23


In [3]:
# Dimensions of the dataset as a (rows, columns) tuple
games.shape

(136, 9)

In [4]:
# Print column names, non-null counts, dtypes, and memory usage
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136 entries, 0 to 135
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   gameId             136 non-null    int64 
 1   season             136 non-null    int64 
 2   week               136 non-null    int64 
 3   gameDate           136 non-null    object
 4   gameTimeEastern    136 non-null    object
 5   homeTeamAbbr       136 non-null    object
 6   visitorTeamAbbr    136 non-null    object
 7   homeFinalScore     136 non-null    int64 
 8   visitorFinalScore  136 non-null    int64 
dtypes: int64(5), object(4)
memory usage: 9.7+ KB


In [9]:
def convert_to_paris(row):
    """Convert one game's New York kickoff time to Paris local time.

    Parameters
    ----------
    row : pandas.Series
        One row of ``games``. It must carry ``gameTimeNY``, a naive timestamp
        holding the kickoff as wall-clock time in New York.

    Returns
    -------
    str
        The kickoff expressed in the Europe/Paris timezone, formatted as
        ``%m-%d-%Y %H:%M:%S``.
    """
    eastern_tz = "America/New_York"
    # Attach the Eastern timezone to the naive timestamp. Named zones are used
    # (rather than fixed offsets) so the offset is resolved for each date.
    eastern_time = pendulum.instance(row["gameTimeNY"], eastern_tz)
    # Convert to Paris timezone
    paris_time = eastern_time.in_tz("Europe/Paris")

    return paris_time.strftime("%m-%d-%Y %H:%M:%S")


# Build the gameTimeNY column once, *before* calling apply: each row handed to
# convert_to_paris is constructed from `games` ahead of the call, so the column
# has to exist already. Creating it inside the function fails on a fresh kernel
# and recomputes the whole column for every row.
games["gameTimeNY"] = pd.to_datetime(
    games["gameDate"] + " " + games["gameTimeEastern"],
    format="%m/%d/%Y %H:%M:%S",
)

games["gameTimeParis"] = games.apply(convert_to_paris, axis=1)

games.head()

Unnamed: 0,gameId,season,week,gameDate,gameTimeEastern,homeTeamAbbr,visitorTeamAbbr,homeFinalScore,visitorFinalScore,gameTimeNY,gameTimeParis
0,2022090800,2022,1,9/8/2022,20:20:00,LA,BUF,10,31,2022-09-08 20:20:00,09-09-2022 02:20:00
1,2022091100,2022,1,9/11/2022,13:00:00,ATL,NO,26,27,2022-09-11 13:00:00,09-11-2022 19:00:00
2,2022091101,2022,1,9/11/2022,13:00:00,CAR,CLE,24,26,2022-09-11 13:00:00,09-11-2022 19:00:00
3,2022091102,2022,1,9/11/2022,13:00:00,CHI,SF,19,10,2022-09-11 13:00:00,09-11-2022 19:00:00
4,2022091103,2022,1,9/11/2022,13:00:00,CIN,PIT,20,23,2022-09-11 13:00:00,09-11-2022 19:00:00


### Information of the dataset:
* **gameId**: Unique game identifier (numeric)
* **season**: Season of game (numeric)
* **week**: Week of game (numeric)
* **gameDate**: Game Date (time, mm/dd/yyyy)
* **gameTimeEastern**: Start time of game (time, HH:MM:SS, US Eastern Time)
* **homeTeamAbbr**: Home team abbreviation, two or three letters (text)
* **visitorTeamAbbr**: Visiting team abbreviation, two or three letters (text)
* **homeFinalScore**: The total amount of points scored by the home team in the game (numeric)
* **visitorFinalScore**: The total amount of points scored by the visiting team in the game (numeric)

In [None]:
# Count the missing values in each column
games.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
games.duplicated().sum()

In [None]:
# Count the distinct values in each column
games.nunique()

### Unique values insights
* **Number of teams**: We have data for 32 different teams, each appearing both as home team and as visiting team
* **Season data**: We only have data for the 2022 season
* **Number of games covered**: We have data for 136 different games

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the dataset
games.describe()

### Insights
* **Relevant information**: The relevant columns for these insights are homeFinalScore and visitorFinalScore
* **Standard deviation**: The standard deviations of the two columns are very close (between 8.65 and 9.73)
* **Quartiles**: The quartiles of the two features are close, but there is a slight score advantage for the home teams



In [None]:
# Select the columns whose dtype is object or category as the candidate
# categorical features.
# NOTE(review): the date/time string columns (gameDate, gameTimeEastern,
# gameTimeParis) are object-typed too, so they are picked up here alongside
# the team abbreviations — confirm whether that is intended.
cat_types = ["object", "category"]
categorical_features = games.select_dtypes(include=cat_types).columns

categorical_features