# Milestone 1

**Comprendre les données et les outils de base pour l'analyse de données :**

- Data acquisition
- Debugging tools
- Cleaning data
- Visualisations simples
- Visualisations avancées


In [None]:
# Load IPython's autoreload extension so edits to imported modules are picked up without restarting the kernel
%load_ext autoreload
# Mode 2: reload all modules before executing each cell
%autoreload 2

In [None]:
from src import *


# 1 - Data acquisition
---

In [None]:
# First and last season to download; both bounds are handed to the loader below
start_year, end_year = 2016, 2022

# Fetch the NHL data for that season range
nhl_data_provider = get_data_from(start_year, end_year)

In [None]:
# Pretty-print the first 5 plays of entry 123 in the 2020 regular-season data
# NOTE(review): the previous comment said "2022 season" but the code reads 2020 — confirm which season is intended
pprint.pprint(nhl_data_provider.regular_season[2020][123]['plays'][0:5])


# 2 - Debugging tools
---

In [None]:
# Seasons offered in the widget: every year from start_year through end_year
season_list = list(range(start_year, end_year + 1))

# Interactive explorer: plot_nhl_data is called with the selected game type and season,
# while the data provider is passed through unchanged (ipywidgets.fixed)
ipywidgets.interact(
    plot_nhl_data,
    nhl_data_provider=ipywidgets.fixed(nhl_data_provider),
    game_type=['regular', 'playoff'],
    season=season_list,
)

# 3 - Data cleaning
---

In [None]:
# Clean the raw data; clean_data returns a pair that is unpacked into the
# regular-season and playoff results used by the rest of the notebook
clean_regular_season, clean_playoff = clean_data(nhl_data_provider)

In [None]:
# Preview the first rows (head) of entry 123 in the cleaned 2020 regular-season data
# NOTE(review): the previous comment said "2022 season" but the code reads 2020 — confirm which season is intended
clean_regular_season[2020][123].head()


# 4 - Visualisations simples
---

## Introduction

In [None]:
def _shot_distance_group(distance):
    """Return the distance-bucket label for a single shot distance."""
    # (exclusive upper bound, label) pairs, checked from the closest bucket outwards
    buckets = (
        (7, 'a.Under 7 ft.'),
        (15, 'b.7 - 14.9 ft.'),
        (30, 'c.15 - 29.9 ft.'),
        (45, 'd.30 - 44.9 ft.'),
    )
    for upper_bound, label in buckets:
        if distance < upper_bound:
            return label
    # Anything that is not below 45 falls through to the last bucket
    return 'e.Over 45 ft.'


# Merge the regular-season and playoff data of seasons 2018, 2019 and 2020 into one dataframe
df_2018_to_2020 = convert_dictionaries_to_dataframes(
    clean_regular_season,
    clean_playoff,
    [2018, 2019, 2020],
)

# Label every shot with its distance bucket
df_2018_to_2020['GroupShotDistance'] = df_2018_to_2020['shotDistance'].apply(_shot_distance_group)

# One dataframe per season; 'Year' is compared as a string here
df_2018 = df_2018_to_2020.loc[df_2018_to_2020['Year'].eq("2018")].reset_index()
df_2019 = df_2018_to_2020.loc[df_2018_to_2020['Year'].eq("2019")].reset_index()
df_2020 = df_2018_to_2020.loc[df_2018_to_2020['Year'].eq("2020")].reset_index()


df_2018_to_2020.head()

### Quel tir semble être le plus dangereux ? Le plus fréquent ?

**Première observation sur l'année 2022** :
- Le tir le plus dangereux devrait être le cradle (50% de buts), suivi du bat (25.59% de buts).
- Le tir le plus fréquent est le tip-in (N=5726).

In [None]:
# Get the correlation between the goal and the type of shot
df_2022 = convert_dictionaries_to_dataframes(clean_regular_season, clean_playoff, [2022])
corr_2022 = get_correlations_2variables(df_2022, 'shotType', 'typeDescKey')
corr_2022.head()

## Q1 - Corrélation entre la distance de tir et la réussite du but.

### Observations des années 2018, 2019 et 2020

### Saison 2018

In [None]:
# Get the correlation between the goal and the type of shot for the 2018 season
corr_2018 = get_correlations_2variables(df_2018, 'shotType', 'typeDescKey')

# Plot the correlation between the goal and the type of shot
plot_correlations_2variables(corr_2018, 'Total', 'goal')

### Saison 2019

In [None]:
# Get the correlation between the goal and the type of shot for the 2019 season
corr_2019 = get_correlations_2variables(df_2019, 'shotType', 'typeDescKey')

# Plot the correlation between the goal and the type of shot
plot_correlations_2variables(corr_2019, 'Total', 'goal')

### Saison 2020

In [None]:
# Get the correlation between the goal and the type of shot for the 2020 season
corr_2020 = get_correlations_2variables(df_2020, 'shotType', 'typeDescKey')

# Plot the correlation between the goal and the type of shot
plot_correlations_2variables(corr_2020, 'Total', 'goal')

### Observations au fil des années

In [None]:
# Correlation between 3 variables: goal, shot distance bucket, and season (2018 to 2020)
corr_2018_to_2020 = get_correlations_3variables(df_2018_to_2020, 'GroupShotDistance', 'Year', 'typeDescKey', 'goal')
# Preview the first rows of the result
corr_2018_to_2020.head()

### Courbe d'évolution

In [None]:
# Draw the evolution curve for the 'goal' outcome across seasons ('Year')
plot_graph_correlations(corr_2018_to_2020, 'goal', df_2018_to_2020, 'Year')

### Boxplot

In [None]:
# Remove the distance-bucket helper column from the shared frame; df_2018_to_2020 is
# concatenated with the 2016-2017 data later in the notebook.
# errors="ignore" keeps this cell re-runnable: without it a second run raises KeyError
# because the column has already been dropped in place.
df_2018_to_2020.drop(columns="GroupShotDistance", inplace=True, errors="ignore")
df_2018_to_2020.reset_index(drop=True, inplace=True)

# Boxplot of shot distance per season, split by event type ('typeDescKey')
plot_boxplot_correlations(df_2018_to_2020, 'Year', 'shotDistance', 'typeDescKey')

## Q2 - Analyse plus approfondie de la corrélation sur l'année 2020.

### Statistiques descriptives pour la distance de tir

In [None]:
# 3-variable correlation for 2020 only: shot distance bucket x shot type x event type, for the 'goal' outcome
# NOTE: this rebinds corr_2020, replacing the 2-variable result computed in the "Saison 2020" cell
corr_2020 = get_correlations_3variables(df_2020, 'GroupShotDistance', 'shotType', 'typeDescKey', 'goal')

### Graphique à barres

In [None]:
# Plot the 'goal' outcome of the 2020 table, broken down by shot type
plot_graph_correlations(q=corr_2020, modality='goal', df=df_2020, column='shotType')

### Boxplot

In [None]:
# Boxplot of shot distance per shot type for 2020, split by event type ('typeDescKey')
plot_boxplot_correlations(df_2020, 'shotType', 'shotDistance', 'typeDescKey')

# 5 - Visualisations avancées
---


In [None]:
# Seasons to analyse
years = range(start_year, end_year + 1)

# Number of games for each season
# NOTE(review): every entry is len(...) + 1 — confirm this extra game is intended and not an off-by-one
total_games_per_year = [1 + len(clean_regular_season[season]) for season in years]

# Shot coordinates of every team, keyed by season
all_team_shots_coords = {}
for year in years:
    all_team_shots_coords[year] = get_team_shots(clean_regular_season, clean_playoff, year)

In [None]:
# Example of shots mapping for the Ottawa Senators in 2018 (only the regular-season data is passed in)
plot_team_shots(clean_regular_season, 2018, 'Senators')

### 2. Calculer le taux de tir moyen par heure de la ligue par emplacement

In [None]:
# Dictionary holding, for each year, the number of shots per location per game
df_avg_shots_dict = avg_shots_per_game_per_location(years, total_games_per_year)

# Example: look up the 2018 entry and show its 10 highest 'Average Shots per Game' rows
df_avg_shots_2018 = df_avg_shots_dict[2018]
df_avg_shots_2018.sort_values(by='Average Shots per Game', ascending=False).head(10)

In [None]:
# Heatmap of the 2018 average-shots table, using 51 bins
heatmap(df_avg_shots_2018, bins=51)

In [None]:
# Build the 2016-2017 dataframe so it can be combined into a 2016-2020 dataframe
df_data_2016_2017 = convert_dictionaries_to_dataframes(clean_regular_season, clean_playoff, [2016, 2017])
# 2016-2020: stack 2016-2017 on top of the existing 2018-2020 frame, with a fresh 0..n-1 index
# NOTE: df_2018_to_2020 is used as left by the boxplot cell, which drops 'GroupShotDistance' in place
df_data_2016_2020 = pd.concat([df_data_2016_2017, df_2018_to_2020], axis=0, ignore_index=True)

# df is another name for the same object (not a copy), so the insert below also changes df_data_2016_2020
df = df_data_2016_2020
# Add a 1-based sequential play identifier as the first column
df.insert(0, 'idPlay', range(1, len(df) + 1))

### Taux de tir moyen par heure dans la ligue : 2016-2020

In [None]:
mean_shots_game_ligue = mean_shots_game_ligue(df, 'Year', 'idGame', 'idPlay')
mean_shots_game_ligue.head()

### Difference taux tir moyen des équipes par rapport à la ligue

In [None]:
mean_df = mean_shots_game_team(df, 2017, 'eventOwnerTeam', 'idGame', 'idPlay', mean_shots_game_ligue)

#### Lissage avec KDE par noyau gaussien

In [None]:
# Gaussian-kernel KDE smoothing (per the section heading) applied to the 2017 team data in mean_df
estimation_kde_noyau_gaussien(2017, mean_df)

#### 4. Afficher les données sur la zone offensive à l'aide de lissage par noyau gaussien

In [None]:
# Smoothed heatmap of the 2018 average-shots table (bandwidth 1.0, grid size 150)
smooth_heatmap(df_avg_shots_2018, bandwidth=1.0, grid_size=150)

#### 5 et 6. Graphique interactif utilisant plotly. Un graphique pour chaque saison de 2016 à 2020.

In [None]:
# Interactive smoothed heatmap for the 2016 season
interactive_smooth_heatmap(year=2016)

In [None]:
# Interactive smoothed heatmap for the 2017 season
interactive_smooth_heatmap(year=2017)

In [None]:
# Interactive smoothed heatmap for the 2018 season
interactive_smooth_heatmap(year=2018)

In [None]:
# Interactive smoothed heatmap for the 2019 season
interactive_smooth_heatmap(year=2019)

In [None]:
# Interactive smoothed heatmap for the 2020 season
interactive_smooth_heatmap(year=2020)