In [4]:
import pandas as pd
# Load one team's raw game log (path is relative to the notebooks directory)
# and print its first rows under a short label.
print('sample set from raw data')
data = pd.read_csv(filepath_or_buffer='../data/raw/20222023_Detroit_Red_Wings_game_results.csv')
print(data.head(n=5))

sample set from raw data
             home_team  pim  shots  powerPlayPercentage  powerPlayGoals  \
0  Pittsburgh Penguins   18     21                100.0             2.0   
1    Detroit Red Wings    6     22                 25.0             1.0   
2    Detroit Red Wings    6     31                  0.0             0.0   
3   Chicago Blackhawks    8     35                  0.0             0.0   
4    Detroit Red Wings    4     24                100.0             2.0   

   powerPlayOpportunities  faceOffWinPercentage  blocked  takeaways  \
0                     2.0                  55.3       15         20   
1                     4.0                  43.2       10         14   
2                     2.0                  55.6       11          2   
3                     4.0                  43.5       10         16   
4                     2.0                  54.0       16          4   

   giveaways  hits  home_goals            away_team  away_goals  
0          3    39           2 

In [5]:
# Display summary statistics (count, mean, std, min, quartiles, max) for `data`.
# NOTE(review): the recorded output below lists columns (GP, GF, GA, W, L, OL,
# Att.) that do not appear in the head() preview above -- it looks like stale
# output from a different frame; re-run from a fresh kernel to confirm.
data.describe()

Unnamed: 0,GP,GF,GA,W,L,OL,Att.
count,82.0,82.0,82.0,82.0,82.0,82.0,82.0
mean,41.5,3.719512,2.158537,32.743902,5.792683,2.963415,18189.170732
std,23.815261,1.533771,1.444184,18.106551,3.780188,2.16849,2940.841329
min,1.0,0.0,0.0,1.0,0.0,0.0,4600.0
25%,21.25,3.0,1.0,18.25,3.0,0.0,17850.0
50%,41.5,4.0,2.0,32.5,5.0,4.0,17850.0
75%,61.75,5.0,3.0,48.75,8.0,5.0,18401.75
max,82.0,7.0,7.0,65.0,12.0,5.0,39243.0


In [7]:
import os
import pandas as pd
import numpy as np

# Directories, relative to the notebooks directory: raw inputs and cleaned outputs.
csv_directory = '../data/raw/'
processed_directory = '../data/processed/'

# Columns that are rescaled from a 0-100 scale to 0-1 fractions during cleaning.
percentage_columns = ('powerPlayPercentage', 'faceOffWinPercentage')

# Ensure the processed directory exists
os.makedirs(processed_directory, exist_ok=True)


def _clean_game_results(raw_df, columns=percentage_columns):
    """Return a cleaned copy of one raw game-results frame.

    Args:
        raw_df: DataFrame as read from a raw CSV file; it is not modified.
        columns: Names of the percentage columns to rescale from 0-100 to 0-1.

    Returns:
        A new DataFrame in which each listed column that is present and whose
        maximum exceeds 1 has been divided by 100, and every remaining NaN
        (in any column) has been replaced with 0.
    """
    cleaned = raw_df.copy()
    for col in columns:
        if col not in cleaned.columns:
            # A file lacking this column used to abort the whole batch with a
            # KeyError; report it and carry on with the remaining columns.
            print(f'Column {col!r} not found; skipping percentage conversion')
            continue
        # Heuristic: a maximum above 1 means the column is still on the 0-100
        # scale. NOTE(review): a file whose values are all <= 1 percent would
        # be left unscaled -- confirm this cannot occur in the raw data.
        if cleaned[col].max() > 1:
            cleaned[col] = cleaned[col] / 100
    # Columns that are already numeric need no further coercion once NaNs are
    # filled, so a single non-mutating fillna is sufficient.
    return cleaned.fillna(0)


# Process every CSV in the raw directory; sorted so the run order (and the
# printed log) is the same on every execution.
for file_name in sorted(os.listdir(csv_directory)):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(csv_directory, file_name)
    df = _clean_game_results(pd.read_csv(file_path))

    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier '.csv' occurring inside the file name.
    processed_file_name = os.path.splitext(file_name)[0] + '_Processed.csv'
    processed_file_path = os.path.join(processed_directory, processed_file_name)
    df.to_csv(processed_file_path, index=False)

    print(f'Processed file saved to: {processed_file_path}')


Processed file saved to: ../data/processed/20222023_Calgary_Flames_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Pittsburgh_Penguins_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Tampa_Bay_Lightning_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Anaheim_Ducks_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Los_Angeles_Kings_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Washington_Capitals_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Florida_Panthers_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Dallas_Stars_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Winnipeg_Jets_game_results_Processed.csv
Processed file saved to: ../data/processed/20222023_Chicago_Blackhawks_game_results_Processed.csv
Processed file saved to: ../data/process

In [1]:
import pandas as pd
# Spot-check one processed file: print its first rows under a short label,
# then display summary statistics as the cell's result.
print('sample processed data')
data = pd.read_csv(filepath_or_buffer='../data/processed/20222023_Detroit_Red_Wings_game_results_Processed.csv')
print(data.head(n=5))
data.describe()

sample processed data
             home_team  pim  shots  powerPlayPercentage  powerPlayGoals  \
0  Pittsburgh Penguins   18     21                 1.00             2.0   
1    Detroit Red Wings    6     22                 0.25             1.0   
2    Detroit Red Wings    6     31                 0.00             0.0   
3   Chicago Blackhawks    8     35                 0.00             0.0   
4    Detroit Red Wings    4     24                 1.00             2.0   

   powerPlayOpportunities  faceOffWinPercentage  blocked  takeaways  \
0                     2.0                 0.553       15         20   
1                     4.0                 0.432       10         14   
2                     2.0                 0.556       11          2   
3                     4.0                 0.435       10         16   
4                     2.0                 0.540       16          4   

   giveaways  hits  home_goals            away_team  away_goals  
0          3    39           2    

Unnamed: 0,pim,shots,powerPlayPercentage,powerPlayGoals,powerPlayOpportunities,faceOffWinPercentage,blocked,takeaways,giveaways,hits,home_goals,away_goals
count,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0
mean,8.844444,27.888889,0.257178,0.666667,3.011111,0.484844,14.511111,5.522222,6.3,20.777778,3.155556,3.0
std,5.394361,6.385455,0.316265,0.734388,1.377952,0.076285,5.313211,3.690948,3.018482,7.734031,1.760433,1.854239
min,0.0,17.0,0.0,0.0,1.0,0.317,6.0,0.0,0.0,6.0,0.0,0.0
25%,6.0,23.0,0.0,0.0,2.0,0.4325,11.0,3.0,4.0,15.0,2.0,1.25
50%,8.0,26.5,0.225,1.0,3.0,0.485,13.0,5.0,6.0,20.0,3.0,3.0
75%,10.0,33.0,0.333,1.0,4.0,0.5305,17.0,7.0,8.75,26.5,4.0,4.0
max,35.0,46.0,1.0,3.0,8.0,0.674,31.0,20.0,14.0,39.0,8.0,8.0
