In [66]:
#import libraries
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
import re

In [2]:
# SET DIRECTORIES
# SET PROJECT ROOT
PROJECT_ROOT = Path().resolve().parent

# SET DATA DIRECTORY
DATA_DIR = PROJECT_ROOT / "data"

# SET DATA SUBDIRECTORIES
EFFICIENCY_TEMPO_DIR = DATA_DIR / "efficiencyTempoData_kenPom"
KAGGLE_DIR = DATA_DIR / "kaggleData"
POINT_DISTRIBUTION_DIR = DATA_DIR / "pointDistributionData_kenPom"
ROSTERDATA_DIR = DATA_DIR / "teamRosterData_kenPom"

# Print checks (commented out after these returned correctly)
#print(f"Project Root: {PROJECT_ROOT}")
#print(f"Data Directory: {DATA_DIR}")
#print(f"Data directory exists: {DATA_DIR.exists()}")
#print(f"Kaggle directory exists: {KAGGLE_DIR.exists()}")
#print(f"RosterData from KenPom directory exists: {ROSTERDATA_DIR.exists()}")

#### KENPOM DATASET CONCATENATION AND CLEANING

In [3]:
# List all CSV files in POINT_DISTRIBUTION_DIR and display their info

csv_files = list(POINT_DISTRIBUTION_DIR.glob("*.csv"))
for file in csv_files:
    print(f"File: {file.name}")
    df = pd.read_csv(file)
    # See column names and data types
    print(df.shape)
    print("-" * 400)

File: pointdist22.csv
(358, 14)
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
File: pointdist23.csv
(363, 14)
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
File: pointdist21.csv
(357, 14)
------------------------------------------------------------------------------------------------------

In [4]:
# Combine all CSV files in POINT_DISTRIBUTION_DIR into a single DataFrame
POINT_DISTRIBUTION_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
print("Combined DataFrame:")
print(POINT_DISTRIBUTION_df.shape)

Combined DataFrame:
(2157, 14)


In [23]:
# Clean POINT_DISTRIBUTION_df

# Rename columns

# Off_1 : ft_point_share_offense
# RankOff_1 : ft_point_share_offense_rank
# Off_2 : two_point_share_offense
# RankOff_2 : two_point_share_offense_rank
# Off_3 : three_point_share_offense
# RankOff_3 : three_point_share_offense_rank
# Def_1	: ft_point_share_defense
# RankDef_1	: ft_point_share_defense_rank
# Def_2	: two_point_share_defense
# RankDef_2	: two_point_share_defense_rank
# Def_3	: three_point_share_defense
# RankDef_3	: three_point_share_defense_rank

POINT_DISTRIBUTION_df.rename(columns={
    'Season' : 'season',
    'TeamName' : 'team_name',
    'Off_1': 'ft_point_share_offense',
    'RankOff_1': 'ft_point_share_offense_rank',
    'Off_2': 'two_point_share_offense',
    'RankOff_2': 'two_point_share_offense_rank',
    'Off_3': 'three_point_share_offense',
    'RankOff_3': 'three_point_share_offense_rank',
    'Def_1': 'ft_point_share_defense',
    'RankDef_1': 'ft_point_share_defense_rank',
    'Def_2': 'two_point_share_defense',
    'RankDef_2': 'two_point_share_defense_rank',
    'Def_3': 'three_point_share_defense',
    'RankDef_3': 'three_point_share_defense_rank'
}, inplace=True)

# Remove index column if it exists
if 'Unnamed: 0' in POINT_DISTRIBUTION_df.columns:
    POINT_DISTRIBUTION_df.drop(columns=['Unnamed: 0'], inplace=True)
# Display the cleaned DataFrame info
#print("Cleaned DataFrame:")
#print(POINT_DISTRIBUTION_df.info())


In [24]:
POINT_DISTRIBUTION_df

Unnamed: 0,season,team_name,ft_point_share_offense,ft_point_share_offense_rank,two_point_share_offense,two_point_share_offense_rank,three_point_share_offense,three_point_share_offense_rank,ft_point_share_defense,ft_point_share_defense_rank,two_point_share_defense,two_point_share_defense_rank,three_point_share_defense,three_point_share_defense_rank
0,2022,Abilene Christian,23.307953,3,48.900169,241,27.791878,263,25.438202,2,44.764045,342,29.797753,232
1,2022,Air Force,14.602804,331,48.948598,239,36.448598,59,22.297650,21,54.203655,68,23.498695,350
2,2022,Akron,20.626988,44,46.251704,304,33.121308,119,15.835777,285,52.492669,123,31.671554,157
3,2022,Alabama,18.909645,116,46.206634,306,34.883721,82,19.976219,74,50.891795,175,29.131986,252
4,2022,Alabama A&M,22.463375,10,58.491590,16,19.045035,356,17.462687,199,49.253731,239,33.283582,105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2152,2025,Wright St.,14.388186,353,52.320675,91,33.291139,123,18.594010,206,51.081531,116,30.324459,234
2153,2025,Wyoming,16.462518,311,48.701617,208,34.835865,87,22.399635,33,49.270073,206,28.330292,302
2154,2025,Xavier,21.121988,66,46.686747,272,32.191265,160,16.967954,296,51.602301,101,31.429745,175
2155,2025,Yale,17.571612,266,54.467721,46,27.960667,283,18.239300,239,41.926070,357,39.834630,5


In [7]:
# Combine all CSV Files in TEAM_ROSTER_DIR into a single DataFrame
csv_files = list(ROSTERDATA_DIR.glob("*.csv"))
TEAMROSTERDATA_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
print(f"Combined TEAMROSTERDATA DataFrame: {TEAMROSTERDATA_df.shape}")
print(f"Seasons in dataset: {TEAMROSTERDATA_df['Season'].unique()}")
TEAMROSTERDATA_df.head(3)


Combined TEAMROSTERDATA DataFrame: (2157, 52)
Seasons in dataset: [2019 2025 2024 2021 2023 2022]


Unnamed: 0,Season,TeamName,Size,SizeRank,Hgt5,Hgt5Rank,Hgt4,Hgt4Rank,Hgt3,Hgt3Rank,...,DR4,DR4Rank,DR3,DR3Rank,DR2,DR2Rank,DR1,DR1Rank,Continuity,RankContinuity
0,2019,Abilene Christian,75.89,299,-0.87,271,-0.71,260,-0.93,295,...,23.1,220,18.34,203,18.71,70,15.46,104,,
1,2019,Air Force,77.15,126,-2.01,340,0.19,145,0.33,113,...,23.29,216,21.18,89,19.24,61,14.24,150,,
2,2019,Akron,76.31,249,0.27,128,-0.96,287,0.2,125,...,20.9,277,22.85,52,19.29,58,13.23,211,,


In [8]:
print(TEAMROSTERDATA_df.columns)

Index(['Season', 'TeamName', 'Size', 'SizeRank', 'Hgt5', 'Hgt5Rank', 'Hgt4',
       'Hgt4Rank', 'Hgt3', 'Hgt3Rank', 'Hgt2', 'Hgt2Rank', 'Hgt1', 'Hgt1Rank',
       'HgtEff', 'HgtEffRank', 'Exp', 'ExpRank', 'Bench', 'BenchRank', 'Pts5',
       'Pts5Rank', 'Pts4', 'Pts4Rank', 'Pts3', 'Pts3Rank', 'Pts2', 'Pts2Rank',
       'Pts1', 'Pts1Rank', 'OR5', 'OR5Rank', 'OR4', 'OR4Rank', 'OR3',
       'OR3Rank', 'OR2', 'OR2Rank', 'OR1', 'OR1Rank', 'DR5', 'DR5Rank', 'DR4',
       'DR4Rank', 'DR3', 'DR3Rank', 'DR2', 'DR2Rank', 'DR1', 'DR1Rank',
       'Continuity', 'RankContinuity'],
      dtype='object')


In [9]:
# Drop unnecessary columns that have missing data
TEAMROSTERDATA_df.drop(columns=['Continuity', 'RankContinuity'], inplace=True)

In [10]:
# Rename columns in TEAMROSTERDATA_df
TEAMROSTERDATA_df.rename(columns={
    'Season': 'season',
    'TeamName': 'team_name',
    'Size': 'avgHeightOnCourt_MinutesWeighted',
    'SizeRank': 'avgHeightOnCourt_MinutesWeighted_Rank',
    'Hgt5': 'avgCenterHeight',
    'Hgt5Rank': 'avgCenterHeight_Rank',
    'Hgt4': 'avgPowerForwardHeight',
    'Hgt4Rank': 'avgPowerForwardHeight_Rank', 
    'Hgt3': 'avgSmallForwardHeight',
    'Hgt3Rank': 'avgSmallForwardHeight_Rank', 
    'Hgt2': 'avgShootingGuardHeight',
    'Hgt2Rank': 'avgShootingGuardHeight_Rank',
    'Hgt1': 'avgPointGuardHeight',
    'Hgt1Rank': 'avgPointGuardHeight_Rank',
    'HgtEff': 'heightEfficiencyRating',
    'HgtEffRank': 'heightEfficiencyRating_Rank',
    'Exp': 'avgYearsOfExperience',
    'ExpRank': 'avgYearsOfExperience_Rank',
    'Bench': 'avgBenchPoints',
    'BenchRank': 'avgBenchPoints_Rank',
    'Pts5': 'avgCenterPoints',
    'Pts5Rank': 'avgCenterPoints_Rank',
    'Pts4': 'avgPowerForwardPoints',
    'Pts4Rank': 'avgPowerForwardPoints_Rank',
    'Pts3': 'avgSmallForwardPoints',
    'Pts3Rank': 'avgSmallForwardPoints_Rank',
    'Pts2': 'avgShootingGuardPoints',
    'Pts2Rank': 'avgShootingGuardPoints_Rank',
    'Pts1': 'avgPointGuardPoints',
    'Pts1Rank': 'avgPointGuardPoints_Rank',
    'OR5': 'avgCenterOffensiveRebounds',
    'OR5Rank': 'avgCenterOffensiveRebounds_Rank',
    'OR4': 'avgPowerForwardOffensiveRebounds',
    'OR4Rank': 'avgPowerForwardOffensiveRebounds_Rank',
    'OR3': 'avgSmallForwardOffensiveRebounds',
    'OR3Rank': 'avgSmallForwardOffensiveRebounds_Rank',
    'OR2': 'avgShootingGuardOffensiveRebounds',
    'OR2Rank': 'avgShootingGuardOffensiveRebounds_Rank',
    'OR1': 'avgPointGuardOffensiveRebounds',
    'OR1Rank': 'avgPointGuardOffensiveRebounds_Rank',
    'DR5': 'avgCenterDefensiveRebounds',
    'DR5Rank': 'avgCenterDefensiveRebounds_Rank',
    'DR4': 'avgPowerForwardDefensiveRebounds',
    'DR4Rank': 'avgPowerForwardDefensiveRebounds_Rank',
    'DR3': 'avgSmallForwardDefensiveRebounds',
    'DR3Rank': 'avgSmallForwardDefensiveRebounds_Rank',
    'DR2': 'avgShootingGuardDefensiveRebounds',
    'DR2Rank': 'avgShootingGuardDefensiveRebounds_Rank',
    'DR1': 'avgPointGuardDefensiveRebounds',
    'DR1Rank': 'avgPointGuardDefensiveRebounds_Rank',
}, inplace=True)

TEAMROSTERDATA_df

Unnamed: 0,season,team_name,avgHeightOnCourt_MinutesWeighted,avgHeightOnCourt_MinutesWeighted_Rank,avgCenterHeight,avgCenterHeight_Rank,avgPowerForwardHeight,avgPowerForwardHeight_Rank,avgSmallForwardHeight,avgSmallForwardHeight_Rank,...,avgCenterDefensiveRebounds,avgCenterDefensiveRebounds_Rank,avgPowerForwardDefensiveRebounds,avgPowerForwardDefensiveRebounds_Rank,avgSmallForwardDefensiveRebounds,avgSmallForwardDefensiveRebounds_Rank,avgShootingGuardDefensiveRebounds,avgShootingGuardDefensiveRebounds_Rank,avgPointGuardDefensiveRebounds,avgPointGuardDefensiveRebounds_Rank
0,2019,Abilene Christian,75.89,299,-0.87,271,-0.71,260,-0.93,295,...,24.34,228,23.10,220,18.34,203,18.71,70,15.46,104
1,2019,Air Force,77.15,126,-2.01,340,0.19,145,0.33,113,...,22.00,292,23.29,216,21.18,89,19.24,61,14.24,150
2,2019,Akron,76.31,249,0.27,128,-0.96,287,0.20,125,...,23.66,253,20.90,277,22.85,52,19.29,58,13.23,211
3,2019,Alabama,77.88,39,-0.01,161,0.69,97,1.08,65,...,25.70,190,23.75,198,19.68,144,19.72,46,11.09,312
4,2019,Alabama A&M,75.77,313,-1.47,322,-0.55,240,-0.04,174,...,25.10,208,22.03,243,19.85,134,18.71,71,14.25,148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2152,2022,Wright St.,76.96,170,-0.16,177,-0.19,207,1.07,50,...,27.71,105,21.01,269,21.87,81,15.96,208,13.43,212
2153,2022,Wyoming,78.72,8,0.12,153,1.00,61,2.07,11,...,28.58,90,18.91,312,17.96,218,20.15,56,14.33,172
2154,2022,Xavier,78.37,23,2.06,23,1.01,60,0.63,94,...,27.96,102,22.86,195,20.90,112,15.33,231,12.94,238
2155,2022,Yale,76.47,249,-1.69,342,-1.75,347,-0.01,182,...,22.14,292,22.35,219,16.43,281,21.93,24,17.16,61


For further information on how some of the metrics in the TEAMROSTERDATA_df were calculated, please visit https://kenpom.com/blog/height-and-other-stuff-page/


In [16]:
# Combine all CSV Files in EFFICIENCY_TEMPO_DIR into a single DataFrame
def load_efficiency_tempo_data(directory: Path):
    all_dfs = []

    for file in directory.glob("*.csv"):
        # Extract season from filename (e.g., "efficiency_tempo_2019.csv" → 2019)
        season = int(file.stem.split("_")[-1])

        df = pd.read_csv(file)
        df["season"] = season  # Add Season column
        all_dfs.append(df)

    combined_df = pd.concat(all_dfs, ignore_index=True)
    return combined_df

EFFICIENCY_TEMPO_df = load_efficiency_tempo_data(EFFICIENCY_TEMPO_DIR)
print(f"Seasons in dataset: {EFFICIENCY_TEMPO_df['season'].unique()}")
EFFICIENCY_TEMPO_df.head(3)

Seasons in dataset: [2021 2023 2022 2025 2019 2024]


Unnamed: 0,Team,Conf,Adjusted Tempo,Adjusted Tempo Rank,Raw Tempo,Raw Tempo Rank,Offensive Average Possession Length (seconds),Offensive Average Possession Length Rank,Defensive Average Possession Length (seconds),Defensive Average Possession Length Rank,Adjusted Offensive Efficiency,Adjusted Offensive Efficiency Rank,Raw Offensive Efficiency,Adjusted Offensive Rank,Adjusted Defensive Efficiency,Adjusted Defensive Efficiency Rank,Raw Defensive Efficiency,Raw Defensive Efficiency Rank,season
0,Coppin St.,MEAC,76.8,1,79.4,1,14.0,1,16.1,13,91.0,334,91.6,324,102.6,183,100.2,135,2021
1,Eastern Kentucky,OVC,74.8,2,75.6,5,15.4,16,16.1,17,101.0,197,104.5,100,101.5,160,97.3,67,2021
2,South Carolina,SEC,74.6,3,75.3,7,15.2,12,16.7,64,103.9,139,96.3,272,98.9,113,105.1,260,2021


In [72]:
# Rename columns in EFFICIENCY_TEMPO_df
EFFICIENCY_TEMPO_df.rename(columns={
    'team': 'team_name',
    'Conf': 'conference',
    'Adjusted Tempo': 'adjusted_tempo_rating',
    'Adjusted Tempo Rank': 'adjusted_tempo__rating_rank',
    'Raw Tempo': 'raw_tempo_rating',
    'Raw Tempo Rank': 'raw_tempo_rating_rank',
    'Offensive Average Possession Length (seconds)': 'offensive_avg_possession_length',
    'Offensive Average Possession Length Rank': 'offensive_avg_possession_length_rank',
    'Defensive Average Possession Length (seconds)': 'defensive_avg_possession_length',
    'Defensive Average Possession Length Rank': 'defensive_avg_possession_length_rank',
    'Adjusted Offensive Efficiency': 'adjusted_offensive_efficiency_rating',
    'Adjusted Offensive Efficiency Rank': 'adjusted_offensive_efficiency_rating_rank',
    'Raw Offensive Efficiency': 'raw_offensive_efficiency_rating',
    'Adjusted Offensive Rank': 'adjusted_offensive_rating_rank',
    'Adjusted Defensive Efficiency': 'adjusted_defensive_efficiency_rating',
    'Adjusted Defensive Efficiency Rank': 'adjusted_defensive_efficiency_rating_rank',
    'Raw Defensive Efficiency': 'raw_defensive_efficiency_rating',
    'Raw Defensive Efficiency Rank': 'raw_defensive_efficiency_rating_rank',
    'season': 'season'
}, inplace=True)

In [73]:
EFFICIENCY_TEMPO_df

Unnamed: 0,team_name,conference,adjusted_tempo_rating,adjusted_tempo__rating_rank,raw_tempo_rating,raw_tempo_rating_rank,offensive_avg_possession_length,offensive_avg_possession_length_rank,defensive_avg_possession_length,defensive_avg_possession_length_rank,adjusted_offensive_efficiency_rating,adjusted_offensive_efficiency_rating_rank,raw_offensive_efficiency_rating,adjusted_offensive_rating_rank,adjusted_defensive_efficiency_rating,adjusted_defensive_efficiency_rating_rank,raw_defensive_efficiency_rating,raw_defensive_efficiency_rating_rank,season
0,Coppin St.,MEAC,76.8,1,79.4,1,14.0,1,16.1,13,91.0,334,91.6,324,102.6,183,100.2,135,2021
1,Eastern Kentucky,OVC,74.8,2,75.6,5,15.4,16,16.1,17,101.0,197,104.5,100,101.5,160,97.3,67,2021
2,South Carolina,SEC,74.6,3,75.3,7,15.2,12,16.7,64,103.9,139,96.3,272,98.9,113,105.1,260,2021
3,The Citadel,SC,74.2,4,74.5,11,15.5,18,16.7,62,104.0,136,104.0,110,109.1,305,110.5,328,2021
4,Monmouth,MAAC,74.2,5,74.4,12,14.6,4,17.8,271,99.0,237,100.4,187,105.1,229,98.3,85,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2148,Towson,CAA,62.3,358,62.9,358,20.2,357,17.8,262,103.5,228,104.4,201,101.7,90,100.8,68,2024
2149,Air Force,MWC,61.9,359,63.4,355,20.5,362,17.2,95,105.9,181,103.5,219,113.4,327,114.0,346,2024
2150,North Texas,Amer,61.8,360,62.9,360,20.3,360,17.6,196,110.3,102,108.3,111,99.4,52,101.6,83,2024
2151,Wagner 16,NEC,61.6,361,62.7,361,20.2,358,17.9,270,96.4,330,97.8,311,106.4,177,100.9,69,2024


In [74]:
# 1. Identify rows where team_name ends with a space + digits
mask = EFFICIENCY_TEMPO_df['team_name'].str.match(r'.*\s+\d+$')

print(f"Rows with embedded seeds: {mask.sum()}")

# 2. Remove trailing seeds (space + digits) for those rows
EFFICIENCY_TEMPO_df.loc[mask, 'team_name'] = EFFICIENCY_TEMPO_df.loc[mask, 'team_name'] \
    .str.replace(r'\s+\d+$', '', regex=True)

# 3. (Optional) Verify no more trailing digits
mask2 = EFFICIENCY_TEMPO_df['team_name'].str.match(r'.*\s+\d+$')
print(f"Rows still containing seeds: {mask2.sum()}")

Rows with embedded seeds: 407
Rows still containing seeds: 0


In [76]:
EFFICIENCY_TEMPO_df.columns

Index(['team_name', 'conference', 'adjusted_tempo_rating',
       'adjusted_tempo__rating_rank', 'raw_tempo_rating',
       'raw_tempo_rating_rank', 'offensive_avg_possession_length',
       'offensive_avg_possession_length_rank',
       'defensive_avg_possession_length',
       'defensive_avg_possession_length_rank',
       'adjusted_offensive_efficiency_rating',
       'adjusted_offensive_efficiency_rating_rank',
       'raw_offensive_efficiency_rating', 'adjusted_offensive_rating_rank',
       'adjusted_defensive_efficiency_rating',
       'adjusted_defensive_efficiency_rating_rank',
       'raw_defensive_efficiency_rating',
       'raw_defensive_efficiency_rating_rank', 'season'],
      dtype='object')

In [77]:
['team_name', 'conference', 'adjusted_tempo_rating',
       'adjusted_tempo__rating_rank', 'raw_tempo_rating',
       'raw_tempo_rating_rank', 'offensive_avg_possession_length',
       'offensive_avg_possession_length_rank',
       'defensive_avg_possession_length',
       'defensive_avg_possession_length_rank',
       'adjusted_offensive_efficiency_rating',
       'adjusted_offensive_efficiency_rating_rank',
       'raw_offensive_efficiency_rating', 'adjusted_offensive_rating_rank',
       'adjusted_defensive_efficiency_rating',
       'adjusted_defensive_efficiency_rating_rank',
       'raw_defensive_efficiency_rating',
       'raw_defensive_efficiency_rating_rank', 'season']
# Reorder and select
cols = ["season", "team_name", "conference", "adjusted_tempo_rating",
        "adjusted_tempo__rating_rank", "raw_tempo_rating",
        "raw_tempo_rating_rank", "offensive_avg_possession_length",
        "offensive_avg_possession_length_rank",
        "defensive_avg_possession_length",
        "defensive_avg_possession_length_rank",
        "adjusted_offensive_efficiency_rating",
        "adjusted_offensive_efficiency_rating_rank",
        "raw_offensive_efficiency_rating",
        "adjusted_offensive_rating_rank",
        "adjusted_defensive_efficiency_rating",
        "adjusted_defensive_efficiency_rating_rank",
        "raw_defensive_efficiency_rating",
        "raw_defensive_efficiency_rating_rank"]
EFFICIENCY_TEMPO_df = EFFICIENCY_TEMPO_df[cols]

In [78]:
EFFICIENCY_TEMPO_df

Unnamed: 0,season,team_name,conference,adjusted_tempo_rating,adjusted_tempo__rating_rank,raw_tempo_rating,raw_tempo_rating_rank,offensive_avg_possession_length,offensive_avg_possession_length_rank,defensive_avg_possession_length,defensive_avg_possession_length_rank,adjusted_offensive_efficiency_rating,adjusted_offensive_efficiency_rating_rank,raw_offensive_efficiency_rating,adjusted_offensive_rating_rank,adjusted_defensive_efficiency_rating,adjusted_defensive_efficiency_rating_rank,raw_defensive_efficiency_rating,raw_defensive_efficiency_rating_rank
0,2021,Coppin St.,MEAC,76.8,1,79.4,1,14.0,1,16.1,13,91.0,334,91.6,324,102.6,183,100.2,135
1,2021,Eastern Kentucky,OVC,74.8,2,75.6,5,15.4,16,16.1,17,101.0,197,104.5,100,101.5,160,97.3,67
2,2021,South Carolina,SEC,74.6,3,75.3,7,15.2,12,16.7,64,103.9,139,96.3,272,98.9,113,105.1,260
3,2021,The Citadel,SC,74.2,4,74.5,11,15.5,18,16.7,62,104.0,136,104.0,110,109.1,305,110.5,328
4,2021,Monmouth,MAAC,74.2,5,74.4,12,14.6,4,17.8,271,99.0,237,100.4,187,105.1,229,98.3,85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2148,2024,Towson,CAA,62.3,358,62.9,358,20.2,357,17.8,262,103.5,228,104.4,201,101.7,90,100.8,68
2149,2024,Air Force,MWC,61.9,359,63.4,355,20.5,362,17.2,95,105.9,181,103.5,219,113.4,327,114.0,346
2150,2024,North Texas,Amer,61.8,360,62.9,360,20.3,360,17.6,196,110.3,102,108.3,111,99.4,52,101.6,83
2151,2024,Wagner,NEC,61.6,361,62.7,361,20.2,358,17.9,270,96.4,330,97.8,311,106.4,177,100.9,69


Ensure all column names across all datasets have been standardized, make sure there are no null/missing values, ensure accurate datatypes and dataset shape.

In [32]:
print(EFFICIENCY_TEMPO_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2153 entries, 0 to 2152
Data columns (total 19 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   team                                       2153 non-null   object 
 1   conference                                 2153 non-null   object 
 2   adjusted_tempo_rating                      2153 non-null   float64
 3   adjusted_tempo__rating_rank                2153 non-null   int64  
 4   raw_tempo_rating                           2153 non-null   float64
 5   raw_tempo_rating_rank                      2153 non-null   int64  
 6   offensive_avg_possession_length            2153 non-null   float64
 7   offensive_avg_possession_length_rank       2153 non-null   int64  
 8   defensive_avg_possession_length            2153 non-null   float64
 9   defensive_avg_possession_length_rank       2153 non-null   int64  
 10  adjusted_offensive_effic

In [28]:
print(POINT_DISTRIBUTION_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2157 entries, 0 to 2156
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season                          2157 non-null   int64  
 1   team_name                       2157 non-null   object 
 2   ft_point_share_offense          2157 non-null   float64
 3   ft_point_share_offense_rank     2157 non-null   int64  
 4   two_point_share_offense         2157 non-null   float64
 5   two_point_share_offense_rank    2157 non-null   int64  
 6   three_point_share_offense       2157 non-null   float64
 7   three_point_share_offense_rank  2157 non-null   int64  
 8   ft_point_share_defense          2157 non-null   float64
 9   ft_point_share_defense_rank     2157 non-null   int64  
 10  two_point_share_defense         2157 non-null   float64
 11  two_point_share_defense_rank    2157 non-null   int64  
 12  three_point_share_defense       21

In [27]:
print(TEAMROSTERDATA_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2157 entries, 0 to 2156
Data columns (total 50 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   season                                  2157 non-null   int64  
 1   team_name                               2157 non-null   object 
 2   avgHeightOnCourt_MinutesWeighted        2157 non-null   float64
 3   avgHeightOnCourt_MinutesWeighted_Rank   2157 non-null   int64  
 4   avgCenterHeight                         2157 non-null   float64
 5   avgCenterHeight_Rank                    2157 non-null   int64  
 6   avgPowerForwardHeight                   2157 non-null   float64
 7   avgPowerForwardHeight_Rank              2157 non-null   int64  
 8   avgSmallForwardHeight                   2157 non-null   float64
 9   avgSmallForwardHeight_Rank              2157 non-null   int64  
 10  avgShootingGuardHeight                  2157 non-null   floa

In [80]:
# Assign cleaned data directory
CLEANED_DIR = DATA_DIR / "cleaned"

# Order all kenPom DataFrames by season and team_name
TEAMROSTERDATA_df.sort_values(by=["season", "team_name"], inplace=True)
POINT_DISTRIBUTION_df.sort_values(by=["season", "team_name"], inplace=True)
EFFICIENCY_TEMPO_df.sort_values(by=["season", "team_name"], inplace=True)

# Export all kenPom DataFrames to CSV files
TEAMROSTERDATA_df.to_csv(CLEANED_DIR / "team_roster_data.csv", index=False)
POINT_DISTRIBUTION_df.to_csv(CLEANED_DIR / "point_distribution_data.csv", index=False)
EFFICIENCY_TEMPO_df.to_csv(CLEANED_DIR / "efficiency_tempo_data.csv", index=False)

KAGGLE DATA CONCATENATION + CLEANING

In [44]:
teamCoaches_df = pd.read_csv(KAGGLE_DIR / "MTeamCoaches.csv")
teams_df = pd.read_csv(KAGGLE_DIR / "MTeams.csv")
regularSeasonDetailedResults_df = pd.read_csv(KAGGLE_DIR / "MRegularSeasonDetailedResults.csv")
team_spellings_df = pd.read_csv(KAGGLE_DIR / "MTeamSpellings.csv")
teamConferences_df = pd.read_csv(KAGGLE_DIR / "MTeamConferences.csv")

In [41]:
regularSeasonDetailedResults_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118882 entries, 0 to 118881
Data columns (total 34 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Season   118882 non-null  int64 
 1   DayNum   118882 non-null  int64 
 2   WTeamID  118882 non-null  int64 
 3   WScore   118882 non-null  int64 
 4   LTeamID  118882 non-null  int64 
 5   LScore   118882 non-null  int64 
 6   WLoc     118882 non-null  object
 7   NumOT    118882 non-null  int64 
 8   WFGM     118882 non-null  int64 
 9   WFGA     118882 non-null  int64 
 10  WFGM3    118882 non-null  int64 
 11  WFGA3    118882 non-null  int64 
 12  WFTM     118882 non-null  int64 
 13  WFTA     118882 non-null  int64 
 14  WOR      118882 non-null  int64 
 15  WDR      118882 non-null  int64 
 16  WAst     118882 non-null  int64 
 17  WTO      118882 non-null  int64 
 18  WStl     118882 non-null  int64 
 19  WBlk     118882 non-null  int64 
 20  WPF      118882 non-null  int64 
 21  LFGM     1

In [45]:
#Remove all rows with Season less than 2019 from teamCoaches_df and regularSeasonDetailedResults_df and teamConferences_df
teamCoaches_df = teamCoaches_df[teamCoaches_df['Season'] >= 2019]
regularSeasonDetailedResults_df = regularSeasonDetailedResults_df[regularSeasonDetailedResults_df['Season'] >= 2019]
teamConferences_df = teamConferences_df[teamConferences_df['Season'] >= 2019]
#Remove all rows with where Season is 2020 from teamCoaches_df and regularSeasonDetailedResults_df and teamConferences_df
teamCoaches_df = teamCoaches_df[teamCoaches_df['Season'] != 2020]
regularSeasonDetailedResults_df = regularSeasonDetailedResults_df[regularSeasonDetailedResults_df['Season'] != 2020]
teamConferences_df = teamConferences_df[teamConferences_df['Season'] != 2020]
regularSeasonDetailedResults_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31513 entries, 82041 to 118881
Data columns (total 34 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Season   31513 non-null  int64 
 1   DayNum   31513 non-null  int64 
 2   WTeamID  31513 non-null  int64 
 3   WScore   31513 non-null  int64 
 4   LTeamID  31513 non-null  int64 
 5   LScore   31513 non-null  int64 
 6   WLoc     31513 non-null  object
 7   NumOT    31513 non-null  int64 
 8   WFGM     31513 non-null  int64 
 9   WFGA     31513 non-null  int64 
 10  WFGM3    31513 non-null  int64 
 11  WFGA3    31513 non-null  int64 
 12  WFTM     31513 non-null  int64 
 13  WFTA     31513 non-null  int64 
 14  WOR      31513 non-null  int64 
 15  WDR      31513 non-null  int64 
 16  WAst     31513 non-null  int64 
 17  WTO      31513 non-null  int64 
 18  WStl     31513 non-null  int64 
 19  WBlk     31513 non-null  int64 
 20  WPF      31513 non-null  int64 
 21  LFGM     31513 non-null  int64

In [39]:
teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   TeamID         380 non-null    int64 
 1   TeamName       380 non-null    object
 2   FirstD1Season  380 non-null    int64 
 3   LastD1Season   380 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 12.0+ KB


In [56]:
# Break down regularSeasonDetailedResults_df from game by game to season by season

# Separate wins and losses into two DataFrames
w = regularSeasonDetailedResults_df.rename(columns={"WTeamID":"TeamID", "WScore":"PtsFor", "LScore":"PtsAg", "WLoc":"Loc", "WFGM":"FGM","WFGA":"FGA","WFGM3":"FGM3","WFGA3":"FGA3","WFTM":"FTM","WFTA":"FTA","WOR":"ORB","WDR":"DRB","WAst":"AST","WTO":"TO","WStl":"STL","WBlk":"BLK","WPF":"PF"})
w["Win"] = 1
l = regularSeasonDetailedResults_df.rename(columns={"LTeamID":"TeamID", "LScore":"PtsFor", "WScore":"PtsAg", "LFGM":"FGM","LFGA":"FGA","LFGM3":"FGM3","LFGA3":"FGA3","LFTM":"FTM","LFTA":"FTA","LOR":"ORB","LDR":"DRB","LAst":"AST","LTO":"TO","LStl":"STL","LBlk":"BLK","LPF":"PF"})
l["Win"] = 0

games = pd.concat([w[["Season","TeamID","PtsFor","PtsAg","FGM","FGA","FGM3","FGA3","FTM","FTA","ORB","DRB","AST","TO","STL","BLK","PF","Win"]], l[["Season","TeamID","PtsFor","PtsAg","FGM","FGA","FGM3","FGA3","FTM","FTA","ORB","DRB","AST","TO","STL","BLK","PF","Win"]]])
teamSeasons_df = games.groupby(["Season","TeamID"]).agg(
    games_played=("Win","count"),
    wins=("Win","sum"),
    avg_pts_for=("PtsFor","mean"),
    avg_pts_ag=("PtsAg","mean"),
    avg_fgm=("FGM","mean"), avg_fga=("FGA","mean"),
    avg_fgm3=("FGM3","mean"), avg_fga3=("FGA3","mean"),
    avg_ftm=("FTM","mean"), avg_fta=("FTA","mean"),
    avg_orb=("ORB","mean"), avg_drb=("DRB","mean"),
    avg_ast=("AST","mean"), avg_to=("TO","mean"),
    avg_stl=("STL","mean"), avg_blk=("BLK","mean"), avg_pf=("PF","mean")
).reset_index()
teamSeasons_df["win_pct"] = teamSeasons_df["wins"]/teamSeasons_df["games_played"]

teamSeasons_df

Unnamed: 0,Season,TeamID,games_played,wins,avg_pts_for,avg_pts_ag,avg_fgm,avg_fga,avg_fgm3,avg_fga3,avg_ftm,avg_fta,avg_orb,avg_drb,avg_ast,avg_to,avg_stl,avg_blk,avg_pf,win_pct
0,2019,1101,29,23,71.724138,64.896552,25.344828,55.241379,7.241379,18.896552,13.793103,19.068966,9.068966,22.896552,14.620690,11.655172,8.000000,2.551724,19.137931,0.793103
1,2019,1102,31,13,67.387097,71.903226,24.741935,55.322581,7.258065,22.419355,10.645161,15.645161,7.741935,25.548387,13.290323,13.193548,4.806452,1.806452,16.967742,0.419355
2,2019,1103,31,15,68.354839,64.580645,23.935484,58.870968,8.870968,28.000000,11.612903,16.290323,9.322581,26.806452,11.935484,11.903226,5.612903,3.129032,17.483871,0.483871
3,2019,1104,33,18,71.787879,71.303030,25.060606,56.848485,7.060606,20.848485,14.606061,21.878788,11.151515,26.424242,12.151515,13.606061,4.606061,4.666667,16.969697,0.545455
4,2019,1105,32,5,60.562500,71.406250,23.000000,56.468750,5.687500,18.000000,8.875000,14.156250,10.031250,22.000000,12.000000,15.218750,7.312500,1.531250,18.343750,0.156250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2142,2025,1476,30,13,67.333333,70.900000,23.300000,53.966667,8.500000,24.000000,12.233333,16.966667,7.166667,22.400000,14.066667,11.300000,5.966667,2.600000,16.733333,0.433333
2143,2025,1477,31,5,64.354839,74.870968,23.000000,55.290323,8.387097,26.709677,9.967742,15.483871,7.935484,19.000000,14.161290,14.612903,8.387097,3.903226,16.774194,0.161290
2144,2025,1478,30,7,71.933333,81.400000,24.800000,55.400000,7.500000,22.900000,14.833333,20.866667,7.466667,21.666667,13.466667,12.933333,6.666667,2.066667,19.300000,0.233333
2145,2025,1479,28,12,65.785714,71.750000,22.678571,53.821429,7.000000,19.607143,13.428571,16.642857,6.214286,18.214286,13.107143,9.750000,6.607143,1.892857,16.678571,0.428571


In [57]:
# Compute coach change flag
coach_counts = teamCoaches_df.groupby(["Season","TeamID"])["CoachName"].nunique().reset_index(name="n_coaches")
coach_counts["coach_change"] = (coach_counts["n_coaches"] > 1).astype(int)

In [None]:
# Merge coach change data into teamSeasons_df on TeamID and Season
teamSeasons_df = teamSeasons_df.merge(coach_counts[['Season', 'TeamID', 'coach_change']], on=["Season", "TeamID"], how="left")
teamSeasons_df

In [60]:
# Merge conference data into teamSeasons_df on TeamID and Season
teamSeasons_df = teamSeasons_df.merge(teamConferences_df[['Season', 'TeamID', 'ConfAbbrev']], on=["Season", "TeamID"], how="left")
teamSeasons_df = teamSeasons_df.rename(columns={"ConfAbbrev":"conference"})
teamSeasons_df

Unnamed: 0,Season,TeamID,games_played,wins,avg_pts_for,avg_pts_ag,avg_fgm,avg_fga,avg_fgm3,avg_fga3,...,avg_orb,avg_drb,avg_ast,avg_to,avg_stl,avg_blk,avg_pf,win_pct,coach_change,conference
0,2019,1101,29,23,71.724138,64.896552,25.344828,55.241379,7.241379,18.896552,...,9.068966,22.896552,14.620690,11.655172,8.000000,2.551724,19.137931,0.793103,0,southland
1,2019,1102,31,13,67.387097,71.903226,24.741935,55.322581,7.258065,22.419355,...,7.741935,25.548387,13.290323,13.193548,4.806452,1.806452,16.967742,0.419355,0,mwc
2,2019,1103,31,15,68.354839,64.580645,23.935484,58.870968,8.870968,28.000000,...,9.322581,26.806452,11.935484,11.903226,5.612903,3.129032,17.483871,0.483871,0,mac
3,2019,1104,33,18,71.787879,71.303030,25.060606,56.848485,7.060606,20.848485,...,11.151515,26.424242,12.151515,13.606061,4.606061,4.666667,16.969697,0.545455,0,sec
4,2019,1105,32,5,60.562500,71.406250,23.000000,56.468750,5.687500,18.000000,...,10.031250,22.000000,12.000000,15.218750,7.312500,1.531250,18.343750,0.156250,0,swac
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2142,2025,1476,30,13,67.333333,70.900000,23.300000,53.966667,8.500000,24.000000,...,7.166667,22.400000,14.066667,11.300000,5.966667,2.600000,16.733333,0.433333,0,nec
2143,2025,1477,31,5,64.354839,74.870968,23.000000,55.290323,8.387097,26.709677,...,7.935484,19.000000,14.161290,14.612903,8.387097,3.903226,16.774194,0.161290,0,southland
2144,2025,1478,30,7,71.933333,81.400000,24.800000,55.400000,7.500000,22.900000,...,7.466667,21.666667,13.466667,12.933333,6.666667,2.066667,19.300000,0.233333,0,nec
2145,2025,1479,28,12,65.785714,71.750000,22.678571,53.821429,7.000000,19.607143,...,6.214286,18.214286,13.107143,9.750000,6.607143,1.892857,16.678571,0.428571,0,nec


In [61]:
teamSeasons_df.columns

Index(['Season', 'TeamID', 'games_played', 'wins', 'avg_pts_for', 'avg_pts_ag',
       'avg_fgm', 'avg_fga', 'avg_fgm3', 'avg_fga3', 'avg_ftm', 'avg_fta',
       'avg_orb', 'avg_drb', 'avg_ast', 'avg_to', 'avg_stl', 'avg_blk',
       'avg_pf', 'win_pct', 'coach_change', 'conference'],
      dtype='object')

In [62]:
# Add team name to teamSeasons_df
teamSeasons_df = teamSeasons_df.merge(teams_df[["TeamID","TeamName"]], on="TeamID", how="left")
teamSeasons_df = teamSeasons_df.rename(columns={"TeamName":"team_name"})

In [64]:
teamSeasons_df.columns

Index(['Season', 'TeamID', 'games_played', 'wins', 'avg_pts_for', 'avg_pts_ag',
       'avg_fgm', 'avg_fga', 'avg_fgm3', 'avg_fga3', 'avg_ftm', 'avg_fta',
       'avg_orb', 'avg_drb', 'avg_ast', 'avg_to', 'avg_stl', 'avg_blk',
       'avg_pf', 'win_pct', 'coach_change', 'conference', 'team_name'],
      dtype='object')

In [65]:
# Rename and Standardize column names for teamSeasons_df
teamSeasons_df = teamSeasons_df.rename(columns={"Season" : "season", "TeamID" : "team_id"})


cols = ["season","team_id","team_name","conference","coach_change",
        "games_played","wins","win_pct","avg_fgm","avg_fga",
        "avg_fgm3","avg_fga3","avg_ftm","avg_fta",
        "avg_orb","avg_drb","avg_ast","avg_to",
        "avg_stl","avg_blk","avg_pf","avg_pts_for","avg_pts_ag"]
teamSeasons_df = teamSeasons_df[cols]
teamSeasons_df

Unnamed: 0,season,team_id,team_name,conference,coach_change,games_played,wins,win_pct,avg_fgm,avg_fga,...,avg_fta,avg_orb,avg_drb,avg_ast,avg_to,avg_stl,avg_blk,avg_pf,avg_pts_for,avg_pts_ag
0,2019,1101,Abilene Chr,southland,0,29,23,0.793103,25.344828,55.241379,...,19.068966,9.068966,22.896552,14.620690,11.655172,8.000000,2.551724,19.137931,71.724138,64.896552
1,2019,1102,Air Force,mwc,0,31,13,0.419355,24.741935,55.322581,...,15.645161,7.741935,25.548387,13.290323,13.193548,4.806452,1.806452,16.967742,67.387097,71.903226
2,2019,1103,Akron,mac,0,31,15,0.483871,23.935484,58.870968,...,16.290323,9.322581,26.806452,11.935484,11.903226,5.612903,3.129032,17.483871,68.354839,64.580645
3,2019,1104,Alabama,sec,0,33,18,0.545455,25.060606,56.848485,...,21.878788,11.151515,26.424242,12.151515,13.606061,4.606061,4.666667,16.969697,71.787879,71.303030
4,2019,1105,Alabama A&M,swac,0,32,5,0.156250,23.000000,56.468750,...,14.156250,10.031250,22.000000,12.000000,15.218750,7.312500,1.531250,18.343750,60.562500,71.406250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2142,2025,1476,Stonehill,nec,0,30,13,0.433333,23.300000,53.966667,...,16.966667,7.166667,22.400000,14.066667,11.300000,5.966667,2.600000,16.733333,67.333333,70.900000
2143,2025,1477,East Texas A&M,southland,0,31,5,0.161290,23.000000,55.290323,...,15.483871,7.935484,19.000000,14.161290,14.612903,8.387097,3.903226,16.774194,64.354839,74.870968
2144,2025,1478,Le Moyne,nec,0,30,7,0.233333,24.800000,55.400000,...,20.866667,7.466667,21.666667,13.466667,12.933333,6.666667,2.066667,19.300000,71.933333,81.400000
2145,2025,1479,Mercyhurst,nec,0,28,12,0.428571,22.678571,53.821429,...,16.642857,6.214286,18.214286,13.107143,9.750000,6.607143,1.892857,16.678571,65.785714,71.750000


In [82]:
# Export teamSeasons_df to CSV
teamSeasons_df.to_csv(CLEANED_DIR / "team_season_data.csv", index=False)
#print(f"Exported teamSeasons_df to {CLEANED_DIR / 'team_season_data.csv'}")