In [507]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import statistics
import time

# to ignore the warnings
from warnings import filterwarnings

# Data From API

In [508]:
# Read the raw game-score workbook into a DataFrame and preview the first rows
scores_df = pd.read_excel(io='./data/game_scores.xlsx')
scores_df.head(5)

Unnamed: 0,date,visitor,home,visitor_points,home_points
0,2015-11-25T01:00:00.000Z,Boston Celtics,Atlanta Hawks,97.0,121.0
1,2016-04-09T23:30:00.000Z,Boston Celtics,Atlanta Hawks,107.0,118.0
2,2016-04-16T23:00:00.000Z,Boston Celtics,Atlanta Hawks,101.0,102.0
3,2016-04-19T23:00:00.000Z,Boston Celtics,Atlanta Hawks,72.0,89.0
4,2016-04-27T00:30:00.000Z,Boston Celtics,Atlanta Hawks,83.0,110.0


In [509]:
# Print column dtypes and non-null counts to spot missing values early
scores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14242 entries, 0 to 14241
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            14242 non-null  object 
 1   visitor         14242 non-null  object 
 2   home            14242 non-null  object 
 3   visitor_points  12796 non-null  float64
 4   home_points     12796 non-null  float64
dtypes: float64(2), object(3)
memory usage: 556.5+ KB


In [510]:
# List the distinct visiting-team names, in order of first appearance
pd.unique(scores_df['visitor'])

array(['Boston Celtics', 'Atlanta Hawks', 'Brooklyn Nets',
       'Charlotte Hornets', 'Chicago Bulls', 'Cleveland Cavaliers',
       'Dallas Mavericks', 'Denver Nuggets', 'Detroit Pistons',
       'Golden State Warriors', 'Houston Rockets', 'Indiana Pacers',
       'LA Clippers', 'Los Angeles Lakers', 'Memphis Grizzlies',
       'Miami Heat', 'Milwaukee Bucks', 'Minnesota Timberwolves',
       'New Orleans Pelicans', 'New York Knicks', 'Oklahoma City Thunder',
       'Orlando Magic', 'Philadelphia 76ers', 'Phoenix Suns',
       'Portland Trail Blazers', 'Sacramento Kings', 'San Antonio Spurs',
       'Toronto Raptors', 'Utah Jazz', 'Washington Wizards'], dtype=object)

# Date Correction

In [511]:
# From this dataframe it is found that games whose timestamp falls in the early
# morning are previous-day games (the raw timestamps end in 'Z', i.e. UTC).
# These games have to be adjusted to match the actual game schedule.

# Values that still carry a time component contain the ISO 'T' separator.
# Tracking this keeps the cell idempotent: once the column has been reduced to
# 'YYYY-MM-DD' strings, re-parsing yields midnight for every row, and without
# this guard a re-run would push every game back by another day.
has_time = scores_df['date'].astype(str).str.contains('T', regex=False)

# 1. Parse the date column; values that cannot be parsed become NaT
game_time = pd.to_datetime(scores_df['date'], errors='coerce')

# 2. Hours 0-3 (inclusive) belong to the previous day's games, so subtract one
#    day from those rows (found empirically: row count started to change
#    between 3am and 4am, so 3am is the limit)
late_night = has_time & game_time.dt.hour.between(0, 3)
game_time = game_time.where(~late_night, game_time - pd.Timedelta(days=1))

# 3. Keep only the date portion (year-month-day) as a string; NaT becomes NaN
scores_df['date'] = game_time.dt.strftime('%Y-%m-%d')

# Display
scores_df.head()

Unnamed: 0,date,visitor,home,visitor_points,home_points
0,2015-11-24,Boston Celtics,Atlanta Hawks,97.0,121.0
1,2016-04-09,Boston Celtics,Atlanta Hawks,107.0,118.0
2,2016-04-16,Boston Celtics,Atlanta Hawks,101.0,102.0
3,2016-04-19,Boston Celtics,Atlanta Hawks,72.0,89.0
4,2016-04-26,Boston Celtics,Atlanta Hawks,83.0,110.0


In [512]:
# Flag every row that exactly repeats an earlier row (first occurrence is kept)
is_repeat = scores_df.duplicated()
duplicate_count = is_repeat.sum()

# Filter the repeats out and report how many were removed
scores_df = scores_df[~is_repeat]
print(f"{duplicate_count} duplicated rows have been dropped.")

34 duplicated rows have been dropped.


In [513]:
# Number of missing values in each column
scores_df.isna().sum()

date               283
visitor              0
home                 0
visitor_points    1412
home_points       1412
dtype: int64

In [514]:
# Show every row that has at least one missing value
scores_df.loc[scores_df.isna().any(axis='columns')]

Unnamed: 0,date,visitor,home,visitor_points,home_points
46,2024-11-12,Atlanta Hawks,Boston Celtics,,
47,2025-01-18,Atlanta Hawks,Boston Celtics,,
82,2025-03-16,Atlanta Hawks,Brooklyn Nets,,
83,2025-04-10,Atlanta Hawks,Brooklyn Nets,,
92,,Charlotte Hornets,Atlanta Hawks,,
...,...,...,...,...,...
14210,,Toronto Raptors,Washington Wizards,,
14220,2025-01-29,Toronto Raptors,Washington Wizards,,
14221,2025-03-24,Toronto Raptors,Washington Wizards,,
14231,2025-03-05,Utah Jazz,Washington Wizards,,


In [515]:
# Keep only complete rows: discard any row with at least one missing value
api_df = scores_df.dropna(axis='index', how='any')

# Summarise the cleaned frame
api_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12796 entries, 0 to 14240
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            12796 non-null  object 
 1   visitor         12796 non-null  object 
 2   home            12796 non-null  object 
 3   visitor_points  12796 non-null  float64
 4   home_points     12796 non-null  float64
dtypes: float64(2), object(3)
memory usage: 599.8+ KB


In [516]:
# Export the cleaned games (no missing values) to a CSV file, without the index
api_df.to_csv('./data/2015-2024_apiGamesScores.csv', index=False)