In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Load the two game-score sources from the local data directory
api_df = pd.read_csv('./data/2015-2024_apiGameScores.csv')
excel_df = pd.read_csv('./data/2022-2023_csvGameScores.csv')

# Preview the first rows of each frame. Using sep='\n' puts every argument on
# its own line; the trailing '\n' argument yields the blank gap that separates
# the two previews.
print('API file', api_df.head(), '\n', sep='\n')
print('Excel file', excel_df.head(), sep='\n')

API file
         date         visitor           home  visitor_points  home_points
0  2015-11-24  Boston Celtics  Atlanta Hawks            97.0        121.0
1  2016-04-09  Boston Celtics  Atlanta Hawks           107.0        118.0
2  2016-04-16  Boston Celtics  Atlanta Hawks           101.0        102.0
3  2016-04-19  Boston Celtics  Atlanta Hawks            72.0         89.0
4  2016-04-26  Boston Celtics  Atlanta Hawks            83.0        110.0


Excel file
         date             visitor                   home  visitor_points  \
0  2022-10-18  Philadelphia 76ers         Boston Celtics             117   
1  2022-10-18  Los Angeles Lakers  Golden State Warriors             109   
2  2022-10-19       Orlando Magic        Detroit Pistons             109   
3  2022-10-19  Washington Wizards         Indiana Pacers             114   
4  2022-10-19     Houston Rockets          Atlanta Hawks             107   

   home_points  
0          126  
1          123  
2          113  
3        

# Union

In [3]:
# Report how many rows each source contributes before they are combined
print(f"Api Data Frame has {api_df.shape[0]} rows of data.")
print(f"Excel Date Frame has {excel_df.shape[0]} rows of data.")


Api Data Frame has 12796 rows of data.
Excel Date Frame has 1230 rows of data.


In [4]:
# Stack both sources into a single frame with a fresh 0..n-1 index.
# No `keys` argument is passed: `keys` expects one label per concatenated
# object (it builds a hierarchical index), so the bare string 'date' was not a
# meaningful value, and any keys are discarded anyway when ignore_index=True.
# Recent pandas versions also warn when len(keys) != len(objs).
merged_df = pd.concat([api_df, excel_df], ignore_index=True)

# Order chronologically by the 'date' column. The dates are read as
# YYYY-MM-DD strings (see the previews above), which sort correctly as text.
merged_df = merged_df.sort_values(by='date').reset_index(drop=True)

# Display
merged_df

  merged_df = pd.concat([api_df, excel_df], keys='date', ignore_index=True)


Unnamed: 0,date,visitor,home,visitor_points,home_points
0,2015-10-02,Denver Nuggets,LA Clippers,96.0,103.0
1,2015-10-03,New Orleans Pelicans,Indiana Pacers,110.0,105.0
2,2015-10-03,Charlotte Hornets,Orlando Magic,106.0,100.0
3,2015-10-04,Charlotte Hornets,Miami Heat,90.0,77.0
4,2015-10-04,LA Clippers,Toronto Raptors,73.0,93.0
...,...,...,...,...,...
14021,2024-11-04,Philadelphia 76ers,Phoenix Suns,116.0,118.0
14022,2024-11-04,Indiana Pacers,Dallas Mavericks,134.0,127.0
14023,2024-11-04,Sacramento Kings,Miami Heat,111.0,110.0
14024,2024-11-04,Los Angeles Lakers,Detroit Pistons,103.0,115.0


In [5]:
# Count the missing values in every column of the merged frame
merged_df.isna().sum()

date              0
visitor           0
home              0
visitor_points    0
home_points       0
dtype: int64

In [6]:
# Flag rows that exactly repeat an earlier row (all columns compared), then
# reuse that single mask for both the count and the displayed rows
duplicate_mask = merged_df.duplicated()
duplicate_rows = merged_df[duplicate_mask]

duplicate_count = len(duplicate_rows)
print(f"Total duplicates in merged data frame: {duplicate_count}")

# Display
duplicate_rows

Total duplicates in merged data frame: 1229


Unnamed: 0,date,visitor,home,visitor_points,home_points
9916,2022-10-18,Philadelphia 76ers,Boston Celtics,117.0,126.0
9918,2022-10-18,Los Angeles Lakers,Golden State Warriors,109.0,123.0
9925,2022-10-19,Washington Wizards,Indiana Pacers,114.0,107.0
9927,2022-10-19,Orlando Magic,Detroit Pistons,109.0,113.0
9931,2022-10-19,New Orleans Pelicans,Brooklyn Nets,130.0,108.0
...,...,...,...,...,...
12371,2023-04-09,New Orleans Pelicans,Minnesota Timberwolves,108.0,113.0
12372,2023-04-09,San Antonio Spurs,Dallas Mavericks,138.0,117.0
12373,2023-04-09,Milwaukee Bucks,Toronto Raptors,105.0,121.0
12374,2023-04-09,Utah Jazz,Los Angeles Lakers,117.0,128.0


In [7]:
# Example: every game in the merged frame where Detroit visited San Antonio
is_pistons_visiting = merged_df['visitor'].eq('Detroit Pistons')
is_spurs_hosting = merged_df['home'].eq('San Antonio Spurs')
merged_df.loc[is_pistons_visiting & is_spurs_hosting]

Unnamed: 0,date,visitor,home,visitor_points,home_points
65,2015-10-18,Detroit Pistons,San Antonio Spurs,92.0,96.0
1000,2016-03-02,Detroit Pistons,San Antonio Spurs,81.0,97.0
1635,2016-11-11,Detroit Pistons,San Antonio Spurs,86.0,96.0
3303,2017-12-04,Detroit Pistons,San Antonio Spurs,93.0,96.0
4395,2018-10-05,Detroit Pistons,San Antonio Spurs,93.0,117.0
5353,2019-02-27,Detroit Pistons,San Antonio Spurs,93.0,105.0
6381,2019-12-28,Detroit Pistons,San Antonio Spurs,109.0,136.0
8000,2021-04-22,Detroit Pistons,San Antonio Spurs,91.0,106.0
8925,2021-12-26,Detroit Pistons,San Antonio Spurs,109.0,144.0
11083,2023-01-06,Detroit Pistons,San Antonio Spurs,109.0,121.0


In [8]:
# Keep only the first occurrence of each fully identical row
final_df = merged_df[~merged_df.duplicated()]

print(f"Total number of rows in final dataset is : {final_df.shape[0]}")

Total number of rows in final dataset is : 12797


In [9]:
# Write the de-duplicated frame to disk. index=False leaves the row index out
# of the file. The path is a plain string literal: it has no placeholders, so
# the f-string prefix was unnecessary.
final_df.to_csv('./data/finalMergedSet.csv', index=False)