In [17]:
import pandas as pd
import datetime as dt

In [18]:
# Read in CSV files
api_df = pd.read_csv('./data/2015-2024_apiGameScores.csv')
excel_df = pd.read_csv('./data/2022-2023_csvGameScores.csv')

print('API file')
print(api_df.head())
print('\n')

print('Excel file')
print(excel_df.head())

API file
         date         visitor           home  visitor_points  home_points
0  2015-11-24  Boston Celtics  Atlanta Hawks            97.0        121.0
1  2016-04-09  Boston Celtics  Atlanta Hawks           107.0        118.0
2  2016-04-16  Boston Celtics  Atlanta Hawks           101.0        102.0
3  2016-04-19  Boston Celtics  Atlanta Hawks            72.0         89.0
4  2016-04-26  Boston Celtics  Atlanta Hawks            83.0        110.0


Excel file
         date             visitor                   home  visitor_points  \
0  2023-10-24  Los Angeles Lakers         Denver Nuggets             107   
1  2023-10-24        Phoenix Suns  Golden State Warriors             108   
2  2023-10-25       Atlanta Hawks      Charlotte Hornets             110   
3  2023-10-25  Washington Wizards         Indiana Pacers             120   
4  2023-10-25      Boston Celtics        New York Knicks             108   

   home_points  
0          119  
1          104  
2          116  
3        

# Union

In [25]:
print(f"Api Data Frame has {len(api_df)} rows of data.")
print(f"Excel Date Frame has {len(excel_df)} rows of data.")


Api Data Frame has 12796 rows of data.
Excel Date Frame has 1231 rows of data.


In [31]:
# Merge both data frames
merged_df = pd.concat([api_df, excel_df], keys='date', ignore_index=True)

# Order by 'date' column
merged_df = merged_df.sort_values(by='date').reset_index(drop=True)

# Display
merged_df

  merged_df = pd.concat([api_df, excel_df], keys='date', ignore_index=True)


Unnamed: 0,date,visitor,home,visitor_points,home_points
0,2015-10-02,Denver Nuggets,LA Clippers,96.0,103.0
1,2015-10-03,New Orleans Pelicans,Indiana Pacers,110.0,105.0
2,2015-10-03,Charlotte Hornets,Orlando Magic,106.0,100.0
3,2015-10-04,Utah Jazz,Los Angeles Lakers,90.0,71.0
4,2015-10-04,Charlotte Hornets,Miami Heat,90.0,77.0
...,...,...,...,...,...
14022,2024-11-04,Boston Celtics,Atlanta Hawks,123.0,93.0
14023,2024-11-04,Indiana Pacers,Dallas Mavericks,134.0,127.0
14024,2024-11-04,Orlando Magic,Oklahoma City Thunder,86.0,102.0
14025,2024-11-04,San Antonio Spurs,LA Clippers,104.0,113.0


In [33]:
# Find total count of null values within each column
merged_df.isnull().sum()

date              0
visitor           0
home              0
visitor_points    0
home_points       0
dtype: int64

In [None]:
# Find total count of duplicates
duplicate_count = len(merged_df[merged_df.duplicated()])
print(f"Total duplicates in merged data frame: {duplicate_count}")

# Display
merged_df[merged_df.duplicated()]

Total duplicates in merged data frame: 1227


Unnamed: 0,date,visitor,home,visitor_points,home_points
11301,2023-10-24,Los Angeles Lakers,Denver Nuggets,107.0,119.0
11302,2023-10-24,Phoenix Suns,Golden State Warriors,108.0,104.0
11310,2023-10-25,Detroit Pistons,Miami Heat,102.0,103.0
11313,2023-10-25,Minnesota Timberwolves,Toronto Raptors,94.0,97.0
11316,2023-10-25,Oklahoma City Thunder,Chicago Bulls,124.0,104.0
...,...,...,...,...,...
13756,2024-04-14,Atlanta Hawks,Indiana Pacers,115.0,157.0
13757,2024-04-14,Washington Wizards,Boston Celtics,122.0,132.0
13758,2024-04-14,Toronto Raptors,Miami Heat,103.0,118.0
13759,2024-04-14,Houston Rockets,LA Clippers,116.0,105.0


In [None]:
# Show example
merged_df.loc[(merged_df['visitor'] == 'Detroit Pistons')  & (merged_df['home'] == 'San Antonio Spurs')]

Unnamed: 0,date,visitor,home,visitor_points,home_points
69,2015-10-18,Detroit Pistons,San Antonio Spurs,92.0,96.0
996,2016-03-02,Detroit Pistons,San Antonio Spurs,81.0,97.0
1633,2016-11-11,Detroit Pistons,San Antonio Spurs,86.0,96.0
3310,2017-12-04,Detroit Pistons,San Antonio Spurs,93.0,96.0
4393,2018-10-05,Detroit Pistons,San Antonio Spurs,93.0,117.0
5348,2019-02-27,Detroit Pistons,San Antonio Spurs,93.0,105.0
6372,2019-12-28,Detroit Pistons,San Antonio Spurs,109.0,136.0
8002,2021-04-22,Detroit Pistons,San Antonio Spurs,91.0,106.0
8927,2021-12-26,Detroit Pistons,San Antonio Spurs,109.0,144.0
10503,2023-01-06,Detroit Pistons,San Antonio Spurs,109.0,121.0


In [None]:
# Drop duplicates
final_df = merged_df.drop_duplicates()

print(f"Total number of rows in final dataset is : {len(final_df)}")

Total number of rows in final dataset is : 12800


In [None]:
# Export as CSV file
final_df.to_csv(f'./data/finalMergedSet.csv', index=False)