In [12]:
import pandas as pd
import datetime as dt

games_df = pd.read_csv('./data/nba-2022-EasternStandardTime.csv')
games_df

Unnamed: 0,Match Number,Round Number,Date,Location,Home Team,Away Team,Result
0,1,1,18/10/2022 19:30,TD Garden,Boston Celtics,Philadelphia 76ers,126 - 117
1,2,1,18/10/2022 22:00,Chase Center,Golden State Warriors,Los Angeles Lakers,123 - 109
2,3,1,19/10/2022 19:00,Little Caesars Arena,Detroit Pistons,Orlando Magic,113 - 109
3,4,1,19/10/2022 19:00,Gainbridge Fieldhouse,Indiana Pacers,Washington Wizards,107 - 114
4,5,1,19/10/2022 19:30,State Farm Arena,Atlanta Hawks,Houston Rockets,117 - 107
...,...,...,...,...,...,...,...
1225,1226,24,09/04/2023 15:30,Paycom Center,Oklahoma City Thunder,Memphis Grizzlies,115 - 100
1226,1227,24,09/04/2023 15:30,Ball Arena,Denver Nuggets,Sacramento Kings,109 - 95
1227,1228,24,09/04/2023 15:30,Crypto.com Arena,Los Angeles Lakers,Utah Jazz,128 - 117
1228,1229,24,09/04/2023 15:30,Footprint Center,Phoenix Suns,LA Clippers,114 - 119


# Game Information

In [13]:
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1230 entries, 0 to 1229
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Match Number  1230 non-null   int64 
 1   Round Number  1230 non-null   int64 
 2   Date          1230 non-null   object
 3   Location      1230 non-null   object
 4   Home Team     1230 non-null   object
 5   Away Team     1230 non-null   object
 6   Result        1230 non-null   object
dtypes: int64(2), object(5)
memory usage: 67.4+ KB


In [14]:
# Find total number of null values from each column
games_df.isnull().sum()

Match Number    0
Round Number    0
Date            0
Location        0
Home Team       0
Away Team       0
Result          0
dtype: int64

In [15]:
# Select specific columns
games_df = games_df[['Date', 'Home Team', 'Away Team', 'Result']]

# Display first five rows
games_df

Unnamed: 0,Date,Home Team,Away Team,Result
0,18/10/2022 19:30,Boston Celtics,Philadelphia 76ers,126 - 117
1,18/10/2022 22:00,Golden State Warriors,Los Angeles Lakers,123 - 109
2,19/10/2022 19:00,Detroit Pistons,Orlando Magic,113 - 109
3,19/10/2022 19:00,Indiana Pacers,Washington Wizards,107 - 114
4,19/10/2022 19:30,Atlanta Hawks,Houston Rockets,117 - 107
...,...,...,...,...
1225,09/04/2023 15:30,Oklahoma City Thunder,Memphis Grizzlies,115 - 100
1226,09/04/2023 15:30,Denver Nuggets,Sacramento Kings,109 - 95
1227,09/04/2023 15:30,Los Angeles Lakers,Utah Jazz,128 - 117
1228,09/04/2023 15:30,Phoenix Suns,LA Clippers,114 - 119


In [16]:
# Split 'Result' column
games_df[['Home Points', 'Away Points']] = games_df['Result'].str.split('-', expand=True)

# Drop 'Result' column
games_df = games_df.drop(columns=['Result'])

games_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df[['Home Points', 'Away Points']] = games_df['Result'].str.split('-', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_df[['Home Points', 'Away Points']] = games_df['Result'].str.split('-', expand=True)


Unnamed: 0,Date,Home Team,Away Team,Home Points,Away Points
0,18/10/2022 19:30,Boston Celtics,Philadelphia 76ers,126,117
1,18/10/2022 22:00,Golden State Warriors,Los Angeles Lakers,123,109
2,19/10/2022 19:00,Detroit Pistons,Orlando Magic,113,109
3,19/10/2022 19:00,Indiana Pacers,Washington Wizards,107,114
4,19/10/2022 19:30,Atlanta Hawks,Houston Rockets,117,107


In [17]:
# Rename columns
games_df = games_df.rename(columns={
    'Date': 'date',
    'Home Team': 'home',
    'Away Team': 'visitor',
    'Home Points': 'home_points',
    'Away Points': 'visitor_points'
})

# Rearrange columns
games_df = games_df[['date', 'visitor', 'home', 'visitor_points', 'home_points']]

# Display first five rows
games_df.head()

Unnamed: 0,date,visitor,home,visitor_points,home_points
0,18/10/2022 19:30,Philadelphia 76ers,Boston Celtics,117,126
1,18/10/2022 22:00,Los Angeles Lakers,Golden State Warriors,109,123
2,19/10/2022 19:00,Orlando Magic,Detroit Pistons,109,113
3,19/10/2022 19:00,Washington Wizards,Indiana Pacers,114,107
4,19/10/2022 19:30,Houston Rockets,Atlanta Hawks,107,117


In [18]:
# Convert to datetime column
games_df['date'] = pd.to_datetime(games_df['date'], errors='coerce')

# Change the format of the date columns
games_df['date'] = games_df['date'].dt.strftime('%Y-%m-%d')

# Display
games_df.head()

  games_df['date'] = pd.to_datetime(games_df['date'], errors='coerce')


Unnamed: 0,date,visitor,home,visitor_points,home_points
0,2022-10-18,Philadelphia 76ers,Boston Celtics,117,126
1,2022-10-18,Los Angeles Lakers,Golden State Warriors,109,123
2,2022-10-19,Orlando Magic,Detroit Pistons,109,113
3,2022-10-19,Washington Wizards,Indiana Pacers,114,107
4,2022-10-19,Houston Rockets,Atlanta Hawks,107,117


In [19]:
# Export to excel file
games_df.to_csv('./data/2022-2023_csvGameScores.csv', index=False)