In [27]:
import os

import pandas as pd
import numpy as np

import hopsworks

from datetime import datetime, timedelta
from pytz import timezone

from src.webscraping import (
    activate_web_driver,
    scrape_to_dataframe,
    convert_columns,
    combine_home_visitor,  
)

from src.data_processing import (
    process_games,
    add_TARGET,
)

from src.feature_engineering import (
    process_features,
)

from src.hopsworks_utils import (
    save_feature_names,
    convert_feature_names,
)

import json

from pathlib import Path  #for Windows/Linux compatibility
DATAPATH = Path(r'data')

**Load API keys**

In [28]:
from dotenv import load_dotenv

load_dotenv()

try:
    HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']
except:
    raise Exception('Set environment variable HOPSWORKS_API_KEY')

**Scrape Data and Format**

In [29]:

# set search strings for the last seven days 
DAYS = 7
SEASON = "" #no season will cause website to default to current season, format is "2022-23"
TODAY = datetime.now(timezone('EST')) #nba.com uses US Eastern Standard Time
LASTWEEK = (TODAY - timedelta(days=DAYS))
DATETO = TODAY.strftime("%m/%d/%y")
DATEFROM = LASTWEEK.strftime("%m/%d/%y")

# initate a webdriver in selenium 
# since website data is dynamically generated
driver = activate_web_driver('firefox')

df = scrape_to_dataframe(driver, Season=SEASON, DateFrom=DATEFROM, DateTo=DATETO)

driver.close() 

df = convert_columns(df)
df = combine_home_visitor(df)

df_new = df

df_new.head()

2022-12-09 08:30:55,882 INFO: Get LATEST geckodriver version for 107.0 firefox


[WDM] - Downloading: 19.0kB [00:00, 6.50MB/s]                   


2022-12-09 08:30:56,472 INFO: Driver [C:\Users\Chris\.wdm\drivers\geckodriver\win64\0.32\geckodriver.exe] found in cache




Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2022-12-08,1,121,51.6,40.0,68.2,48,28,1610612743,22200376,120,48.4,40.0,100.0,38,30,1610612757,2022
1,2022-12-08,0,109,44.9,34.5,73.3,42,23,1610612745,22200375,118,55.4,36.1,86.7,38,25,1610612759,2022
2,2022-12-08,0,110,46.3,47.2,90.5,43,24,1610612746,22200374,115,51.1,37.5,78.6,43,24,1610612748,2022
3,2022-12-07,0,111,48.8,29.0,69.0,38,22,1610612764,22200367,115,50.0,43.5,88.2,41,29,1610612741,2022
4,2022-12-07,0,113,42.5,21.2,91.4,46,16,1610612747,22200366,126,44.3,27.9,83.3,54,25,1610612761,2022


**Access Feature Store**

In [30]:
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/3350
Connected. Call `.close()` to terminate connection gracefully.


**Access Feature Group**

In [31]:
rolling_stats_fg = fs.get_feature_group(
    name="rolling_stats",
    version=1,
)

**Query Old Data Needed for Feature Engineering of New Data**

To generate features like rolling averages for the new games, older data from previous games is needed since some of the rolling averages might extend back 15 or 20 games or so.

In [32]:
BASE_FEATURES = ['game_date_est',
 'game_id',
 'home_team_id',
 'visitor_team_id',
 'season',
 'pts_home',
 'fg_pct_home',
 'ft_pct_home',
 'fg3_pct_home',
 'ast_home',
 'reb_home',
 'pts_away',
 'fg_pct_away',
 'ft_pct_away',
 'fg3_pct_away',
 'ast_away',
 'reb_away',
 'home_team_wins',
]

ds_query = rolling_stats_fg.select(BASE_FEATURES)
df_old = ds_query.read()
df_old.head()


2022-12-09 08:34:28,157 INFO: USE `nba_predictor_featurestore`
2022-12-09 08:34:28,543 INFO: SELECT `fg0`.`game_date_est` `game_date_est`, `fg0`.`game_id` `game_id`, `fg0`.`home_team_id` `home_team_id`, `fg0`.`visitor_team_id` `visitor_team_id`, `fg0`.`season` `season`, `fg0`.`pts_home` `pts_home`, `fg0`.`fg_pct_home` `fg_pct_home`, `fg0`.`ft_pct_home` `ft_pct_home`, `fg0`.`fg3_pct_home` `fg3_pct_home`, `fg0`.`ast_home` `ast_home`, `fg0`.`reb_home` `reb_home`, `fg0`.`pts_away` `pts_away`, `fg0`.`fg_pct_away` `fg_pct_away`, `fg0`.`ft_pct_away` `ft_pct_away`, `fg0`.`fg3_pct_away` `fg3_pct_away`, `fg0`.`ast_away` `ast_away`, `fg0`.`reb_away` `reb_away`, `fg0`.`home_team_wins` `home_team_wins`
FROM `nba_predictor_featurestore`.`rolling_stats_1` `fg0`




Unnamed: 0,game_date_est,game_id,home_team_id,visitor_team_id,season,pts_home,fg_pct_home,ft_pct_home,fg3_pct_home,ast_home,reb_home,pts_away,fg_pct_away,ft_pct_away,fg3_pct_away,ast_away,reb_away,home_team_wins
0,2017-12-08,21700374,1610612759,1610612738,2017,105,0.468994,0.875,0.295898,16,46,102,0.458008,0.881836,0.289062,14,39,1
1,2013-03-01,21200874,1610612756,1610612737,2012,92,0.444092,0.833008,0.455078,16,38,87,0.425049,0.772949,0.3479,21,43,1
2,2005-11-30,20500210,1610612738,1610612755,2005,110,0.447998,0.78418,0.25,24,59,103,0.408936,0.770996,0.308105,21,40,1
3,2018-12-10,21800395,1610612749,1610612739,2018,108,0.437988,0.817871,0.416992,22,58,92,0.375,0.666992,0.333008,24,46,1
4,2007-03-12,20600946,1610612756,1610612745,2006,103,0.5,0.727051,0.600098,18,50,82,0.385986,0.722168,0.262939,13,36,1


**Convert Feature Names back to original mixed case**

In [33]:
df_old = convert_feature_names(df_old)
df_old.head()

Unnamed: 0,GAME_DATE_EST,GAME_ID,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
0,2017-12-08,21700374,1610612759,1610612738,2017,105,0.468994,0.875,0.295898,16,46,102,0.458008,0.881836,0.289062,14,39,1
1,2013-03-01,21200874,1610612756,1610612737,2012,92,0.444092,0.833008,0.455078,16,38,87,0.425049,0.772949,0.3479,21,43,1
2,2005-11-30,20500210,1610612738,1610612755,2005,110,0.447998,0.78418,0.25,24,59,103,0.408936,0.770996,0.308105,21,40,1
3,2018-12-10,21800395,1610612749,1610612739,2018,108,0.437988,0.817871,0.416992,22,58,92,0.375,0.666992,0.333008,24,46,1
4,2007-03-12,20600946,1610612756,1610612745,2006,103,0.5,0.727051,0.600098,18,50,82,0.385986,0.722168,0.262939,13,36,1


**Combine New Data with Old Data**

In [34]:
df_combined = pd.concat([df_new, df_old])
df_combined.head()

Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON
0,2022-12-08,1,121,51.6,40.0,68.2,48,28,1610612743,22200376,120,48.4,40.0,100.0,38,30,1610612757,2022
1,2022-12-08,0,109,44.9,34.5,73.3,42,23,1610612745,22200375,118,55.4,36.1,86.7,38,25,1610612759,2022
2,2022-12-08,0,110,46.3,47.2,90.5,43,24,1610612746,22200374,115,51.1,37.5,78.6,43,24,1610612748,2022
3,2022-12-07,0,111,48.8,29.0,69.0,38,22,1610612764,22200367,115,50.0,43.5,88.2,41,29,1610612741,2022
4,2022-12-07,0,113,42.5,21.2,91.4,46,16,1610612747,22200366,126,44.3,27.9,83.3,54,25,1610612761,2022


**Data Processing**

In [35]:
df_combined = process_games(df_combined) 
df_combined = add_TARGET(df_combined)
df_combined.head()

Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,PTS_away,FG_PCT_away,FG3_PCT_away,FT_PCT_away,REB_away,AST_away,VISITOR_TEAM_ID,SEASON,PLAYOFF,TARGET
0,2022-12-08,1,121,51.6,40.0,68.2,48,28,1610612743,22200376,120,48.4,40.0,100.0,38,30,1610612757,2022,0,1
1,2022-12-08,0,109,44.9,34.5,73.3,42,23,1610612745,22200375,118,55.4,36.1,86.7,38,25,1610612759,2022,0,0
2,2022-12-08,0,110,46.3,47.2,90.5,43,24,1610612746,22200374,115,51.1,37.5,78.6,43,24,1610612748,2022,0,0
3,2022-12-07,0,111,48.8,29.0,69.0,38,22,1610612764,22200367,115,50.0,43.5,88.2,41,29,1610612741,2022,0,0
4,2022-12-07,0,113,42.5,21.2,91.4,46,16,1610612747,22200366,126,44.3,27.9,83.3,54,25,1610612761,2022,0,0


**Feature Engineering**

In [36]:
# Feature engineering to add: 
    # rolling averages of key stats, 
    # win/lose streaks, 
    # home/away streaks, 
    # specific matchup (team X vs team Y) rolling averages and streaks

df_combined = process_features(df_combined)
df_combined.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Unnamed: 0,GAME_DATE_EST,HOME_TEAM_WINS,PTS_home,FG_PCT_home,FG3_PCT_home,FT_PCT_home,REB_home,AST_home,HOME_TEAM_ID,GAME_ID,...,FG3_PCT_AVG_LAST_10_ALL_x_minus_y,FG3_PCT_AVG_LAST_15_ALL_x_minus_y,AST_AVG_LAST_3_ALL_x_minus_y,AST_AVG_LAST_7_ALL_x_minus_y,AST_AVG_LAST_10_ALL_x_minus_y,AST_AVG_LAST_15_ALL_x_minus_y,REB_AVG_LAST_3_ALL_x_minus_y,REB_AVG_LAST_7_ALL_x_minus_y,REB_AVG_LAST_10_ALL_x_minus_y,REB_AVG_LAST_15_ALL_x_minus_y
0,2003-10-28,1,89,0.439941,0.350098,0.533203,39,25,1610612755,20300001,...,,,,,,,,,,
1,2003-10-28,1,83,0.425049,0.099976,0.769043,38,20,1610612759,20300002,...,,,,,,,,,,
2,2003-10-28,1,109,0.505859,0.350098,0.600098,46,32,1610612747,20300003,...,,,,,,,,,,
3,2003-10-29,1,88,0.323975,0.160034,0.700195,55,24,1610612740,20300006,...,,,,,,,,,,
4,2003-10-29,1,90,0.425049,0.166992,0.799805,45,17,1610612761,20300007,...,,,,,,,,,,


**Insert New Data into Feature Group**

In [37]:
#rolling_stats_fg.insert(df, write_options={"wait_for_job" : False})

df = df_combined[df_new.index.isin(df_combined.index)]

df.head()



ValueError: Item wrong length 51 instead of 22598.