In [1]:
import json
import requests
import pandas as pd
import numpy as np

In [2]:
# Exploratory call against the SportVU player-tracking endpoint.
# Note that no custom request headers are sent from this cell.
base_url = "http://stats.nba.com/stats/leaguedashptstats"
season = "2014-15"
season_type = "Playoffs"
player_or_team = "Player"
measure_type = "CatchShoot"
parameters = {
    "Season": season,
    "SeasonType": season_type,
    "PlayerOrTeam": player_or_team,
    "PtMeasureType": measure_type,
}
data = requests.get(base_url, params=parameters)

In [3]:
# Header values sent with each request made by get_data_from_url_with_parameters.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Ubuntu Chromium/47.0.2526.73 "
    "Chrome/47.0.2526.73 Safari/537.36"
)
REFERER = "http://stats.nba.com/scores/"

In [4]:
def get_sportvu_data_for_stat(
    season,
    season_type,
    player_or_team,
    measure_type,
    start_date="",
    end_date="",
    last_n_games=0,
    league_id="00",
    month=0,
    opponent_team_id=0,
    por_round=0,
    per_mode="Totals",
    team_id=0,
    outcome="",
    location="",
    season_segment="",
    vs_conference="",
    vs_division="",
    game_scope="",
    player_experience="",
    player_position="",
    starter_bench="",
):
    """Query the ``leaguedashptstats`` endpoint for one measure type.

    Every argument is forwarded as a query-string parameter of the same
    meaning (e.g. ``season`` -> ``Season``, ``measure_type`` ->
    ``PtMeasureType``). The request itself is delegated to
    ``get_data_from_url_with_parameters``, asking for result set 0.

    Returns:
        Whatever ``get_data_from_url_with_parameters`` returns for the
        first result set of the response.
    """
    endpoint = "http://stats.nba.com/stats/leaguedashptstats"
    query = {
        "DateFrom": start_date,
        "DateTo": end_date,
        "PlayerOrTeam": player_or_team,
        "PtMeasureType": measure_type,
        "Season": season,
        "SeasonType": season_type,
        "LastNGames": last_n_games,
        "LeagueID": league_id,
        "Month": month,
        "OpponentTeamID": opponent_team_id,
        "PORound": por_round,
        "PerMode": per_mode,
        "TeamID": team_id,
        "Outcome": outcome,
        "Location": location,
        "SeasonSegment": season_segment,
        "VsConference": vs_conference,
        "VsDivision": vs_division,
        "GameScope": game_scope,
        "PlayerExperience": player_experience,
        "PlayerPosition": player_position,
        "StarterBench": starter_bench,
    }
    return get_data_from_url_with_parameters(endpoint, query, 0)

In [5]:
def get_data_from_url_with_parameters(base_url, parameters, index):
    """Fetch one result set from a stats endpoint and return it as a DataFrame.

    Args:
        base_url: URL of the endpoint to query.
        parameters: dict of query-string parameters; must contain
            ``"PtMeasureType"``, which is used to prefix the stat columns.
        index: position of the wanted entry in the response's ``resultSets``.

    Returns:
        DataFrame indexed by (PLAYER_ID, TEAM_ABBREVIATION, PLAYER_NAME,
        TEAM_ID) whose remaining columns are renamed to
        ``"<PtMeasureType>_<header>"``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not answer within the timeout.
        KeyError: if the result set lacks one of the index columns.
    """
    measure_type = parameters["PtMeasureType"]
    response = requests.get(
        base_url,
        params=parameters,
        headers={'User-Agent': USER_AGENT, 'referer': REFERER},
        # Without a timeout an unresponsive server blocks the notebook forever.
        timeout=60,
    )
    # Surface HTTP errors directly instead of failing later while parsing
    # an error page as JSON.
    response.raise_for_status()
    result_set = response.json()['resultSets'][index]
    # Passing columns= (rather than assigning df.columns afterwards) keeps the
    # header names even when rowSet is empty; assigning them to a zero-column
    # frame raises a length-mismatch ValueError.
    df = pd.DataFrame(result_set['rowSet'], columns=result_set['headers'])
    df = df.set_index(['PLAYER_ID', 'TEAM_ABBREVIATION', 'PLAYER_NAME', 'TEAM_ID'])
    # Prefix every stat column so frames for different measures can be merged
    # without name collisions.
    df.columns = ['{}_{}'.format(measure_type, x) for x in df.columns]
    return df

In [6]:
# Fetch a single measure (rebounding, regular season) through the helper.
data = get_sportvu_data_for_stat(
    season="2014-15",
    season_type="Regular Season",
    player_or_team="Player",
    measure_type="Rebounding",
)

In [7]:
# Measure types requested one by one (as PtMeasureType) by sportvu_df.
measures = [
    "CatchShoot",
    "Defense",
    "Drives",
    "Passing",
    "PullUpShot",
    "Rebounding",
    "Efficiency",
    "SpeedDistance",
    "ElbowTouch",
    "PaintTouch",
    "PostTouch",
    "Possessions",
]

In [8]:
def sportvu_df(season, measure_types, season_type="Regular Season",
               player_or_team="Player"):
    """Download several measure types for a season and merge them into one frame.

    Each measure is fetched with ``get_sportvu_data_for_stat`` and inner-joined
    on the index with the frames fetched so far, so only rows present for
    every measure are kept. The merged frame is also written to
    ``'raw_<season>'`` in the working directory.

    Args:
        season: season label forwarded to the API, e.g. ``'2014-15'``.
        measure_types: iterable of measure type names to fetch.
        season_type: forwarded to ``get_sportvu_data_for_stat``.
        player_or_team: forwarded to ``get_sportvu_data_for_stat``.

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: if ``measure_types`` yields no entries.
    """
    sportvu = None
    for m in measure_types:
        print(m)
        input_df = get_sportvu_data_for_stat(season,
                                             season_type,
                                             player_or_team,
                                             measure_type=m)
        if sportvu is None:
            sportvu = input_df
        else:
            sportvu = pd.merge(sportvu,
                               input_df,
                               how='inner',
                               left_index=True,
                               right_index=True)
            print(sportvu.shape)
    # Checked after the loop so that any iterable (including a generator) is
    # handled; previously an empty input failed with AttributeError on None.
    if sportvu is None:
        raise ValueError("measure_types must contain at least one measure type")
    sportvu.to_csv('raw_{}'.format(season))
    return sportvu

In [9]:
# Fetch and merge every measure for 2014-15; sportvu_df also writes the
# merged frame to 'raw_2014-15'.
sportvu_data = sportvu_df(season='2014-15', measure_types=measures)

CatchShoot
Defense
(492, 22)
Drives
(492, 43)
Passing
(492, 57)
PullUpShot
(492, 69)
Rebounding
(492, 100)
Efficiency
(492, 118)
SpeedDistance
(492, 129)
ElbowTouch
(492, 151)
PaintTouch
(492, 173)
PostTouch
(492, 195)
Possessions
(492, 212)


In [40]:
# Load the 2013-14 data from disk. NOTE(review): presumably written by an
# earlier sportvu_df(season='2013-14', ...) run that is not shown in this
# notebook -- confirm.
season = '2013-14'
nba_data = pd.read_csv('raw_2013-14')

In [41]:
# Move the identifying columns into the index so only stat columns remain.
index_columns = ['PLAYER_ID', 'TEAM_ABBREVIATION', 'PLAYER_NAME', 'TEAM_ID']
nba_data = nba_data.set_index(index_columns)

In [42]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import (manifold,  decomposition, lda, random_projection)

In [39]:
# Index levels are (PLAYER_ID, TEAM_ABBREVIATION, PLAYER_NAME, TEAM_ID):
# level 2 holds the player name and level 1 the team abbreviation.
# Both lists are rebuilt again after the games-played filter further down.
player_names = nba_data.index.get_level_values(2).tolist()
team_names = nba_data.index.get_level_values(1).tolist()

In [43]:
# Preview the first five rows of the loaded frame.
nba_data.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CatchShoot_GP,CatchShoot_W,CatchShoot_L,CatchShoot_MIN,CatchShoot_CATCH_SHOOT_FGM,CatchShoot_CATCH_SHOOT_FGA,CatchShoot_CATCH_SHOOT_FG_PCT,CatchShoot_CATCH_SHOOT_PTS,CatchShoot_CATCH_SHOOT_FG3M,CatchShoot_CATCH_SHOOT_FG3A,...,Possessions_TIME_OF_POSS,Possessions_AVG_SEC_PER_TOUCH,Possessions_AVG_DRIB_PER_TOUCH,Possessions_PTS_PER_TOUCH,Possessions_ELBOW_TOUCHES,Possessions_POST_TOUCHES,Possessions_PAINT_TOUCHES,Possessions_PTS_PER_ELBOW_TOUCH,Possessions_PTS_PER_POST_TOUCH,Possessions_PTS_PER_PAINT_TOUCH
PLAYER_ID,TEAM_ABBREVIATION,PLAYER_NAME,TEAM_ID,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
708,BKN,Kevin Garnett,1610612751,53,26,27,1090.0,63,131,0.481,126,0,2,...,45.8,1.19,0.24,0.147,210,137,116,0.143,0.657,0.741
951,MIA,Ray Allen,1610612748,73,47,26,1936.0,124,303,0.409,356,108,270,...,78.1,1.94,1.42,0.29,89,44,40,0.315,0.523,0.525
959,LAL,Steve Nash,1610612747,15,4,11,313.0,7,16,0.438,20,6,13,...,60.5,4.57,5.21,0.128,9,3,3,0.0,0.667,0.667
965,OKC,Derek Fisher,1610612760,81,59,22,1428.0,70,166,0.422,200,60,148,...,122.2,3.37,3.25,0.196,114,11,9,0.184,0.364,0.444
977,LAL,Kobe Bryant,1610612747,6,2,4,177.0,2,10,0.2,5,1,8,...,33.9,4.39,3.77,0.179,31,6,6,0.387,0.0,0.667


In [44]:
# Keep only players with more than 40 games played.
# (Mean-imputation of missing values and numeric coercion were tried here
# earlier and are intentionally not applied.)
played_enough = nba_data['CatchShoot_GP'] > 40
nba_data = nba_data.loc[played_enough]
nba_data.shape

(337, 212)

In [45]:
# Rebuild the label lists so they line up row-for-row with the filtered frame.
# Index levels are (PLAYER_ID, TEAM_ABBREVIATION, PLAYER_NAME, TEAM_ID).
player_names = nba_data.index.get_level_values(2).tolist()
team_names = nba_data.index.get_level_values(1).tolist()

In [46]:
# Standardize every stat column. fit_transform returns a plain array, so the
# rebuilt DataFrame has default integer row and column labels.
scaler = StandardScaler()
nba_data = pd.DataFrame(scaler.fit_transform(nba_data))

In [47]:
def clustering(data, year, k_clusters=7, random_state=None):
    """Fit k-means on ``data`` and save the cluster centers to a text file.

    Args:
        data: samples to cluster, passed straight to ``KMeans.fit``.
        year: label used in the output file name ``nba_centers_<year>.csv``.
        k_clusters: number of clusters to fit.
        random_state: optional seed forwarded to ``KMeans``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to
            get the same clusters on every run.

    Returns:
        The fitted ``KMeans`` estimator.
    """
    center_file = "nba_centers_{}.csv".format(year)
    kmeans = KMeans(n_clusters=k_clusters, random_state=random_state).fit(data)
    # Saved with np.savetxt defaults, i.e. whitespace-delimited despite the
    # .csv extension.
    np.savetxt(center_file, kmeans.cluster_centers_)
    return kmeans

In [48]:
# Player-by-player similarity: transposing makes each player a column, so
# corr() yields the correlation between every pair of players' stat profiles.
corr = nba_data.T.corr()
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
X = corr.values

# Compress the similarity matrix before t-SNE.
X_reduced = TruncatedSVD(n_components=10, random_state=0).fit_transform(X)

method = 'TSNE'
year = '2013-14'
if method == 'TSNE':
    # Seeded like the SVD above so the 2-D layout is reproducible across runs.
    tsne = manifold.TSNE(n_components=2, perplexity=40, verbose=2, random_state=0)
    X_tsne = tsne.fit_transform(X_reduced)
else:
    # Previously an unknown method surfaced as a NameError on X_tsne below.
    raise ValueError("unsupported projection method: {}".format(method))

kmeans = clustering(data=X_tsne, year=year, k_clusters=7)
df_projected = pd.DataFrame(X_tsne)
df_projected['player'] = player_names
df_projected['team'] = team_names
df_projected['cluster'] = kmeans.labels_[0:df_projected.shape[0]]
df_projected.columns = ['x', 'y', 'player', 'team', 'cluster']
# Derived from `year` so the output name cannot drift from the season in use.
df_projected.to_csv('projected_{}'.format(year), index=False)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 337 / 337
[t-SNE] Mean sigma: 2.176658
[t-SNE] Iteration 25: error = 0.6946407, gradient norm = 0.0163737
[t-SNE] Iteration 50: error = 0.7601568, gradient norm = 0.0179642
[t-SNE] Iteration 75: error = 0.5878504, gradient norm = 0.0108830
[t-SNE] Iteration 100: error = 0.8125876, gradient norm = 0.0166604
[t-SNE] Error after 100 iterations with early exaggeration: 0.812588
[t-SNE] Iteration 125: error = 0.5987321, gradient norm = 0.0106252
[t-SNE] Iteration 150: error = 0.7731693, gradient norm = 0.0155650
[t-SNE] Iteration 175: error = 1.4314015, gradient norm = 0.0254308
[t-SNE] Iteration 175: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 175 iterations: 1.431401


In [4]:
library(data.table) 
library(scatterD3) 
library(d3heatmap)
library(ggplot2) 
library(ggrepel) 
library(htmlwidgets)

# Input file read by similarity().
file_path <- 'projected_2013-14'
# Render the projected players as an interactive scatter plot and save it
# as a standalone HTML widget.
#
# The input is always the file named by the global `file_path`; `year` is
# only used to name the output file and in the final log message.
#
# Args:
#   year: season label, e.g. '2013-14'.
similarity <- function(year){
    print(file_path)
    # fread is used on purpose: reading this file with read_csv raised an error.
    dt <- fread(file_path, header=TRUE)
    dt <- dt[team != 'total']

    tooltips <- paste("<strong>", dt$player,"</strong><br /><strong>", dt$team, "</strong><br />")
    # symbol_var previously read `dt$cluters` (typo), which is NULL, so the
    # cluster-to-symbol mapping was silently dropped. The column is `cluster`.
    p <- scatterD3(x = dt$x, y = dt$y, lab = dt$player,
                   col_var = dt$team, symbol_var = dt$cluster,
                   point_opacity = 0.7, tooltip_text = tooltips,
                   col_lab = "Team", symbol_lab = "Cluster",
                   width = 1000, height = 1000)
    # paste0 avoids the spaces paste() inserted into the file name
    # ('nba_player_similarity 2013-14 .html').
    saveWidget(p, paste0('nba_player_similarity_', year, '.html'))
    print(paste(year,"_saved"))
}



In [5]:
# Build and save the similarity plot for the 2013-14 season.
similarity(year = '2013-14')

[1] "projected_2013-14"
[1] "2013-14 _saved"
