In [None]:
pip install -U kaleido



In [None]:
!pip install nba_api pandas numpy scikit-learn umap-learn plotly kaleido

import pandas as pd
import numpy as np
import time

from nba_api.stats.static import players
from nba_api.stats.endpoints import playercareerstats, leaguedashplayerstats

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import umap
from sklearn.cluster import KMeans

import plotly.express as px
import plotly.graph_objects as go

print("Libraries installed and imported successfully!")

Libraries installed and imported successfully!


In [None]:
# Players to analyse: the last 15 NBA Most Valuable Players (per the original
# note), listed from most recent backwards.
player_names_of_interest = [
    "Shai Gilgeous-Alexander",
    "Nikola Jokić",
    "Joel Embiid",
    "Giannis Antetokounmpo",
    "James Harden",
    "Russell Westbrook",
    "Stephen Curry",
    "Kevin Durant",
    "LeBron James",
    "Derrick Rose",
    "Kobe Bryant",
    "Dirk Nowitzki",
    "Steve Nash",
    "Kevin Garnett",
    "Tim Duncan"
]

# Static player directory shipped with nba_api (a list of dicts).
nba_players = players.get_players()

# Resolve each requested name to its player record with a case-insensitive,
# exact full-name comparison; the first matching record wins.
#   player_ids_map:     requested name -> record 'id'
#   active_players_map: requested name -> record 'is_active'
player_ids_map = {}
active_players_map = {}
for player_name in player_names_of_interest:
    wanted = player_name.lower()
    found_player = next(
        (candidate for candidate in nba_players if candidate['full_name'].lower() == wanted),
        None,
    )
    if found_player is None:
        print(f"Could not find player: {player_name}")
        continue
    player_ids_map[player_name] = found_player['id']
    active_players_map[player_name] = found_player['is_active']
    print(f"Found player: {player_name} with ID: {found_player['id']}")



Found player: Shai Gilgeous-Alexander with ID: 1628983
Found player: Nikola Jokić with ID: 203999
Found player: Joel Embiid with ID: 203954
Found player: Giannis Antetokounmpo with ID: 203507
Found player: James Harden with ID: 201935
Found player: Russell Westbrook with ID: 201566
Found player: Stephen Curry with ID: 201939
Found player: Kevin Durant with ID: 201142
Found player: LeBron James with ID: 2544
Found player: Derrick Rose with ID: 201565
Found player: Kobe Bryant with ID: 977
Found player: Dirk Nowitzki with ID: 1717
Found player: Steve Nash with ID: 959
Found player: Kevin Garnett with ID: 708
Found player: Tim Duncan with ID: 1495


In [None]:
# Download per-season statistics for every player in player_ids_map.
#
# Pass 1 asks PlayerCareerStats which seasons each player appears in
#        (stored in player_season_years_map, keyed by player id).
# Pass 2 downloads the year-over-year dashboard (Per100Possessions) for each
#        player that pass 1 succeeded for, and stacks the results into
#        final_stats_df (an empty DataFrame when nothing could be fetched).

# Imported once, outside the per-player try/except: a failing import is a
# setup problem and must not be reported as a fetch error for every player.
from nba_api.stats.endpoints import playerdashboardbyyearoveryear

REQUEST_TIMEOUT_SECONDS = 60
REQUEST_PAUSE_SECONDS = 1  # pause between consecutive API requests

# Not populated by this cell; kept so the name stays defined for any other
# cell that may reference it.
all_player_season_stats = []

player_season_years_map = {}

for player_name, player_id in player_ids_map.items():
    print(f"\nFetching career stats for {player_name} (ID: {player_id}) to identify seasons...")
    try:
        career = playercareerstats.PlayerCareerStats(player_id=player_id, timeout=REQUEST_TIMEOUT_SECONDS)
        career_df = career.get_data_frames()[0]
        # Sort before both storing and previewing so the printed first/last
        # seasons agree with what is kept in the map.
        seasons_played = sorted(career_df['SEASON_ID'].unique().tolist())
        player_season_years_map[player_id] = seasons_played
        print(f"Identified {len(seasons_played)} seasons for {player_name}: {seasons_played[:5]}...{seasons_played[-5:] if len(seasons_played) > 5 else ''}")
    except Exception as e:
        # Best effort: report and carry on with the remaining players.
        print(f"Error fetching career stats for {player_name}: {e}")
    finally:
        # Pause after failures as well as successes, so a failed request is
        # not immediately followed by the next one.
        time.sleep(REQUEST_PAUSE_SECONDS)

print("\n--- Now fetching detailed stats per season using PlayerDashboardByYearOverYear ---")

detailed_player_stats_list = []

for player_name, player_id in player_ids_map.items():
    print(f"\nFetching year-over-year dashboard for {player_name}...")
    if player_id not in player_season_years_map:
        # Pass 1 failed for this player; do not spend a request on them.
        print(f"No seasons identified for {player_name}, skipping dashboard fetch.")
        continue
    try:
        player_dashboard = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
            player_id=player_id,
            per_mode_detailed='Per100Possessions',
            timeout=REQUEST_TIMEOUT_SECONDS
        )

        player_stats_df = player_dashboard.by_year_player_dashboard.get_data_frame()

        if player_stats_df is not None and not player_stats_df.empty:
            # Tag every row with the player so rows stay attributable after
            # the per-player frames are concatenated.
            player_stats_df['PLAYER_ID'] = player_id
            player_stats_df['PLAYER_NAME'] = player_name
            detailed_player_stats_list.append(player_stats_df)
            print(f"Fetched {len(player_stats_df)} seasons of dashboard data for {player_name}.")
        else:
            print(f"No ByYearPlayerDashboard data found for {player_name} with Per100Possessions.")

    except Exception as e:
        print(f"Error fetching dashboard stats for {player_name} (ID: {player_id}): {e}")
    finally:
        time.sleep(REQUEST_PAUSE_SECONDS)


if detailed_player_stats_list:
    final_stats_df = pd.concat(detailed_player_stats_list, ignore_index=True)
    print(f"\nSuccessfully combined stats for {final_stats_df['PLAYER_NAME'].nunique()} players over various seasons.")
    print(f"Shape of combined DataFrame: {final_stats_df.shape}")
    print("Sample data:")
    print(final_stats_df.head())
    print("\nColumns available:")
    print(final_stats_df.columns.tolist())
else:
    print("\nNo detailed stats were fetched. Further cells might not work.")
    final_stats_df = pd.DataFrame()


Fetching career stats for Shai Gilgeous-Alexander (ID: 1628983) to identify seasons...
Identified 7 seasons for Shai Gilgeous-Alexander: ['2018-19', '2019-20', '2020-21', '2021-22', '2022-23']...['2020-21', '2021-22', '2022-23', '2023-24', '2024-25']

Fetching career stats for Nikola Jokić (ID: 203999) to identify seasons...
Identified 10 seasons for Nikola Jokić: ['2015-16', '2016-17', '2017-18', '2018-19', '2019-20']...['2020-21', '2021-22', '2022-23', '2023-24', '2024-25']

Fetching career stats for Joel Embiid (ID: 203954) to identify seasons...
Identified 9 seasons for Joel Embiid: ['2016-17', '2017-18', '2018-19', '2019-20', '2020-21']...['2020-21', '2021-22', '2022-23', '2023-24', '2024-25']

Fetching career stats for Giannis Antetokounmpo (ID: 203507) to identify seasons...
Identified 12 seasons for Giannis Antetokounmpo: ['2013-14', '2014-15', '2015-16', '2016-17', '2017-18']...['2020-21', '2021-22', '2022-23', '2023-24', '2024-25']

Fetching career stats for James Harden (ID

In [None]:
# Turn final_stats_df into the feature matrices used by the following cells.
#
# Outputs (always defined, empty DataFrames when preprocessing cannot run):
#   identifiers  - PLAYER_NAME / PLAYER_ID / GROUP_VALUE for each kept row
#   X_imputed_df - selected features with missing values mean-imputed
#   X_scaled_df  - X_imputed_df standardised with StandardScaler

# Initialise every output first so that each early-exit path below still
# leaves the names defined; later cells test them with `.empty`.
X_imputed_df = pd.DataFrame()
X_scaled_df = pd.DataFrame()
identifiers = pd.DataFrame()

if final_stats_df.empty:
    print("final_stats_df is empty. Skipping preprocessing.")
else:
    potential_features = [
        'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
        'OREB', 'DREB', 'AST', 'TOV', 'STL', 'BLK', 'PTS',
        'USG_PCT', 'PIE', 'TS_PCT'
    ]

    # Only features actually present in the fetched data can be used.
    features_to_use = [f for f in potential_features if f in final_stats_df.columns]
    missing_features = [f for f in potential_features if f not in final_stats_df.columns]
    if missing_features:
        print(f"Warning: requested features not present in the data and ignored: {missing_features}")

    identifier_cols = ['PLAYER_NAME', 'PLAYER_ID', 'GROUP_VALUE']

    # Fall back to SEASON_ID as the season label when GROUP_VALUE is absent.
    # Rebinding (instead of inplace=True) keeps this cell safe to re-run.
    if 'GROUP_VALUE' not in final_stats_df.columns and 'SEASON_ID' in final_stats_df.columns:
        final_stats_df = final_stats_df.rename(columns={'SEASON_ID': 'GROUP_VALUE'})

    if not features_to_use:
        print("Error: No features selected or available in the dataframe. Aborting.")
    elif 'GROUP_VALUE' not in final_stats_df.columns:
        print("Error: Season identifier ('GROUP_VALUE' or 'SEASON_ID') not found. Aborting.")
    else:
        print(f"Selected features for style DNA: {features_to_use}")

        min_games_played = 20
        min_minutes_played_per_game_approx = 10
        if 'GP' in final_stats_df.columns and 'MIN' in final_stats_df.columns:
            enough_games = final_stats_df['GP'] >= min_games_played
            # The previous expression, MIN >= GP * k / GP, cancels to MIN >= k
            # for every row that passes the games filter, so it is written
            # directly here (same rows selected).
            # NOTE(review): the data is requested as Per100Possessions, so MIN
            # may not be a per-game figure - confirm this threshold means what
            # its name says.
            enough_minutes = final_stats_df['MIN'] >= min_minutes_played_per_game_approx
            final_stats_df_filtered = final_stats_df[enough_games & enough_minutes].copy()
            print(f"Filtered down to {len(final_stats_df_filtered)} player-seasons meeting criteria.")
        else:
            print("Warning: GP or MIN columns not found. Skipping playing time filter. Results might include low-activity seasons.")
            final_stats_df_filtered = final_stats_df.copy()

        if not final_stats_df_filtered.empty:
            X = final_stats_df_filtered[features_to_use].copy()
            identifiers = final_stats_df_filtered[identifier_cols].copy()

            # Fill gaps with the column mean, then standardise each feature.
            imputer = SimpleImputer(strategy='mean')
            X_imputed = imputer.fit_transform(X)
            X_imputed_df = pd.DataFrame(X_imputed, columns=features_to_use, index=X.index)

            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_imputed_df)
            X_scaled_df = pd.DataFrame(X_scaled, columns=features_to_use, index=X.index)

            print("\nData preprocessed and scaled successfully.")
            print("Shape of scaled features (X_scaled_df):", X_scaled_df.shape)
            print("Sample of scaled data:")
            print(X_scaled_df.head())
        else:
            print("No player-seasons remaining after filtering. Cannot proceed.")

Selected features for style DNA: ['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'AST', 'TOV', 'STL', 'BLK', 'PTS']
Filtered down to 242 player-seasons meeting criteria.

Data preprocessed and scaled successfully.
Shape of scaled features (X_scaled_df): (242, 13)
Sample of scaled data:
        FGM       FGA      FG3M      FG3A       FTM       FTA      OREB  \
0  1.901960  1.626726  0.685781  0.661930  1.485030  1.080681 -0.612985   
1  1.461422  0.965388 -0.188395 -0.147368  1.299860  1.020540 -0.612985   
2  0.976830  0.769436 -0.591861 -0.605970  2.188675  1.712166 -0.696137   
3  0.139807  0.548990  0.147826  0.473093  0.411046  0.419126 -0.862440   
4  0.183861 -0.112349  0.618537  0.392164  0.225876  0.238702 -0.945591   

       DREB       AST       TOV       STL       BLK       PTS  
0 -0.840273  0.275267 -0.798905  1.364685  0.210296  1.953268  
1 -0.627313  0.185449 -1.075889  2.127396 -0.008123  1.434872  
2 -0.992387 -0.233698 -0.429593  0.792652 -0.008123  1.37

In [None]:
# Project the scaled features to two dimensions with UMAP and attach the
# player/season identifiers, producing plot_df (empty when there is no input).
if X_scaled_df.empty:
    print("Scaled features dataframe is empty. Skipping UMAP.")
    plot_df = pd.DataFrame()
else:
    print("\n--- Performing Dimensionality Reduction using UMAP ---")

    umap_settings = dict(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    reducer = umap.UMAP(**umap_settings)

    embedding = reducer.fit_transform(X_scaled_df)

    embedding_df = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'], index=X_scaled_df.index)

    # Both halves get a fresh 0..n-1 index so they are glued together by row
    # position rather than by the original index labels.
    identifier_part = identifiers.reset_index(drop=True)
    embedding_part = embedding_df.reset_index(drop=True)
    plot_df = pd.concat([identifier_part, embedding_part], axis=1)

    print("UMAP embedding created successfully.")
    print("Shape of embedding DataFrame:", plot_df.shape)
    print("Sample of UMAP embedding with identifiers:")
    print(plot_df.head())


--- Performing Dimensionality Reduction using UMAP ---


  warn(


UMAP embedding created successfully.
Shape of embedding DataFrame: (242, 5)
Sample of UMAP embedding with identifiers:
               PLAYER_NAME  PLAYER_ID GROUP_VALUE     UMAP1     UMAP2
0  Shai Gilgeous-Alexander    1628983     2024-25  7.559562  0.753341
1  Shai Gilgeous-Alexander    1628983     2023-24  7.419311  0.832237
2  Shai Gilgeous-Alexander    1628983     2022-23  7.996895  0.671564
3  Shai Gilgeous-Alexander    1628983     2021-22  8.774911  1.389956
4  Shai Gilgeous-Alexander    1628983     2020-21  9.731174  1.220499


In [None]:
# Group the 2-D UMAP points into clusters and store the label, as a string,
# in plot_df['Cluster'] (string labels make plotly treat them as categories).
if not plot_df.empty and 'UMAP1' in plot_df.columns:
    print("\n--- Performing Clustering using K-Means (Optional) ---")

    # KMeans refuses to fit when asked for more clusters than there are
    # samples, so cap the requested count at the number of rows available.
    n_clusters = min(6, len(plot_df))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    # Cluster the coordinates held in plot_df itself: that is the frame the
    # guard above checked and the frame the labels are written back to.
    cluster_labels = kmeans.fit_predict(plot_df[['UMAP1', 'UMAP2']])
    plot_df['Cluster'] = pd.Series(cluster_labels, index=plot_df.index).astype(str)

    print(f"Assigned player-seasons to {n_clusters} clusters.")
    print("Sample data with cluster assignments:")
    print(plot_df.head())
else:
    print("Plot DataFrame is empty or UMAP components not found. Skipping K-Means clustering.")
    # Non-empty frame without UMAP columns: still provide a Cluster column.
    if 'Cluster' not in plot_df.columns and not plot_df.empty:
        plot_df['Cluster'] = 'N/A'


--- Performing Clustering using K-Means (Optional) ---
Assigned player-seasons to 6 clusters.
Sample data with cluster assignments:
               PLAYER_NAME  PLAYER_ID GROUP_VALUE     UMAP1     UMAP2 Cluster
0  Shai Gilgeous-Alexander    1628983     2024-25  7.559562  0.753341       0
1  Shai Gilgeous-Alexander    1628983     2023-24  7.419311  0.832237       0
2  Shai Gilgeous-Alexander    1628983     2022-23  7.996895  0.671564       0
3  Shai Gilgeous-Alexander    1628983     2021-22  8.774911  1.389956       0
4  Shai Gilgeous-Alexander    1628983     2020-21  9.731174  1.220499       0


In [None]:
def _build_trajectory_figure(trajectory_df, player_name, hover_cols):
    """Build the career-evolution line plot for a single player.

    Parameters:
        trajectory_df: rows for one player, already ordered by season, with
            'UMAP1', 'UMAP2' and 'GROUP_VALUE' columns plus `hover_cols`.
        player_name: name shown in the figure title.
        hover_cols: column names displayed in the hover tooltip.

    Returns:
        The plotly figure, with each point labelled by its GROUP_VALUE.
    """
    fig = px.line(
        trajectory_df,
        x='UMAP1',
        y='UMAP2',
        hover_name='GROUP_VALUE',
        hover_data=hover_cols,
        title=f'Career Style Evolution: {player_name}',
        template='plotly_dark',
        markers=True
    )
    season_points = zip(
        trajectory_df['GROUP_VALUE'], trajectory_df['UMAP1'], trajectory_df['UMAP2']
    )
    for season_label, x_pos, y_pos in season_points:
        fig.add_annotation(
            x=x_pos, y=y_pos,
            text=str(season_label),
            showarrow=False, yshift=7, font=dict(size=8)
        )
    return fig


# Draw the overall style map, then one career-trajectory plot per player.
if not plot_df.empty and 'UMAP1' in plot_df.columns:
    print("\n--- Generating Visualizations with Plotly ---")

    hover_data_cols = ['PLAYER_NAME', 'GROUP_VALUE'] + features_to_use

    if not X_imputed_df.empty and not identifiers.empty:
        if 'Cluster' in plot_df.columns:
            cols_for_full_plot_df = identifier_cols + ['UMAP1', 'UMAP2', 'Cluster']
        else:
            cols_for_full_plot_df = identifier_cols + ['UMAP1', 'UMAP2']

        full_plot_df = plot_df[cols_for_full_plot_df].copy()

        # Attach the unscaled (imputed) feature values for the hover tooltip.
        # `.values` copies by row position, because plot_df was re-indexed
        # from 0 while X_imputed_df keeps the original row labels.
        for feature in features_to_use:
            full_plot_df[feature] = X_imputed_df[feature].values

    else:
        print("Original features not available for hover data, using only identifiers.")
        full_plot_df = plot_df.copy()
        hover_data_cols = ['PLAYER_NAME', 'GROUP_VALUE']


    fig1_title = 'NBA Player Style Map (UMAP Projection)'
    if 'Cluster' in full_plot_df.columns:
        fig1_title += ' - Clustered'

    fig1 = px.scatter(
        full_plot_df,
        x='UMAP1',
        y='UMAP2',
        color='Cluster' if 'Cluster' in full_plot_df.columns else None,
        hover_name='PLAYER_NAME',
        hover_data=hover_data_cols,
        title=fig1_title,
        template='plotly_dark'
    )
    fig1.update_traces(marker=dict(size=8, opacity=0.8))
    fig1.show()

    print("\nGenerating individual player career trajectories...")
    for player_name in player_names_of_interest:
        # Sorting by the season label decides the order in which the
        # trajectory line connects the points.
        player_trajectory_df = full_plot_df[full_plot_df['PLAYER_NAME'] == player_name].sort_values(by='GROUP_VALUE')

        if not player_trajectory_df.empty:
            fig2 = _build_trajectory_figure(player_trajectory_df, player_name, hover_data_cols)
            fig2.show()
        else:
            print(f"No data to plot trajectory for {player_name}")

else:
    print("Plot DataFrame is empty. No visualizations generated.")


--- Generating Visualizations with Plotly ---



Generating individual player career trajectories...
