# NBA Player Sentiment Analysis

This notebook loads Reddit mentions of NBA players, scores them with VADER sentiment analysis, and merges the per-game sentiment with each player's game statistics.

In [None]:
# Import required libraries (the VADER lexicon is downloaded below if it is missing)
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os
import logging
import matplotlib.pyplot as plt
import seaborn as sns

# Send INFO-and-above records to the notebook output with a timestamped format
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# The analyzer needs the VADER lexicon on disk; fetch it only when NLTK cannot find it
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    print("Downloading VADER lexicon...")
    nltk.download('vader_lexicon')
    print("Download complete")
else:
    print("VADER lexicon already downloaded")

In [None]:
# Configuration

# Players to process, identified by the slug used in their data file names
PLAYER_SLUGS = [
    "anthony_edwards",
    "donovan_mitchell",
    "giannis_antetokounmpo",
    "jalen_brunson",
    "lebron_james",
    "luka_doncic",
    "shai_gilgeous-alexander",
    "stephen_curry",
]

# Seasons whose game logs are merged with the Reddit sentiment
SEASONS = ["2022", "2023", "2024"]

# Root of the data tree: the 'new' folder holding 'reddit_data' and 'player_stats'
BASE_DATA_DIR = 'data/new/'
REDDIT_DATA_DIR, PLAYER_STATS_DIR, OUTPUT_DIR = (
    os.path.join(BASE_DATA_DIR, subdir)
    for subdir in ('reddit_data', 'player_stats', 'processed_data')
)

# Processed CSVs are written here; create the folder up front so saves cannot fail on it
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One shared VADER analyzer, reused for every mention that gets scored
sia = SentimentIntensityAnalyzer()

## Load and Explore Reddit Data

Let's look at one player's Reddit data to understand the structure.

In [None]:
# Peek at one player's raw Reddit export to see which columns are available
sample_player = "lebron_james"
reddit_file_path = os.path.join(REDDIT_DATA_DIR, f"{sample_player}_reddit_mentions.csv")

try:
    sample_reddit_df = pd.read_csv(reddit_file_path)
    mention_total = len(sample_reddit_df)
    column_names = sample_reddit_df.columns.tolist()
    print(f"Loaded {mention_total} Reddit mentions for {sample_player}.")
    print(f"\nColumn names: {column_names}")
    display(sample_reddit_df.head(2))
except Exception as load_error:
    # Keep the notebook running when the sample file is absent or unreadable
    print(f"Error loading Reddit data for {sample_player}: {load_error}")

## Process Player Data Function

This function runs the full pipeline (load, score, aggregate, merge, save) for a single player and season.

In [None]:
def process_player_data(player_slug, target_season):
    """
    Build and save the per-game stats + Reddit sentiment table for one player-season.

    Pipeline:
      1. Load ``<player_slug>_reddit_mentions.csv`` from ``REDDIT_DATA_DIR``.
      2. Score every mention with VADER (``sia``) on title + body + comments.
      3. Aggregate the scores per ``game_date_reference``.
      4. Load ``season_<target_season>/<player_slug>_gamelog.csv`` from
         ``PLAYER_STATS_DIR``.
      5. Left-merge the aggregates onto the game log, so every game is kept.
      6. Write the merged table to ``OUTPUT_DIR`` as CSV.

    Args:
        player_slug: File slug of the player, e.g. ``"lebron_james"``.
        target_season: Season label used in the stats folder name, e.g. ``"2023"``.

    Returns:
        The merged DataFrame (game log columns plus ``game_date_reference``,
        ``mean_sentiment``, ``positive_sentiment_ratio``,
        ``negative_sentiment_ratio`` and an integer ``mention_count``), or
        ``None`` when an input file is missing, unreadable, empty, lacks a
        required column, or the game dates cannot be parsed. A failed CSV
        write is logged but the frame is still returned.
    """
    logging.info(f"--- Processing player: {player_slug} for season {target_season} ---")

    # --- 1. Load Reddit Data ---
    reddit_file_path = os.path.join(REDDIT_DATA_DIR, f"{player_slug}_reddit_mentions.csv")
    if not os.path.exists(reddit_file_path):
        logging.error(f"Reddit mentions file not found for {player_slug}: {reddit_file_path}")
        return None

    try:
        reddit_df = pd.read_csv(reddit_file_path)
        logging.info(f"Loaded {len(reddit_df)} Reddit mentions for {player_slug}.")
    except Exception as e:
        logging.error(f"Error loading Reddit data for {player_slug}: {e}")
        return None

    if reddit_df.empty:
        logging.warning(f"Reddit data for {player_slug} is empty. Skipping further processing for this player.")
        return None

    # --- 2. Sentiment Scoring ---
    # Title, body and comments are scored together as one text per mention.
    text_columns = ['post_title', 'post_body', 'scraped_comments_sample']
    for col in text_columns:
        if col not in reddit_df.columns:
            logging.warning(f"Column '{col}' not found in Reddit data for {player_slug}. Will use empty string.")
            reddit_df[col] = ""  # Add empty column if missing to prevent error

    # read_csv may parse a text column as numeric (e.g. titles that are all
    # digits); cast to str after blanking NaNs so ' '.join cannot raise TypeError.
    reddit_df['combined_text'] = (
        reddit_df[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
    )

    # Apply VADER sentiment analysis
    try:
        reddit_df['compound_sentiment'] = reddit_df['combined_text'].apply(
            lambda txt: sia.polarity_scores(str(txt))['compound']  # Ensure txt is string
        )
        logging.info(f"Calculated compound sentiment for {player_slug}.")
    except Exception as e:
        logging.error(f"Error during sentiment scoring for {player_slug}: {e}")
        reddit_df['compound_sentiment'] = 0.0  # Fallback to neutral

    # --- 3. Aggregate Sentiment per game_date_reference ---
    if 'game_date_reference' not in reddit_df.columns:
        logging.error(f"'game_date_reference' column missing in Reddit data for {player_slug}.")
        return None

    daily_sentiment = reddit_df.groupby('game_date_reference').agg(
        mean_sentiment=('compound_sentiment', 'mean'),
        positive_sentiment_ratio=('compound_sentiment', lambda x: (x > 0.05).mean()),  # Standard VADER threshold for positive
        negative_sentiment_ratio=('compound_sentiment', lambda x: (x < -0.05).mean()),  # Standard VADER threshold for negative
        mention_count=('compound_sentiment', 'size')
    ).reset_index()
    logging.info(f"Aggregated daily sentiment for {player_slug}.")

    # --- 4. Load Player Game Stats ---
    stats_file_path = os.path.join(PLAYER_STATS_DIR, f"season_{target_season}", f"{player_slug}_gamelog.csv")
    if not os.path.exists(stats_file_path):
        logging.error(f"Player stats file not found for {player_slug}, season {target_season}: {stats_file_path}")
        return None

    try:
        stats_df = pd.read_csv(stats_file_path)
        logging.info(f"Loaded {len(stats_df)} game log entries for {player_slug}, season {target_season}.")
    except Exception as e:
        logging.error(f"Error loading player stats for {player_slug}, season {target_season}: {e}")
        return None

    if 'GAME_DATE' not in stats_df.columns:
        logging.error(f"'GAME_DATE' column missing in stats data for {player_slug}, season {target_season}.")
        return None

    # --- 5. Merge Sentiment with Stats ---
    # Normalise GAME_DATE to 'YYYY-MM-DD' strings so it can be joined to the
    # Reddit key. NOTE(review): assumes the Reddit 'game_date_reference'
    # values are already 'YYYY-MM-DD' strings - confirm against the scraper.
    try:
        stats_df['game_date_reference'] = pd.to_datetime(stats_df['GAME_DATE'], format='%b %d, %Y').dt.strftime('%Y-%m-%d')
    except ValueError as e:
        logging.error(f"Error converting GAME_DATE format for {player_slug}: {e}. Check if format string is correct.")
        try:
            logging.warning(f"Attempting general date parsing for {player_slug} due to format error.")
            stats_df['game_date_reference'] = pd.to_datetime(stats_df['GAME_DATE']).dt.strftime('%Y-%m-%d')
        except Exception as e_gen:
            logging.error(f"General date parsing also failed for {player_slug}: {e_gen}. Cannot proceed with merge.")
            return None

    # Left merge: keep every game, attach sentiment where a date matches
    merged_df = pd.merge(stats_df, daily_sentiment, on='game_date_reference', how='left')

    # A merge with zero matches usually means the two date keys use different
    # formats; surface it instead of silently writing an all-zero table.
    if 'mention_count' in merged_df.columns and not merged_df['mention_count'].notna().any():
        logging.warning(
            f"No game dates matched any Reddit mentions for {player_slug}, season {target_season}. "
            f"Check that 'game_date_reference' uses the 'YYYY-MM-DD' format."
        )

    # Game days without Reddit mentions get neutral sentiment and zero mentions.
    fill_values = {
        'mean_sentiment': 0.0,  # Neutral sentiment
        'positive_sentiment_ratio': 0.0,
        'negative_sentiment_ratio': 0.0,
        'mention_count': 0      # Zero mentions
    }
    for col, default in fill_values.items():
        if col in merged_df.columns:
            merged_df[col] = merged_df[col].fillna(default)
        else:
            merged_df[col] = default

    # The left merge upcasts the count to float to hold NaN; restore integers.
    merged_df['mention_count'] = merged_df['mention_count'].astype(int)

    logging.info(f"Merged stats and sentiment for {player_slug}. Resulting shape: {merged_df.shape}")

    # --- 6. Save Processed Data ---
    output_file_path = os.path.join(OUTPUT_DIR, f"{player_slug}_stats_plus_sentiment_{target_season}.csv")
    try:
        merged_df.to_csv(output_file_path, index=False, encoding='utf-8-sig')
        logging.info(f"✅ Successfully saved processed data to {output_file_path}")
    except Exception as e:
        # Best effort: the in-memory result is still useful to the caller
        logging.error(f"Error saving processed data for {player_slug}: {e}")

    return merged_df

## Process All Players

Now let's process all players for all seasons.

In [None]:
# Merged frames keyed by "<slug>_<season>"; only successful runs are stored
all_players_merged_data = {}

for season in SEASONS:
    print(f"\n==== Processing Season {season} ====\n")

    for slug in PLAYER_SLUGS:
        processed_df = process_player_data(slug, season)

        # A None result means the pipeline bailed out for this player-season
        if processed_df is None:
            print(f"Processing failed or resulted in no data for {slug} (Season {season}).")
            continue

        all_players_merged_data[f"{slug}_{season}"] = processed_df
        print(f"Finished processing for {slug} (Season {season}).")

print("\n--- All players processed. ---")

## Data Visualization Examples

Let's create some visualizations of our processed data.

In [None]:
# Example 1: Sentiment vs. Performance for a single player
def plot_sentiment_vs_performance(player_slug, season, metric='PTS'):
    """
    Plot a performance metric and the mean Reddit sentiment game by game.

    Draws two stacked panels sharing the x-axis (the metric on top, mean
    sentiment below with positive/negative areas shaded) and prints the
    correlation between the two series.

    Args:
        player_slug: Player slug used as part of the ``all_players_merged_data`` key.
        season: Season label used as part of the ``all_players_merged_data`` key.
        metric: Name of the game-log column to plot against sentiment.

    Returns:
        None. Prints a message and returns early when the player-season has
        not been processed or the metric column is absent.
    """
    key = f"{player_slug}_{season}"
    if key not in all_players_merged_data:
        print(f"No data found for {player_slug} in season {season}")
        return

    df = all_players_merged_data[key].copy()

    # Report a missing metric the same way as a missing player instead of
    # letting a bare KeyError escape from the plotting call.
    if metric not in df.columns:
        print(f"Metric '{metric}' not found for {player_slug} in season {season}")
        return

    # The x-axis is a game counter, so rows must be chronological. The merged
    # key is a 'YYYY-MM-DD' string, which sorts correctly as text; game logs
    # are not guaranteed to arrive oldest-first.
    if 'game_date_reference' in df.columns:
        df = df.sort_values('game_date_reference', kind='stable').reset_index(drop=True)

    game_numbers = range(len(df))
    sentiment = df['mean_sentiment']

    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

    # Plot 1: Performance over time
    ax1.plot(game_numbers, df[metric], 'o-', color='blue', label=metric)
    ax1.set_ylabel(metric, fontsize=12)
    ax1.set_title(f"{player_slug.replace('_', ' ').title()}: {metric} per Game (Season {season})")
    ax1.legend()
    ax1.grid(True, linestyle='--', alpha=0.7)

    # Plot 2: Sentiment over time, shaded green above zero and red below
    ax2.plot(game_numbers, sentiment, 'o-', color='green', label='Mean Sentiment')
    ax2.fill_between(game_numbers, 0, sentiment,
                     where=(sentiment > 0), color='green', alpha=0.3)
    ax2.fill_between(game_numbers, 0, sentiment,
                     where=(sentiment < 0), color='red', alpha=0.3)
    ax2.set_ylabel('Sentiment Score', fontsize=12)
    ax2.set_xlabel('Game Number', fontsize=12)
    ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax2.set_title("Mean Reddit Sentiment Score")
    ax2.grid(True, linestyle='--', alpha=0.7)
    ax2.legend()

    plt.tight_layout()
    plt.show()

    # Correlation analysis. Games without mentions carry a sentiment of 0.0
    # from the merge step and are included here.
    corr = df[metric].corr(sentiment)
    print(f"Correlation between {metric} and sentiment: {corr:.4f}")

In [None]:
# Example 2: Compare sentiment across players
def compare_player_sentiments(season):
    """
    Draw side-by-side box plots of per-game mean sentiment for every player.

    Only players in ``PLAYER_SLUGS`` that have an entry for ``season`` in
    ``all_players_merged_data`` are included.

    Args:
        season: Season label used as part of the ``all_players_merged_data`` key.

    Returns:
        None. Prints a message and returns without creating a figure when no
        player has data for the season.
    """
    data = []
    labels = []

    for slug in PLAYER_SLUGS:
        key = f"{slug}_{season}"
        if key in all_players_merged_data:
            data.append(all_players_merged_data[key]['mean_sentiment'])
            labels.append(slug.replace('_', ' ').title())

    # Check before creating the figure so an empty season leaves no blank plot
    if not data:
        print(f"No data found for season {season}")
        return

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.boxplot(data, showfliers=True)

    # Boxes sit at positions 1..N by default. Labelling the ticks directly
    # avoids boxplot's `labels=` keyword, which newer Matplotlib deprecates.
    ax.set_xticks(range(1, len(labels) + 1))
    ax.set_xticklabels(labels, rotation=45, ha='right')

    ax.set_ylabel('Sentiment Score')
    ax.set_title(f"Sentiment Distribution Comparison (Season {season})")
    ax.axhline(y=0, color='red', linestyle='--', alpha=0.3)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()

In [None]:
# Demo: sentiment vs. points for one player-season.
# Run the processing cell first so the merged data exists.
# Swap in whichever player and season you want to inspect.
player_to_plot = "lebron_james"
season_to_plot = "2023"

try:
    plot_sentiment_vs_performance(player_to_plot, season_to_plot, metric='PTS')
except Exception as plot_error:
    print(f"Error plotting: {plot_error}")

In [None]:
# Demo: sentiment distributions of all processed players for one season.
# Run the processing cell first so the merged data exists.
try:
    compare_player_sentiments(season="2023")
except Exception as comparison_error:
    print(f"Error plotting comparison: {comparison_error}")

## Final Analysis and Insights

Here we can analyze the relationship between sentiment and performance metrics.

In [None]:
def calculate_correlations(player_slug, season):
    """
    Correlate mean Reddit sentiment with each available performance metric.

    Args:
        player_slug: Player slug used as part of the ``all_players_merged_data`` key.
        season: Season label used as part of the ``all_players_merged_data`` key.

    Returns:
        A DataFrame with columns ``Metric`` and ``Correlation``, sorted by
        correlation in descending order (metrics whose correlation is NaN
        come last), or ``None`` when the player-season has not been processed
        or none of the known metric columns exist.
    """
    key = f"{player_slug}_{season}"
    if key not in all_players_merged_data:
        print(f"No data found for {player_slug} in season {season}")
        return None

    df = all_players_merged_data[key]

    # Potential performance metrics
    potential_metrics = ['PTS', 'AST', 'REB', 'STL', 'BLK', 'TOV', 'PLUS_MINUS', 'MIN']
    available_metrics = [col for col in potential_metrics if col in df.columns]

    if not available_metrics:
        print(f"No known performance metrics found for {player_slug}")
        return None

    # Coerce each column to numeric before correlating: a text-typed metric
    # (e.g. minutes stored as "mm:ss") then yields NaN for that metric alone
    # instead of making the whole calculation fail.
    sentiment = pd.to_numeric(df['mean_sentiment'], errors='coerce')
    correlations = [
        {
            'Metric': metric,
            'Correlation': pd.to_numeric(df[metric], errors='coerce').corr(sentiment),
        }
        for metric in available_metrics
    ]

    corr_df = pd.DataFrame(correlations).sort_values('Correlation', ascending=False)
    return corr_df

In [None]:
# Demo: correlation table and bar chart for one player-season.
# Swap in whichever player and season you want to analyze.
player_to_analyze = "lebron_james"
season_to_analyze = "2023"

try:
    corr_results = calculate_correlations(player_to_analyze, season_to_analyze)

    # None means the player-season was not processed or has no known metrics
    if corr_results is not None:
        display_name = player_to_analyze.replace('_', ' ').title()
        print(f"Correlations between Reddit sentiment and performance metrics for {display_name} (Season {season_to_analyze}):")
        display(corr_results)

        # Bar chart of the coefficients: green for positive, red otherwise
        plt.figure(figsize=(10, 6))
        bar_colors = ['g' if value > 0 else 'r' for value in corr_results['Correlation']]
        plt.bar(corr_results['Metric'], corr_results['Correlation'], color=bar_colors)
        plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        plt.title(f"Correlation between Sentiment and Performance\n{display_name} (Season {season_to_analyze})")
        plt.ylabel('Correlation Coefficient')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
except Exception as analysis_error:
    print(f"Error analyzing correlations: {analysis_error}")