In [1]:
# Import necessary libraries
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from pymongo import MongoClient
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, MultiHeadAttention, LayerNormalization, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model




In [2]:
# MongoDB connection
# The connection string embeds the database username and password, so it is read
# from the MONGODB_URI environment variable instead of being written into the
# notebook. Any password that was previously committed here should be rotated.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "MONGODB_URI is not set. Export the MongoDB connection string "
        "(mongodb+srv://<user>:<password>@<host>/...) before running this notebook."
    )
client = MongoClient(mongo_uri)

# Connect to ED database
ed_db = client['ED']
# Every collection name in ED is treated as one team by the cells below.
all_teams = ed_db.list_collection_names()

print(f"Found {len(all_teams)} team collections in ED database")
metric = 'Current_PFF'  # Metric to predict


ServerSelectionTimeoutError: SSL handshake failed: cluster0-shard-00-02.4nbxj.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: cluster0-shard-00-00.4nbxj.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: cluster0-shard-00-01.4nbxj.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 691bbe26fd0b80d2948659eb, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('cluster0-shard-00-00.4nbxj.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-00.4nbxj.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('cluster0-shard-00-01.4nbxj.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-01.4nbxj.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('cluster0-shard-00-02.4nbxj.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: cluster0-shard-00-02.4nbxj.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1129) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [None]:
# Per-player stat columns that get a snap-weighted average per team and year.
# Order matters: it becomes the column order of the aggregated DataFrame.
numeric_fields = [
    'assists',
    'batted_passes',
    'forced_fumbles',
    'fumble_recoveries',
    # grades
    'grades_defense',
    'grades_defense_penalty',
    'grades_pass_rush_defense',
    'grades_run_defense',
    'grades_tackle',
    'hits',
    'hurries',
    'missed_tackle_rate',
    'missed_tackles',
    'penalties',
    'sacks',
    # snap counts by alignment / play type
    'snap_counts_pass_rush',
    'snap_counts_run_defense',
    'snap_counts_dl',
    'snap_counts_dl_outside_t',
    'snap_counts_dl_over_t',
    'stops',
    'tackles',
    'tackles_for_loss',
    'total_pressures',
]

print("Numeric fields to aggregate:", numeric_fields)


In [None]:
# Function to calculate weighted average with snap count weighting formula
# Players with fewer snaps are weighted less using a square root formula
def calculate_weighted_average(players, field, min_snaps=50):
    """
    Snap-weighted average of ``field`` across a list of player records.

    Each qualifying player contributes with weight ``sqrt(snap_counts_defense)``,
    which damps the influence of low-snap players without ignoring them.

    Args:
        players: Iterable of dict-like records (anything with ``.get``).
        field: Key of the stat to average.
        min_snaps: Players with fewer defensive snaps than this are ignored.

    Returns:
        The weighted average, or 0 when no player qualifies.

    A player is skipped when the snap count or the value is missing, cannot be
    converted to float, is not finite, is below ``min_snaps`` or is not positive.
    """
    weighted_sum = 0.0
    total_weight = 0.0

    for player in players:
        raw_snaps = player.get('snap_counts_defense')
        raw_value = player.get(field)
        if raw_snaps is None or raw_value is None:
            continue

        # Coerce BEFORE comparing against min_snaps: values stored as strings
        # would otherwise raise TypeError on the `<` comparison.
        try:
            snap_count = float(raw_snaps)
            value = float(raw_value)
        except (ValueError, TypeError):
            continue

        if not (np.isfinite(snap_count) and np.isfinite(value)):
            continue
        if snap_count < min_snaps or snap_count <= 0:
            continue

        # Square root of snap count as weight reduces the impact of low-snap players
        weight = np.sqrt(snap_count)
        weighted_sum += value * weight
        total_weight += weight

    if total_weight > 0:
        return weighted_sum / total_weight
    return 0

# Pull every edge-defender document out of MongoDB and bucket it by (team, year)
print("\nCollecting player data from MongoDB...")
team_year_data = defaultdict(list)

# Only documents that carry a Year and are tagged with the ED position
edge_player_query = {
    'Year': {'$exists': True},
    'position': 'ED'
}

for collection_name in all_teams:
    for player_doc in ed_db[collection_name].find(edge_player_query):
        # The team label is read from 'Team', falling back to 'team_name'
        club = player_doc.get('Team') or player_doc.get('team_name')
        season = player_doc.get('Year')

        # Documents without a usable team or year cannot be bucketed
        if not club or season is None:
            continue

        team_year_data[(club, season)].append(player_doc)

print(f"Collected data for {len(team_year_data)} team-year combinations")


In [None]:
# Collapse each (team, year) bucket of players into one row of snap-weighted averages
print("\nCalculating weighted averages with snap count weighting...")
weighted_results = []

for (team, year), players in team_year_data.items():
    if not players:
        continue

    # Total defensive snaps over every player that reports a snap count
    reported_snaps = [p.get('snap_counts_defense') for p in players]
    total_snaps = sum(float(snaps) for snaps in reported_snaps if snaps is not None)

    # A bucket with no recorded snaps carries no information
    if total_snaps == 0:
        continue

    # Identifying columns first, then one column per aggregated stat
    result = {
        'Team': team,
        'Year': year,
        'Position': 'ED',
        'snap_counts_defense': total_snaps,
        'player_count': len(players)  # counts every player, including those below the snap threshold
    }
    result.update(
        (field, calculate_weighted_average(players, field))
        for field in numeric_fields
    )

    # Current_PFF mirrors grades_defense (kept under both names for compatibility)
    result['Current_PFF'] = result.get('grades_defense', 0)

    weighted_results.append(result)

print(f"Calculated weighted averages for {len(weighted_results)} team-year combinations")

# One row per team-year, ordered by team and then season
ed_df = pd.DataFrame(weighted_results).sort_values(by=['Team', 'Year'])
print(f"\nDataFrame shape: {ed_df.shape}")
print(ed_df.head())


In [None]:
# Attach Previous_<field> columns: each row receives the values of the same
# team's preceding row (ordered by Year).
print("\nCreating Previous_ columns...")

# Bucket the aggregated rows per team, then order every bucket chronologically
team_results = defaultdict(list)
for row in weighted_results:
    team_results[row['Team']].append(row)

for season_rows in team_results.values():
    season_rows.sort(key=lambda row: row['Year'])

# Fields copied forward under a Previous_ prefix
fields_to_shift = numeric_fields + ['snap_counts_defense', 'grades_defense', 'Current_PFF']

# Rebuild weighted_results, now carrying the Previous_ values
weighted_results = []
for season_rows in team_results.values():
    # Pair each row with its predecessor; a team's first row has none and so
    # gets no Previous_ keys (they appear as NaN in the DataFrame).
    # The predecessor is simply the nearest earlier row that exists; it is
    # not checked to be exactly one year earlier.
    for earlier, current in zip(season_rows, season_rows[1:]):
        for field in fields_to_shift:
            carried = earlier.get(field)
            current[f'Previous_{field}'] = 0 if carried is None else carried
    weighted_results.extend(season_rows)

# Convert back to DataFrame
ed_df = pd.DataFrame(weighted_results).sort_values(by=['Team', 'Year'])

print("Previous_ columns created")
print(f"DataFrame columns: {list(ed_df.columns)}")


In [None]:
# Prepare sequences for transformer model
sequences = []
targets = []
team_data = ed_df.groupby('Team')

# Edge-specific features for the sequence
feature_cols = [
    'Previous_stops',
    'Previous_total_pressures',
    'Previous_hurries',
    'Previous_snap_counts_pass_rush',
    'Previous_hits',
    'Previous_sacks',
    'Previous_batted_passes'
]

# The column set is identical for every team, so verify it once up front
# instead of once per window.
missing_cols = [col for col in feature_cols if col not in ed_df.columns]
if missing_cols:
    raise KeyError(f"ed_df is missing required feature columns: {missing_cols}")

for team, group in team_data:
    # A window needs 3 feature rows plus a 4th row that supplies the target
    if len(group) >= 4:
        print(f"Processing team: {team}, data length: {len(group)}")

        for i in range(len(group) - 3):
            # Three consecutive rows of features ...
            sequence = group.iloc[i:i+3][feature_cols]

            # ... predict the following row's Current_PFF
            target = group.iloc[i+3]['Current_PFF']

            # Rows lacking Previous_ values (e.g. a team's first row) show up
            # as NaN; windows containing them are dropped.
            if not sequence.isnull().any().any() and not pd.isnull(target):
                sequences.append(sequence.values)
                targets.append(target)

if not sequences:
    raise ValueError(
        "No complete 3-row sequences could be built; "
        "check that teams have enough seasons with non-missing features."
    )

# Convert lists to numpy arrays
X = np.array(sequences)
y = np.array(targets)

print(f"\nTotal sequences generated: {len(sequences)}")
print(f"X shape: {X.shape}, y shape: {y.shape}")

# Split BEFORE scaling so the scaler never sees test-set statistics.
# NOTE(review): windows overlap and the split is random, so the same team-year
# row can still appear in both a training and a test window.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it everywhere.
# Sequences are flattened to (rows, features) for scaling and reshaped back.
n_features = X.shape[-1]
scaler = StandardScaler()
scaler.fit(X_train.reshape(-1, n_features))

X_train = scaler.transform(X_train.reshape(-1, n_features)).reshape(X_train.shape)
X_test = scaler.transform(X_test.reshape(-1, n_features)).reshape(X_test.shape)
# X keeps its original row order (used later to predict on every sequence)
X = scaler.transform(X.reshape(-1, n_features)).reshape(X.shape)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


In [None]:
# Build Pure Transformer Model (no RNN)
def build_transformer_model(input_shape, d_model=128, num_heads=8, num_layers=4, ff_dim=256, dropout_rate=0.1):
    """
    Build a pure transformer encoder model for time series prediction.
    No RNN layers - uses only attention mechanisms.

    Args:
        input_shape: ``(timesteps, features)`` of one input sequence.
        d_model: Width of the token representation inside the encoder.
        num_heads: Attention heads per layer (each with key_dim = d_model // num_heads).
        num_layers: Number of stacked encoder layers.
        ff_dim: Hidden width of each position-wise feed-forward network.
        dropout_rate: Dropout applied after attention and feed-forward sub-layers.

    Returns:
        An uncompiled Keras ``Model`` mapping a sequence to a single scalar.
    """
    inputs = Input(shape=input_shape)  # (timesteps, features)

    # Project input to d_model dimensions
    x = Dense(d_model)(inputs)

    # Positional encoding (fixed sinusoidal).
    # Every layer below acts per position or via self-attention, and the result
    # is averaged over time, so without this term the output would not depend
    # on the order of the timesteps at all.
    timesteps = input_shape[0]
    if timesteps is not None:  # variable-length inputs keep the order-agnostic behaviour
        positions = np.arange(timesteps, dtype=np.float32)[:, np.newaxis]
        dims = np.arange(d_model)[np.newaxis, :]
        angle_rates = 1.0 / np.power(10000.0, (2 * (dims // 2)) / float(d_model))
        angles = positions * angle_rates
        pos_encoding = np.zeros((timesteps, d_model), dtype=np.float32)
        pos_encoding[:, 0::2] = np.sin(angles[:, 0::2])  # even channels
        pos_encoding[:, 1::2] = np.cos(angles[:, 1::2])  # odd channels
        # Shape (timesteps, d_model) broadcasts over the batch dimension
        x = x + tf.constant(pos_encoding)

    # Stack transformer encoder layers
    for i in range(num_layers):
        # Multi-head self-attention
        attention_output = MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads
        )(x, x)

        # Add & Norm (residual connection + layer norm)
        attention_output = Dropout(dropout_rate)(attention_output)
        x = LayerNormalization()(x + attention_output)

        # Feed-forward network
        ffn_output = Dense(ff_dim, activation='relu')(x)
        ffn_output = Dense(d_model)(ffn_output)
        ffn_output = Dropout(dropout_rate)(ffn_output)

        # Add & Norm (residual connection + layer norm)
        x = LayerNormalization()(x + ffn_output)

    # Global average pooling across time steps
    x = GlobalAveragePooling1D()(x)

    # Final dense layers for prediction
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(32, activation='relu')(x)

    # Output layer: single regression value
    outputs = Dense(1)(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

# Instantiate the network for the (timesteps, features) layout of the training tensor
input_shape = X_train.shape[1], X_train.shape[2]
model = build_transformer_model(
    input_shape,
    d_model=128,
    num_heads=8,
    num_layers=4,
)

# Regression setup: minimise MSE with Adam, report MAE alongside
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=adam_optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])

# Show layer-by-layer structure and parameter counts
model.summary()


In [None]:
# Early stopping: watch validation loss, allow 20 epochs without improvement,
# then stop and roll back to the best weights seen during that fit() call.
early_stopping_options = {
    'monitor': 'val_loss',
    'patience': 20,
    'restore_best_weights': True,
    'verbose': 1,
}
early_stopping = EarlyStopping(**early_stopping_options)


In [None]:
# Train a freshly initialised model for each epoch budget and compare the runs
epoch_list = [5, 10, 15, 20, 25, 30, 50, 100, 250]
best_r2 = -np.inf
best_epoch = 0
best_weights = None  # weights of the run with the highest test R² so far

for e in epoch_list:
    print(f"\n{'='*60}")
    print(f"Training with {e} epochs...")
    print(f"{'='*60}")

    # Start every run from new weights. Reusing one model would make training
    # cumulative (5, then +10, then +15, ...), so "results after e epochs" and
    # the best-epoch comparison would be mislabelled.
    model = build_transformer_model(input_shape, d_model=128, num_heads=8, num_layers=4)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss='mean_squared_error',
        metrics=['mean_absolute_error']
    )

    # NOTE(review): validation_data is the test split, so early stopping and the
    # best-run choice below both see test data; treat the test metrics as optimistic.
    history = model.fit(
        X_train, y_train,
        epochs=e,
        batch_size=8,
        validation_data=(X_test, y_test),
        callbacks=[early_stopping],
        verbose=1
    )

    # Make predictions (flattened from (n, 1) to (n,) to match y)
    y_pred_train = model.predict(X_train, verbose=0).ravel()
    y_pred_test = model.predict(X_test, verbose=0).ravel()

    # Calculate metrics
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    print(f"\nResults after {e} epochs:")
    print(f"  Training R²: {train_r2:.4f}")
    print(f"  Test R²: {test_r2:.4f}")
    print(f"  Test MAE: {test_mae:.4f}")
    print(f"  Test RMSE: {test_rmse:.4f}")

    # Track best model and keep a copy of its weights
    if test_r2 > best_r2:
        best_r2 = test_r2
        best_epoch = e
        best_weights = model.get_weights()

    fig, (loss_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: training & validation loss per epoch
    loss_ax.plot(history.history['loss'], label='Training Loss', marker='o')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss', marker='s')
    loss_ax.set_title(f'Training vs Validation Loss - {e} Epochs')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    # Right panel: actual vs predicted, with the y = x reference line
    scatter_ax.scatter(y_test, y_pred_test, alpha=0.6, label=f'R² = {test_r2:.4f}')
    scatter_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    scatter_ax.set_xlabel('Actual Current_PFF')
    scatter_ax.set_ylabel('Predicted Current_PFF')
    scatter_ax.set_title(f'Actual vs Predicted - {e} Epochs')
    scatter_ax.legend()
    scatter_ax.grid(True)

    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; one is created per run

# Leave `model` holding the best run so later cells predict with it
if best_weights is not None:
    model.set_weights(best_weights)

print(f"\n{'='*60}")
print(f"Best Test R²: {best_r2:.4f} at {best_epoch} epochs")
print(f"{'='*60}")


In [None]:
# Make final predictions on all data and label each one with its team and year
y_pred_all = model.predict(X, verbose=0)

# Rebuild the (team, year) label of every sequence, in the same order as X.
# The NaN filter must mirror the one used when X was built: windows that were
# dropped there (e.g. those touching a team's first row, which has no
# Previous_ values) must be dropped here too, or labels and predictions
# drift out of alignment.
teams_years = []
for team, group in ed_df.groupby('Team'):
    if len(group) >= 4:
        for i in range(len(group) - 3):
            window = group.iloc[i:i+3][feature_cols]
            target_row = group.iloc[i+3]
            if window.isnull().any().any() or pd.isnull(target_row['Current_PFF']):
                continue
            teams_years.append((target_row['Team'], target_row['Year']))

if len(teams_years) != len(y_pred_all):
    raise ValueError(
        f"Sequence labels ({len(teams_years)}) do not line up with "
        f"predictions ({len(y_pred_all)}); rebuild X from the current ed_df."
    )

# One record per sequence: target team/year, actual value, predicted value
viz_data = [
    {
        'Team': seq_team,
        'Year': seq_year,
        'Actual_PFF': actual,
        'Predicted_PFF': predicted
    }
    for (seq_team, seq_year), actual, predicted in zip(teams_years, y, y_pred_all[:, 0])
]

viz_df_plot = pd.DataFrame(viz_data)
print("\nVisualization data created")
print(viz_df_plot.head())


In [None]:
# Team nickname -> NFL division, used to group the per-division charts.
# Written division-first for readability, then inverted; sorting by nickname
# keeps the mapping in alphabetical team order.
_TEAMS_BY_DIVISION = {
    "AFC East": ["Bills", "Dolphins", "Jets", "Patriots"],
    "AFC North": ["Bengals", "Browns", "Ravens", "Steelers"],
    "AFC South": ["Colts", "Jaguars", "Texans", "Titans"],
    "AFC West": ["Broncos", "Chargers", "Chiefs", "Raiders"],
    "NFC East": ["Commanders", "Cowboys", "Eagles", "Giants"],
    "NFC North": ["Bears", "Lions", "Packers", "Vikings"],
    "NFC South": ["Buccaneers", "Falcons", "Panthers", "Saints"],
    "NFC West": ["49ers", "Cardinals", "Rams", "Seahawks"],
}

divisions = dict(sorted(
    (nickname, division_name)
    for division_name, nicknames in _TEAMS_BY_DIVISION.items()
    for nickname in nicknames
))

# Season whose predictions are charted
TARGET_YEAR = 2022

# Match predictions to the TARGET_YEAR row of each team
pred_2022 = []
actual_2022 = []
teams_2022 = []

for team, group in ed_df.groupby('Team'):
    if len(group) < 4:
        continue

    group_sorted = group.sort_values('Year')
    year_rows = group_sorted[group_sorted['Year'] == TARGET_YEAR]
    if year_rows.empty:
        continue
    idx_2022 = year_rows.index[0]

    # Find the sequence whose target is this team's TARGET_YEAR row.
    # The loop variables are deliberately NOT named `y`: that would overwrite
    # the global targets array `y` with a year value.
    for seq_idx, (seq_team, seq_year) in enumerate(teams_years):
        if seq_team == team and seq_year == TARGET_YEAR and seq_idx < len(y_pred_all):
            pred_2022.append(y_pred_all[seq_idx][0])
            actual_2022.append(group_sorted.loc[idx_2022, 'Current_PFF'])
            teams_2022.append(team)
            break

if len(pred_2022) > 0:
    data_2022_viz = pd.DataFrame({
        'Team': teams_2022,
        'Current_PFF': actual_2022,
        'Predicted_PFF': pred_2022
    })
    # Teams missing from the divisions mapping get NaN and are left out of the charts
    data_2022_viz["Division"] = data_2022_viz["Team"].map(divisions)

    # Plot by division
    unique_divisions = data_2022_viz["Division"].dropna().unique()
    n_divisions = len(unique_divisions)

    if n_divisions == 0:
        # plt.subplots cannot create a grid with zero rows
        print("None of the matched teams appear in the divisions mapping; nothing to plot")
    else:
        n_cols = 2
        n_rows = (n_divisions + n_cols - 1) // n_cols  # ceiling division

        # squeeze=False always yields a 2-D array, so flatten() works for any grid size
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
        axes = axes.flatten()

        fig.suptitle(f'Actual vs Predicted Current PFF by Division ({TARGET_YEAR}) - Transformer Model', fontsize=16)

        for idx, division in enumerate(unique_divisions):
            division_data = data_2022_viz[data_2022_viz["Division"] == division]
            bar_positions = np.arange(len(division_data))
            width = 0.35

            # Side-by-side bars per team: actual on the left, predicted on the right
            ax = axes[idx]
            ax.bar(bar_positions - width/2, division_data['Current_PFF'], width, label='Actual', color='#19D373')
            ax.bar(bar_positions + width/2, division_data['Predicted_PFF'], width, label='Predicted', color='#7606FC')

            ax.set_xticks(bar_positions)
            ax.set_xticklabels(division_data['Team'], rotation=45, ha='right')
            ax.set_title(division)
            ax.set_xlabel('Teams')
            ax.set_ylabel('Current PFF')
            ax.legend()
            ax.grid(True, alpha=0.3)

        # Hide empty subplots (odd number of divisions leaves one spare panel)
        for idx in range(n_divisions, len(axes)):
            axes[idx].set_visible(False)

        # Leave room at the top for the suptitle
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()
else:
    print(f"No {TARGET_YEAR} data found for visualization")

# Close MongoDB connection
client.close()
print("\n✅ MongoDB connection closed")
