In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory holding the four CSV exports. It defaults to the original
# '/content' location, but can be redirected with the DATA_DIR environment
# variable so the notebook also runs where that directory does not exist.
DATA_DIR = Path(os.environ.get('DATA_DIR', '/content'))

path_coaches_season = DATA_DIR / 'Modified_coaches_season.csv'
path_coaches_career = DATA_DIR / 'Modified_coaches_career.csv'
path_teams = DATA_DIR / 'Modified_teams.csv'
path_team_season = DATA_DIR / 'Modified_team_season.csv'

# Fail early, naming every missing file at once, instead of stopping at the
# first read_csv call with no hint about how to fix it.
missing_files = [
    str(path)
    for path in (path_coaches_season, path_coaches_career, path_teams, path_team_season)
    if not path.exists()
]
if missing_files:
    raise FileNotFoundError(
        f"Missing input file(s): {missing_files}. "
        "Place them in the data directory or set the DATA_DIR environment variable."
    )

coaches_season_df = pd.read_csv(path_coaches_season)
coaches_career_df = pd.read_csv(path_coaches_career)
teams_df = pd.read_csv(path_teams)
team_season_df = pd.read_csv(path_team_season)

coaches_season_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/Modified_coaches_season.csv'

In [None]:
# Preview the first rows of the coaches' career table loaded above.
coaches_career_df.head()

In [None]:
# Preview the first rows of the teams table loaded above.
teams_df.head()

In [None]:
# Preview the first rows of the per-season team table loaded above.
team_season_df.head()

In [None]:
# Calculate per-game stats for teams from the team_season_df

# We're interested in the offensive ('o_') and defensive ('d_') totals, so
# identifiers and summary stats like 'won' and 'lost' are excluded.
# 3-point stats are skipped as they might not be available for all historical
# seasons. Columns already ending in '_per_game' are skipped as well, so that
# re-running this cell does not create '*_per_game_per_game' columns.
stats_columns = [
    col for col in team_season_df.columns
    if col.startswith(('o_', 'd_'))
    and not col.endswith(('3pm', '3pa', '_per_game'))
]

# Games played = wins + losses.
games_played = (team_season_df['won'] + team_season_df['lost']).rename('games_played')

# Divide every total by games played. Rows with zero games get NaN rather than
# +/-inf, because inf is not caught by NaN handling and breaks model fitting.
per_game_stats = (
    team_season_df[stats_columns]
    .div(games_played.where(games_played != 0), axis=0)
    .add_suffix('_per_game')
)

# Attach all derived columns in one concat (drop any left over from an earlier
# run of this cell first), keeping the original column order:
# source columns, 'games_played', then the per-game stats.
team_season_df = pd.concat(
    [
        team_season_df.drop(columns=['games_played', *per_game_stats.columns], errors='ignore'),
        games_played,
        per_game_stats,
    ],
    axis=1,
)

# Select the relevant columns for the model, which includes the per-game stats and excludes the total stats.
per_game_stats_columns = [col for col in team_season_df.columns if 'per_game' in col]
team_per_game_stats_df = team_season_df[['team', 'year'] + per_game_stats_columns]

# Display the first few rows of the per-game stats dataframe
team_per_game_stats_df.head()


In [None]:
# Attach each coach's season win percentage to the team per-game stats.

# Win percentage = wins / (wins + losses), stored on coaches_season_df.
games_coached = coaches_season_df['season_win'] + coaches_season_df['season_loss']
coaches_season_df['win_percentage'] = coaches_season_df['season_win'] / games_coached

# Only the join keys, the coach id and the win percentage are needed downstream.
coaches_win_percentage_df = coaches_season_df.loc[:, ['coachid', 'team', 'year', 'win_percentage']]

# Left-join onto the team stats so every team-season is kept, then collapse
# seasons with several coaches to a single row by keeping the last coach row
# in the merged order.
team_stats_with_coach_df = (
    team_per_game_stats_df
    .merge(coaches_win_percentage_df, how='left', on=['team', 'year'])
    .drop_duplicates(subset=['team', 'year'], keep='last')
)

# Show the first rows of the merged result.
team_stats_with_coach_df.head()


In [None]:
# Correct the function to exclude non-numeric columns before performing subtraction

def create_matchup_features(team_stats_df, team1, team2, year):
    """Build a one-row feature frame describing ``team1`` versus ``team2`` in ``year``.

    Parameters
    ----------
    team_stats_df : pandas.DataFrame
        Must contain 'team', 'year' and 'win_percentage' columns; every numeric
        column is used as a stat.
    team1, team2 :
        Values matched against the 'team' column.
    year :
        Value matched against the 'year' column.

    Returns
    -------
    pandas.DataFrame or None
        A single-row frame holding, for every numeric column, the value for
        ``team1`` minus the value for ``team2``, plus 'target' (1 if ``team1``
        has the strictly higher win percentage, else 0), 'team1', 'team2' and
        'year'. ``None`` is returned when either team has no row for ``year``
        or when either win percentage is missing, since no label can be
        derived in that case.

    Notes
    -----
    The returned 'win_percentage' column is the difference whose sign defines
    'target'; it must be dropped before the frame is used to train a model.
    """
    in_year = team_stats_df['year'] == year
    team1_stats = team_stats_df[in_year & (team_stats_df['team'] == team1)]
    team2_stats = team_stats_df[in_year & (team_stats_df['team'] == team2)]

    # If we don't have data for both teams for that year, return None
    if team1_stats.empty or team2_stats.empty:
        return None

    # A missing win percentage would make the comparison below evaluate to
    # False and silently produce a 0 label, so treat it as "no data" instead.
    team1_win_pct = team1_stats['win_percentage'].iloc[0]
    team2_win_pct = team2_stats['win_percentage'].iloc[0]
    if pd.isna(team1_win_pct) or pd.isna(team2_win_pct):
        return None

    # Exclude non-numeric columns before subtraction; 'number' covers every
    # integer/float width, not only float64 and the platform default int.
    numeric_columns = team_stats_df.select_dtypes(include='number').columns
    team1_numeric_stats = team1_stats[numeric_columns].iloc[0]
    team2_numeric_stats = team2_stats[numeric_columns].iloc[0]

    # Calculate the difference in stats - this will be our feature set
    matchup_features = team1_numeric_stats - team2_numeric_stats

    # Add a target variable: 1 if team1 has a higher win percentage, 0 otherwise
    matchup_features['target'] = int(team1_win_pct > team2_win_pct)

    # Return the matchup features with team identifiers and year for reference.
    # 'year' is overwritten here because its difference above is always 0.
    matchup_features = matchup_features.to_frame().transpose()  # Convert to dataframe for merging
    matchup_features['team1'] = team1
    matchup_features['team2'] = team2
    matchup_features['year'] = year

    return matchup_features

# Build the matchup table for a single season as a quick demonstration.
# Pairing every team with every other team is computationally intensive, so
# only one year is processed here.
year = 2003
teams = team_stats_with_coach_df[team_stats_with_coach_df['year'] == year]['team'].unique()

# Collect the one-row frames in a list and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated
# rows on every iteration and starts from an empty frame.
single_year_matchups = []
for i, team1 in enumerate(teams):
    for team2 in teams[i+1:]:  # Create matchups only once for each pair
        matchup_features = create_matchup_features(team_stats_with_coach_df, team1, team2, year)
        if matchup_features is not None:
            single_year_matchups.append(matchup_features)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# chosen year produced no matchups.
matchup_features_df = (
    pd.concat(single_year_matchups, ignore_index=True)
    if single_year_matchups
    else pd.DataFrame()
)

matchup_features_df.head()


In [None]:
# Build matchup features for every season: for each year, pair every team with
# every later team in that year's team list, and gather the resulting one-row
# frames in a list that is concatenated once at the end.

# Every season present in the merged stats table
years = team_stats_with_coach_df['year'].unique()

# One-row matchup frames accumulated across all seasons
all_matchup_features = []

for year in years:
    season_mask = team_stats_with_coach_df['year'] == year
    teams = team_stats_with_coach_df[season_mask]['team'].unique()

    # Each unordered pair is generated exactly once (team2 always follows team1)
    season_candidates = (
        create_matchup_features(team_stats_with_coach_df, team1, team2, year)
        for i, team1 in enumerate(teams)
        for team2 in teams[i+1:]
    )
    # Pairs without usable data come back as None and are skipped
    all_matchup_features.extend(
        candidate for candidate in season_candidates if candidate is not None
    )

# Stack everything into a single dataframe with a fresh index
all_matchup_features_df = pd.concat(all_matchup_features, ignore_index=True)

# Show the (rows, columns) shape of the combined table
all_matchup_features_df.shape


In [None]:
# Display the combined matchup feature table built from all years.
all_matchup_features_df

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Columns that must not be used as predictors: the identifiers, the label
# itself, and the 'win_percentage' difference. The label is defined as
# "team1 win percentage > team2 win percentage", i.e. the sign of that
# difference, so keeping it in X would leak the answer to the model.
non_feature_columns = ['team1', 'team2', 'year', 'target', 'win_percentage']

# Check for NaN values in the dataset
nan_columns = all_matchup_features_df.columns[all_matchup_features_df.isna().any()].tolist()

# Handle NaN values by filling them with the median value of the column.
# Assign the result back to the frame: `df[col].fillna(..., inplace=True)`
# operates on a temporary and does not update the frame under pandas
# Copy-on-Write.
for column in nan_columns:
    median_value = all_matchup_features_df[column].median()
    all_matchup_features_df[column] = all_matchup_features_df[column].fillna(median_value)

# Verify if all NaN values are handled (a column that is entirely NaN has a
# NaN median and therefore stays unfilled).
nan_columns_after = all_matchup_features_df.columns[all_matchup_features_df.isna().any()].tolist()
if nan_columns_after:
    # Stop here with a clear message; otherwise the following cells would fail
    # later with a NameError on the missing model outputs.
    raise ValueError(f"Columns still containing NaN values after median imputation: {nan_columns_after}")

# Prepare the features and target variable for the model
X_recent = all_matchup_features_df.drop(columns=non_feature_columns)  # Features
y_recent = all_matchup_features_df['target']  # Target variable

# Splitting the dataset into training and testing sets
X_train_recent, X_test_recent, y_train_recent, y_test_recent = train_test_split(
    X_recent, y_recent, test_size=0.2, random_state=42
)

# Initialize and train the Logistic Regression model
model_recent = LogisticRegression(max_iter=1000)
model_recent.fit(X_train_recent, y_train_recent)

# Predict on the test set
y_pred_recent = model_recent.predict(X_test_recent)

# Evaluate the model on the held-out matchups
accuracy_recent = accuracy_score(y_test_recent, y_pred_recent)
classification_rep_recent = classification_report(y_test_recent, y_pred_recent)

# Print the results explicitly so they are shown wherever this code sits in the cell.
print(f"Accuracy: {accuracy_recent:.3f}")
print(classification_rep_recent)

    
    
# TODO: It would be nice to pick two teams at random and predict the outcome of their matchup - not sure if that's the goal.

In [None]:
# Plot the confusion matrix of the test-set predictions.
# matplotlib.pyplot is imported here because `plt` is used below but was never
# imported anywhere in the notebook, which made this cell fail with NameError.
import matplotlib.pyplot as plt
from sklearn import metrics

# Confusion counts for true vs. predicted labels on the test split.
confusion_matrix = metrics.confusion_matrix(y_test_recent, y_pred_recent)

cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

cm_display.plot()
plt.show()

In [None]:
# Show the test-set accuracy computed in the model training cell.
accuracy_recent

In [None]:
# Show the classification report string computed in the model training cell.
classification_rep_recent