In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
# Load datasets
train_2007 = pd.read_csv('matchups-2007.csv')
train_2008 = pd.read_csv('matchups-2008.csv')
train_2009 = pd.read_csv('matchups-2009.csv')
train_2010 = pd.read_csv('matchups-2010.csv')
train_2011 = pd.read_csv('matchups-2011.csv')
train_2012 = pd.read_csv('matchups-2012.csv')
train_2013 = pd.read_csv('matchups-2013.csv')
train_2014 = pd.read_csv('matchups-2014.csv')
train_2015 = pd.read_csv('matchups-2015.csv')
test_data = pd.read_csv('NBA_test.csv')
test_labels = pd.read_csv('NBA_test_labels.csv')

In [3]:
# Combine training data
train_data = pd.concat([train_2007, train_2008, train_2009, train_2010, train_2011, train_2012, train_2013, train_2014, train_2015], ignore_index=True)


In [4]:
# Define allowed features (excluding 'game' column as it is not a numerical feature)
allowed_features = ['season', 'home_team', 'away_team', 'starting_min', 'home_0', 'home_1', 'home_2', 'home_3', 'home_4', 'away_0', 'away_1', 'away_2', 'away_3', 'away_4']
train_data = train_data[allowed_features]

In [5]:
# Prepare test data
test_data = test_data[['game', 'season', 'home_team', 'away_team', 'starting_min', 'home_0', 'home_1', 'home_2', 'home_3', 'away_0', 'away_1', 'away_2', 'away_3', 'away_4']]


KeyError: "['game'] not in index"

In [6]:
# Check available columns in test dataset
print("Columns in test_data:", test_data.columns.tolist())

Columns in test_data: ['season', 'home_team', 'away_team', 'starting_min', 'home_0', 'home_1', 'home_2', 'home_3', 'home_4', 'away_0', 'away_1', 'away_2', 'away_3', 'away_4']


In [7]:
# Ensure required columns exist in test_data
required_test_features = ['season', 'home_team', 'away_team', 'starting_min', 'home_0', 'home_1', 'home_2', 'home_3', 'away_0', 'away_1', 'away_2', 'away_3', 'away_4']
test_data = test_data[[col for col in required_test_features if col in test_data.columns]]


In [8]:
# Preserve game column if it exists, otherwise create a placeholder
if 'game' in test_data.columns:
    test_games = test_data[['game', 'home_team']].copy()
else:
    test_games = pd.DataFrame({'game': [f'game_{i}' for i in range(len(test_data))], 'home_team': test_data['home_team']})


In [9]:
# Encode categorical variables
label_encoders = {}
for col in ['home_team', 'away_team', 'home_0', 'home_1', 'home_2', 'home_3', 'home_4', 'away_0', 'away_1', 'away_2', 'away_3', 'away_4']:
    if col in train_data.columns:
        le = LabelEncoder()
        train_data[col] = le.fit_transform(train_data[col])

        # Store label encoder
        label_encoders[col] = le

        # Transform test data, handling unseen labels
        if col in test_data.columns:
            test_data[col] = test_data[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[col] = test_data[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[col] = test_data[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[col

In [10]:
# Split features and target
X = train_data.drop(columns=['home_4'])
y = train_data['home_4']

In [11]:
# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [13]:
# Evaluate model
y_pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print(f'Model Accuracy: {accuracy:.4f}')

Model Accuracy: 0.8100


In [14]:
# Predict the missing fifth player for test data
predictions = model.predict(test_data)
test_data['predicted_home_4'] = label_encoders['home_4'].inverse_transform(predictions)

In [15]:
# Generate output file
output = test_games.copy()
output['Fifth_Player'] = test_data['predicted_home_4']
output.to_csv('NBA_predictions.csv', index=False)

print("Predictions saved to NBA_predictions.csv")


Predictions saved to NBA_predictions.csv


## Evaluation

In [16]:


def compare_fifth_player_all(predictions_file, test_labels_file):
    # Load the CSV files
    df_predictions = pd.read_csv(predictions_file)
    df_test_labels = pd.read_csv(test_labels_file)

    # Extract relevant columns
    predicted_players = df_predictions['Fifth_Player']
    actual_players = df_test_labels['removed_value']

    # Check if both columns have the same length
    if len(predicted_players) != len(actual_players):
        print("Warning: The number of predictions does not match the number of test labels.")

    # Compare values
    comparison_results = predicted_players == actual_players

    # Calculate accuracy
    accuracy = comparison_results.mean() * 100

    print(f"Total comparisons: {len(comparison_results)}")
    print(f"Correct predictions: {comparison_results.sum()}")
    print(f"Accuracy: {accuracy:.2f}%")

    # Save comparison results
    result_df = pd.DataFrame({
        'Predicted_Fifth_Player': predicted_players,
        'Actual_Fifth_Player': actual_players,
        'Match': comparison_results
    })
    result_df.to_csv("comparison_results_all.csv", index=False)
    print("Comparison results saved to comparison_results_all.csv")

# Example usage
compare_fifth_player_all("NBA_predictions_using_all dataset.csv", "NBA_test_labels.csv")


Total comparisons: 1000
Correct predictions: 219
Accuracy: 21.90%
Comparison results saved to comparison_results_all.csv
