In [46]:
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Data

In [47]:
from sklearn.model_selection import train_test_split

data = pd.read_csv('/content/drive/MyDrive/CMPE252_project/data.csv')

# remove rows where the Winner is nan
data = data.dropna(subset=['Winner: 0 Home, 1Visiting'])

game_data = data.drop(columns=['Winner: 0 Home, 1Visiting'])
game_results = data['Winner: 0 Home, 1Visiting']

game_data_train, game_data_test, game_results_train, game_results_test = train_test_split(game_data, game_results, test_size=0.3)

# Normalize Data

In [48]:
from sklearn.preprocessing import StandardScaler

game_data_train = StandardScaler().fit_transform(game_data_train)
game_data_test = StandardScaler().fit_transform(game_data_test)

# Check the mean and standard deviation after normalization (for StandardScaler)
print("Mean of normalized training data:", game_data_train.mean(axis=0))
print("Standard deviation of normalized training data:", game_data_train.std(axis=0))

# For Min-Max, verify the range
print("Min of normalized data:", game_data_train.min(axis=0))
print("Max of normalized data:", game_data_train.max(axis=0))

Mean of normalized training data: [-5.18067491e-17  2.95035046e-16 -1.17662786e-16 -1.40492879e-17
 -3.51232198e-18  2.80985758e-17 -1.29077833e-16  1.45761362e-16
 -1.29955913e-16  1.07125820e-16 -2.49374860e-16 -4.39040247e-18
  6.67341175e-17 -1.40492879e-17  1.38736718e-16 -3.29280185e-17
 -8.78080494e-18 -1.75616099e-18 -1.12394303e-16  3.06450092e-16
  7.02464395e-18  1.75616099e-17 -7.11245200e-17 -7.02464395e-18
 -4.26747120e-16 -7.02464395e-17 -9.83450153e-17 -1.05369659e-16
 -1.79128421e-16 -3.51232198e-18  3.54744520e-16  3.51232198e-17
  1.21175108e-16  1.38736718e-16  8.51738079e-17  2.45862538e-17
  1.96690031e-16  1.11516223e-16 -3.51232198e-17 -1.51029845e-16
  3.86355417e-17  1.40492879e-17  3.60013003e-16 -4.21478637e-17
 -5.44409906e-17 -1.22931269e-16  1.75616099e-17 -2.01958514e-17
 -2.31813250e-16 -6.40998761e-17  1.05369659e-17 -3.86355417e-17
  5.26848296e-17  0.00000000e+00 -6.77878141e-16  1.00101176e-16
  2.10739319e-16 -3.68793807e-17 -3.51232198e-18 -1.2732

# Feature Selection using PCA

In [49]:
from sklearn.decomposition import PCA

print(f"Original number of features: {game_data_train.shape[1]}")

# replace nan values with 0 to avoid errors in PCA
game_data_train = np.nan_to_num(game_data_train, nan=0.0)
game_data_test = np.nan_to_num(game_data_test, nan=0.0)

pca = PCA(n_components=50)
game_data_train = pca.fit_transform(game_data_train)
game_data_test = pca.fit_transform(game_data_test)

print(f"Reduced number of features: {game_data_train.shape[1]}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Number of components: {pca.n_components_}")

Original number of features: 232
Reduced number of features: 50
Explained variance ratio: [0.06458998 0.04343177 0.03762371 0.02823994 0.02545927 0.02319306
 0.02187983 0.0217514  0.02106638 0.02036015 0.01994304 0.01887111
 0.01848927 0.01826106 0.017769   0.01749783 0.01678592 0.01646896
 0.01589891 0.01554481 0.01511633 0.0146899  0.01440872 0.01418741
 0.01386281 0.01289518 0.01283327 0.01245186 0.01202109 0.0115809
 0.01138302 0.01056759 0.01040609 0.01027409 0.00994575 0.00964167
 0.0090839  0.00833634 0.00793689 0.00760805 0.00734206 0.00684044
 0.00640459 0.00603178 0.00566206 0.00541261 0.00539823 0.00501271
 0.00493813 0.0048299 ]
Number of components: 50



# Model Selection and Training

In [50]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(game_data_train, game_results_train)

model.score(game_data_test, game_results_test)

0.597926267281106