In [90]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import numpy as np

# to ignore the warnings
from warnings import filterwarnings

In [91]:
total_df = pd.read_csv('./data/totalPredict_ML3.csv')
total_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9510 entries, 0 to 9509
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   home_team       9510 non-null   object 
 1   visitor_team    9510 non-null   object 
 2   visitor_points  9510 non-null   float64
 3   visitor_fgm     9510 non-null   float64
 4   home_points     9510 non-null   float64
 5   home_fgm        9510 non-null   float64
 6   visitor_ORtg    9510 non-null   float64
 7   home_ORtg       9510 non-null   float64
 8   visitor_DRtg    9510 non-null   float64
 9   home_DRtg       9510 non-null   float64
 10  total_points    9510 non-null   float64
dtypes: float64(9), object(2)
memory usage: 817.4+ KB


In [92]:
total_df.isnull().sum()

home_team         0
visitor_team      0
visitor_points    0
visitor_fgm       0
home_points       0
home_fgm          0
visitor_ORtg      0
home_ORtg         0
visitor_DRtg      0
home_DRtg         0
total_points      0
dtype: int64

In [93]:
total_df.shape

(9510, 11)

In [94]:
winner_df = pd.read_csv('./data/winPredict_ML3.csv')


In [95]:
winner_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9510 entries, 0 to 9509
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   home_team       9510 non-null   object 
 1   visitor_team    9510 non-null   object 
 2   visitor_points  9510 non-null   float64
 3   home_points     9510 non-null   float64
 4   visitor_ORtg    9510 non-null   float64
 5   home_ORtg       9510 non-null   float64
 6   visitor_DRtg    9510 non-null   float64
 7   home_DRtg       9510 non-null   float64
 8   visitor_efg%    9509 non-null   float64
 9   home_efg%       9509 non-null   float64
 10  visitor_ts%     9510 non-null   float64
 11  home_ts%        9510 non-null   float64
 12  home_win        9510 non-null   int64  
dtypes: float64(10), int64(1), object(2)
memory usage: 966.0+ KB


In [96]:
winner_df.shape

(9510, 13)

In [97]:
# winner_df['is_home'] = (winner_df['home_team'] == winner_df['home_team']).astype(int)
# winner_df.head()

# One-Hot Encoding

In [98]:
# Apply one-hot encoding to categorical columns
encoded_total_df = pd.get_dummies(total_df, columns=['visitor_team', 'home_team'])
encoded_winner_df = pd.get_dummies(winner_df, columns=['visitor_team', 'home_team'])

# Covert booleans to integers
new_total_df = encoded_total_df.replace({True: 1, False: 0})
new_winner_df = encoded_winner_df.replace({True: 1, False: 0})

  new_total_df = encoded_total_df.replace({True: 1, False: 0})
  new_winner_df = encoded_winner_df.replace({True: 1, False: 0})


In [99]:
new_winner_df = new_winner_df.dropna()
new_total_df = new_total_df.dropna()

In [100]:
print(f"Shape of encoded_total_df: {new_total_df.shape}")
print(f"Shape of encoded_winner_df: {new_winner_df.shape}")

Shape of encoded_total_df: (9510, 69)
Shape of encoded_winner_df: (9509, 71)


# Machine Learning

In [101]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Create feature values
X_total = new_total_df.drop(columns=['home_points', 'visitor_points', 'total_points'])
X_winner = new_winner_df.drop(columns=['home_points', 'visitor_points', 'home_win'])

# Create target values
y_total = new_total_df['total_points']
y_winner = new_winner_df['home_win']

# Split training data
X_total_train, X_total_test, y_total_train, y_total_test = train_test_split(
    X_total, 
    y_total, 
    test_size=0.2, 
    random_state=42
)

X_winner_train, X_winner_test, y_winner_train, y_winner_test = train_test_split(
    X_winner, 
    y_winner, 
    test_size=0.2, 
    random_state=42
)

# Initiate Scaler
scaler = StandardScaler()

# Train the classification model (winner predictions)
clf_model = DecisionTreeClassifier().fit(X_winner_train, y_winner_train)

# Scale training data
X_total_train.replace([np.inf, -np.inf], np.nan, inplace=True) # X_total_train contained infinite values
X_total_train.fillna(X_total_train.mean(), inplace=True) # Fill with column mean

X_total_train_scaled = scaler.fit_transform(X_total_train)

# Train regression model (total prediction)
linear_model = LinearRegression().fit(X_total_train_scaled, y_total_train)

# Scale the test data
X_total_test_scaled = scaler.transform(X_total_test)

In [102]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report # type: ignore

# Evaluate the classification model
y_winner_pred = clf_model.predict(X_winner_test)
print(confusion_matrix(y_winner_test, y_winner_pred))
print(classification_report(y_winner_test, y_winner_pred))
print("Accuracy:", accuracy_score(y_winner_test, y_winner_pred))


[[916  10]
 [  5 971]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       926
           1       0.99      0.99      0.99       976

    accuracy                           0.99      1902
   macro avg       0.99      0.99      0.99      1902
weighted avg       0.99      0.99      0.99      1902

Accuracy: 0.9921135646687698


In [103]:
y_total_pred = linear_model.predict(X_total_test_scaled)
from sklearn.metrics import mean_squared_error, r2_score
print("MSE:", mean_squared_error(y_total_test, y_total_pred))
print("R2 Score:", r2_score(y_total_test, y_total_pred))


MSE: 75.12772573836824
R2 Score: 0.8366578889853996


In [104]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(clf_model, X_winner, y_winner, cv=5)
# print("Cross-Validation Scores:", scores)
# print("Mean CV Accuracy:", scores.mean())


In [105]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }
# grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
# grid_search.fit(X_winner_train, y_winner_train)
# print("Best Parameters:", grid_search.best_params_)


In [108]:
# Inputs for the home and visitor teams
visitor_team = input("Enter Away Team: ")
home_team = input("Enter Home Team: ")

# Create input data with correct features for the models
input_data_winner = {col: 0 for col in X_winner.columns}
input_data_total = {col: 0 for col in X_total.columns}

# Populate the features based on the provided teams
if f'home_team_{home_team}' in input_data_winner:
    input_data_winner[f'home_team_{home_team}'] = 1
if f'visitor_team_{visitor_team}' in input_data_winner:
    input_data_winner[f'visitor_team_{visitor_team}'] = 1
    
    
if f'home_team_{home_team}' in input_data_total:
    input_data_winner[f'home_team_{home_team}'] = 1    
if f'visitor_team_{visitor_team}' in input_data_total:
    input_data_winner[f'visitor_team_{visitor_team}'] = 1
    
# Convert dictionaries to DataFrames
input_df_winner = pd.DataFrame([input_data_winner])
input_df_total = pd.DataFrame([input_data_total])

# Scale the input data for the total points
input_df_total_scaled = scaler.transform(input_df_total)

# Predict the winner using the classification model
predicted_winner = clf_model.predict(input_df_winner)[0]
predict_total_points = linear_model.predict(input_df_total_scaled)[0]

# Map the winner prediction to the team name
winner = home_team if predicted_winner == 1 else visitor_team

print("\n--- Game Prediction ---")
print(f"Predicted Winner: {winner}")
# print(f"Predicted Total Points: {round(predict_total_points, 0)}")



--- Game Prediction ---
Predicted Winner: Phoenix Suns


# Using Model 1 ML

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

X = model_df.drop(columns=['winner','home_points', 'visitor_points', 'total_points'])
X_df = pd.DataFrame(X, columns=model_df.drop(columns=['winner', 'home_points', 'visitor_points', 'total_points']).columns)

# Create both target variables
y_winner = model_df['winner']
y_total_points = model_df['total_points']

# Splitting data
X_train, X_test, y_train_winner, y_test_winner = train_test_split(X_df, y_winner, test_size=0.2, random_state=42)
_, _, y_train_points, y_test_points = train_test_split(X_df, y_total_points, test_size=0.2, random_state=42)

# Scaling the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train the classification model (winner prediction)
clf_model = DecisionTreeClassifier().fit(X_train, y_train_winner)

# Train regression model (total_points)
linear_model = LinearRegression().fit(X_train_scaled, y_train_points)

# Scale the test data
X_test_scaled = scaler.transform(X_test)