# Imbalanced Dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import warnings
# Silence every Python warning raised through the `warnings` module for the rest of the session.
# NOTE(review): this is a blanket filter; consider narrowing it so convergence and
# deprecation warnings from the model cells remain visible.
warnings.filterwarnings("ignore")

In [None]:
from google.colab import drive
import pandas as pd


def _preview(frame):
    """Print the first rows of ``frame`` followed by its per-column null counts."""
    print(frame.head())
    print("Missing values in the dataset:")
    print(frame.isnull().sum())


# Make the Drive contents reachable under /content/drive
drive.mount('/content/drive')

# Read the source table from Drive
df_check = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_mice.csv')

# Report its dimensions, a sample of rows and the null counts
print(f"Shape of the dataset before removal: {df_check.shape}")
_preview(df_check)

# Columns excluded from the modelling table; errors='ignore' skips any name
# that is not present instead of raising.
columns_to_remove = ["International_Indication", "Regular_Season_Indication", "Playoffs_Indication", "Player"]
df_cleaned = df_check.drop(columns=columns_to_remove, errors='ignore')

# Persist the reduced table in both CSV and pickle form
csv_path = '/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_mice_with_out_text.csv'
pkl_path = '/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_mice_with_out_text.pkl'

df_cleaned.to_csv(csv_path, index=False)
df_cleaned.to_pickle(pkl_path)

print(f"Dataset saved as CSV: {csv_path}")
print(f"Dataset saved as Pickle: {pkl_path}")

# Same preview for the reduced table
_preview(df_cleaned)


Mounted at /content/drive
Shape of the dataset before removal: (14582, 25)
      GP     MIN    FGM    FGA         3PM         3PA    FTM    FTA    TOV  \
0   20.0   249.0   56.0   96.0   -0.032691   11.431083   19.0   27.0   21.0   
1   50.0  1360.0  246.0  558.0   46.000000  141.000000   96.0  128.0   94.0   
2  187.0  3769.0  354.0  801.0    1.000000   10.000000  161.0  229.0  107.0   
3   68.0  1952.5  394.0  870.0  175.000000  434.000000  209.0  249.0  150.0   
4   98.0  2842.7  468.0  884.0   -0.321020    6.000000  176.0  366.0  182.0   

      PF  ...   BLK     PTS  final_birth_year  final_height_cm  \
0   34.0  ...   1.0   131.0       1998.000000       196.855161   
1   98.0  ...  19.0   634.0       1990.000000       201.000000   
2  290.0  ...  29.0   870.0       1989.340090       206.000000   
3  163.0  ...  19.0  1172.0       1986.000000       198.000000   
4  345.0  ...  94.0  1112.0       1988.950497       208.000000   

   final_weight_kg  NBA_Indication  Regular_Season_In

In [None]:
# Tally how many rows carry each label (1 and 0) in the 'NBA_Indication' column
nba_flags = df_check['NBA_Indication']
count_1 = int((nba_flags == 1).sum())
count_0 = int((nba_flags == 0).sum())

# Report both class sizes
print(f"Number of 1's in 'NBA_Indication': {count_1}")
print(f"Number of 0's in 'NBA_Indication': {count_0}")

Number of 1's in 'NBA_Indication': 1119
Number of 0's in 'NBA_Indication': 13463


Oversampling – SMOTE

# Imbalanced techniques

In [None]:
# Reload the reduced table from the CSV path that the cleaning cell wrote to.
df_balanced_test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_mice_with_out_text.csv')


In [None]:
# Resamplers to compare, keyed by the short label used in the results table.
# Each one is constructed with the same seed (47).
techniques = {
    label: sampler_cls(random_state=47)
    for label, sampler_cls in [
        ("ROS", RandomOverSampler),
        ("RUS", RandomUnderSampler),
        ("SMOTE", SMOTE),
        ("SMOTETomek", SMOTETomek),
    ]
}

In [None]:
# Separate the target column from the predictors
_target = "NBA_Indication"
y = df_balanced_test[_target]
X = df_balanced_test.drop(columns=[_target])

In [None]:
# Hold out 20% of the rows for testing; stratify=y keeps the class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
def _evaluate_resampler(name, technique):
    """Fit logistic regression on a resampled training split and score it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    name : str
        Label stored in the "Technique" column of the results.
    technique : object
        Resampler exposing ``fit_resample(X, y)``.

    Returns
    -------
    dict
        One results row: technique label plus accuracy, precision, recall and F1.
    """
    # Only the training split is resampled; the test split stays untouched.
    X_resampled, y_resampled = technique.fit_resample(X_train, y_train)

    # NOTE(review): the iteration limit is left at the estimator default here,
    # while later cells pass max_iter=1000 - confirm this is intentional.
    classifier = LogisticRegression(random_state=42).fit(X_resampled, y_resampled)

    # Predict once and reuse the result for all four metrics.
    y_pred = classifier.predict(X_test)

    return {
        "Technique": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
    }


# Evaluate every resampling technique and collect one row per technique
results = [_evaluate_resampler(name, technique) for name, technique in techniques.items()]

# Display results
results_df = pd.DataFrame(results)
print(results_df)

    Technique  Accuracy  Precision    Recall  F1-Score
0         ROS  0.908125   0.447619  0.839286  0.583851
1         RUS  0.910867   0.455446  0.821429  0.585987
2       SMOTE  0.918752   0.482850  0.816964  0.606965
3  SMOTETomek  0.917038   0.477041  0.834821  0.607143


In [None]:
# Build the comparison table from the collected result rows and show it as the
# cell's output (last expression), which gives the rich table rendering.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Technique,Accuracy,Precision,Recall,F1-Score
0,ROS,0.908125,0.447619,0.839286,0.583851
1,RUS,0.910867,0.455446,0.821429,0.585987
2,SMOTE,0.918752,0.48285,0.816964,0.606965
3,SMOTETomek,0.917038,0.477041,0.834821,0.607143


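We selected SMOTE: in the comparison above it achieved the highest accuracy and precision, and its F1-score is essentially tied with SMOTETomek's.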

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Read the reduced table from its pickle
df = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_mice_with_out_text.pkl')

# Target is NBA_Indication; every other column is a feature
y = df["NBA_Indication"]
X = df.drop(columns=["NBA_Indication"])

# Stratified 80/20 split, done before resampling so that no synthetic
# samples can end up in the test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Resample the training split only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit logistic regression on the resampled training data
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

# Score the untouched test split
y_pred = model.predict(X_test)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Recombine resampled features and target into a single frame
df_resampled = pd.DataFrame(X_train_resampled, columns=X.columns).assign(
    NBA_Indication=y_train_resampled
)

# Persist the resampled training data as CSV and pickle
df_resampled.to_csv('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_smote.csv', index=False)
df_resampled.to_pickle('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_smote.pkl')

# Show how many rows each class has after resampling
print("\nNew class distribution:")
print(df_resampled["NBA_Indication"].value_counts())


Accuracy: 0.9160
Precision: 0.4736
Recall: 0.8393
F1-Score: 0.6055

New class distribution:
NBA_Indication
0    10770
1    10770
Name: count, dtype: int64


# Train Test Split

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Read the reduced table from its pickle
df = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_mice_with_out_text.pkl')

# Target is NBA_Indication; every other column is a feature
y = df["NBA_Indication"]
X = df.drop(columns=["NBA_Indication"])

# Stage 1: 70% train, 30% held back (stratified)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Stage 2: halve the held-back part into validation and test (15% each overall)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Resample the training split only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit logistic regression on the resampled training data
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

# Predict both held-out splits
y_valid_pred = model.predict(X_valid)
y_test_pred = model.predict(X_test)

# Same four metrics, in the same order, for each split
_metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
valid_accuracy, valid_precision, valid_recall, valid_f1 = (
    fn(y_valid, y_valid_pred) for fn in _metric_fns
)
test_accuracy, test_precision, test_recall, test_f1 = (
    fn(y_test, y_test_pred) for fn in _metric_fns
)

# Validation report
print("Validation Set Metrics:")
print(f"Accuracy: {valid_accuracy:.4f}")
print(f"Precision: {valid_precision:.4f}")
print(f"Recall: {valid_recall:.4f}")
print(f"F1-Score: {valid_f1:.4f}")

# Test report
print("\nTest Set Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")

# Recombine resampled features and target into a single frame
df_resampled = pd.DataFrame(X_train_resampled, columns=X.columns).assign(
    NBA_Indication=y_train_resampled
)

# Persist the resampled training split as CSV and pickle
df_resampled.to_csv('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_smote_train.csv', index=False)
df_resampled.to_pickle('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_smote_train.pkl')

# Show how many rows each class has after resampling
print("\nNew class distribution in Train Set:")
print(df_resampled["NBA_Indication"].value_counts())


Validation Set Metrics:
Accuracy: 0.9214
Precision: 0.4929
Recall: 0.8274
F1-Score: 0.6178

Test Set Metrics:
Accuracy: 0.9278
Precision: 0.5181
Recall: 0.8512
F1-Score: 0.6441

New class distribution in Train Set:
NBA_Indication
0    9424
1    9424
Name: count, dtype: int64


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd


def _split_metrics(prefix, y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one split.

    Parameters
    ----------
    prefix : str
        Split label used to build the keys, e.g. "Validation" -> "Validation Accuracy".
    y_true, y_pred : array-like
        True and predicted labels for that split.

    Returns
    -------
    dict
        Four entries keyed "<prefix> Accuracy", "<prefix> Precision",
        "<prefix> Recall" and "<prefix> F1", in that order.
    """
    return {
        f"{prefix} Accuracy": accuracy_score(y_true, y_pred),
        f"{prefix} Precision": precision_score(y_true, y_pred),
        f"{prefix} Recall": recall_score(y_true, y_pred),
        f"{prefix} F1": f1_score(y_true, y_pred),
    }


# Load the dataset (using the pickle file saved in Drive)
df = pd.read_pickle('/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_mice_with_out_text.pkl')

# Define features (X) and target (y)
X = df.drop(columns=["NBA_Indication"])
y = df["NBA_Indication"]

# Split data: 70% train, 15% validation, 15% test (using stratification)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Apply SMOTE on the training set only; validation and test stay untouched
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Define models.
# XGBClassifier is built without `use_label_encoder`: the installed XGBoost
# reports that parameter as unused, so passing it only produces a warning.
# NOTE(review): features are fed unscaled; the logistic regression hits its
# iteration limit on this data - consider adding a scaling step.
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42)
}

# Fit each model on the resampled training data, score it on the validation
# and test splits, and write one result file pair per model.
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_resampled, y_train_resampled)

    # Make predictions on validation and test sets
    y_valid_pred = model.predict(X_valid)
    y_test_pred = model.predict(X_test)

    # One row of metrics: validation columns first, then test columns
    model_results = {
        **_split_metrics("Validation", y_valid, y_valid_pred),
        **_split_metrics("Test", y_test, y_test_pred),
    }

    # Create a DataFrame for this model's results
    model_results_df = pd.DataFrame([model_results])

    # Save the results to CSV and pickle files using the model name in the filename.
    # The aggregation cell finds these files through the "Training <name>" part.
    csv_filename = f"/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training {name}.csv"
    pkl_filename = f"/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training {name}.pkl"
    model_results_df.to_csv(csv_filename, index=False)
    model_results_df.to_pickle(pkl_filename)

    print(f"Results for {name} saved to:\n  {csv_filename}\n  {pkl_filename}\n")


Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results for Logistic Regression saved to:
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training Logistic Regression.csv
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training Logistic Regression.pkl

Training Random Forest...
Results for Random Forest saved to:
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training Random Forest.csv
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training Random Forest.pkl

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Results for XGBoost saved to:
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training XGBoost.csv
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training XGBoost.pkl

Training SVM...
Results for SVM saved to:
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training SVM.csv
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training SVM.pkl

Training Gradient Boosting...
Results for Gradient Boosting saved to:
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training Gradient Boosting.csv
  /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training Gradient Boosting.pkl



In [None]:
import pandas as pd
import glob

# Glob pattern for the per-model result CSVs written by the training loop
# ("model_comparison_results_Training <model name>.csv"). This has to be a
# pattern string; glob.glob cannot take a DataFrame.
path_pattern = '/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training *.csv'

# Find all files matching the pattern; sorting gives a stable row order,
# since glob itself returns files in arbitrary order.
csv_files = sorted(glob.glob(path_pattern))

# Fail with a clear message instead of an opaque pd.concat error on an empty list
if not csv_files:
    raise FileNotFoundError(f"No model result files match pattern: {path_pattern}")

# Read each file and tag its row with the model name taken from the file name
# (the text between "Training " and ".csv").
df_list = [
    pd.read_csv(file).assign(Model=file.split("Training ")[-1].split(".csv")[0])
    for file in csv_files
]

# Concatenate all DataFrames in the list into a single DataFrame
comparison_df = pd.concat(df_list, ignore_index=True)

# Reorder columns to have 'Model' as the first column
cols = ['Model'] + [col for col in comparison_df if col != 'Model']
comparison_df = comparison_df[cols]

# Display the comparison DataFrame
print(comparison_df)

# Save the comparison DataFrame to a new CSV file
comparison_csv = "/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_summary.csv"
comparison_df.to_csv(comparison_csv, index=False)
print(f"Comparison summary saved to: {comparison_csv}")


                 Model  Validation Accuracy  Validation Precision  \
0  Logistic Regression             0.921353              0.492908   
1        Random Forest             0.947874              0.662651   
2              XGBoost             0.955190              0.708333   
3                  SVM             0.891175              0.404372   
4    Gradient Boosting             0.904435              0.432343   

   Validation Recall  Validation F1  Test Accuracy  Test Precision  \
0           0.827381       0.617778       0.927788        0.518116   
1           0.654762       0.658683       0.944241        0.625000   
2           0.708333       0.708333       0.950183        0.672515   
3           0.880952       0.554307       0.895338        0.417344   
4           0.779762       0.556263       0.903108        0.429487   

   Test Recall   Test F1  
0     0.851190  0.644144  
1     0.684524  0.653409  
2     0.684524  0.678466  
3     0.916667  0.573557  
4     0.797619  0.558333  
Co

Fine-Tuning

XGBoost Fine-Tuning with GridSearchCV

In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
data_path = "/content/drive/My Drive/Colab Notebooks/ML Project/NBA_after_mice_with_out_text.pkl"
df = pd.read_pickle(data_path)

# Define features and target variable
X = df.drop(columns=['NBA_Indication'])  # Features (all except the target column)
y = df['NBA_Indication']  # Target variable

# Split data into training and testing sets (stratified 80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize XGBoost classifier.
# `use_label_encoder` is omitted because the installed XGBoost reports it as
# unused; random_state matches the untuned XGBoost in the model-comparison cell.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)

# Define the hyperparameter grid (3^5 = 243 candidates)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
}

# Score the positive class (binary average), as the untuned model results do.
# Weighted averages are dominated by the much larger negative class, which
# inflates precision/recall/F1 and makes the before/after comparison misleading.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

# Select the best candidate by F1 rather than accuracy: with so few positives,
# accuracy rewards models that mostly predict the majority class.
# NOTE(review): unlike the model-comparison cell, no SMOTE resampling is applied
# here, and the "Validation" figures below are 5-fold CV means rather than
# hold-out scores - confirm this is the comparison you want.
grid_search = GridSearchCV(xgb, param_grid, scoring=scoring, refit='f1', cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Retrieve the best model (refitted on the whole training split)
best_xgb = grid_search.best_estimator_

# Mean cross-validated scores of the selected candidate
best_validation_acc = grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]
best_validation_precision = grid_search.cv_results_['mean_test_precision'][grid_search.best_index_]
best_validation_recall = grid_search.cv_results_['mean_test_recall'][grid_search.best_index_]
best_validation_f1 = grid_search.cv_results_['mean_test_f1'][grid_search.best_index_]

# Evaluate on the test set
y_pred = best_xgb.predict(X_test)

# Test metrics use the default binary average, matching the scorers above
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

# Store results in a DataFrame (same columns as the per-model result files)
results = {
    "Model": ["XGBoost (Tuned)"],
    "Validation Accuracy": [best_validation_acc],
    "Validation Precision": [best_validation_precision],
    "Validation Recall": [best_validation_recall],
    "Validation F1": [best_validation_f1],
    "Test Accuracy": [test_accuracy],
    "Test Precision": [test_precision],
    "Test Recall": [test_recall],
    "Test F1": [test_f1]
}

results_df = pd.DataFrame(results)

# Save results to CSV
results_csv_path = "/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training_XGBoost_after_Fine_Tuning.csv"
results_df.to_csv(results_csv_path, index=False)

# Save the best model to a pickle file
model_pkl_path = "/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training_XGBoost_after_Fine_Tuning.pkl"
with open(model_pkl_path, "wb") as file:
    pickle.dump(best_xgb, file)

print(f"Tuned model saved as {model_pkl_path}")
print(f"Results saved as {results_csv_path}")


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Tuned model saved as /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training_XGBoost_after_Fine_Tuning.pkl
Results saved as /content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training_XGBoost_after_Fine_Tuning.csv


In [None]:
import pandas as pd

# Read back the metrics CSV written by the grid-search cell (same path as its results_csv_path).
results_path = "/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training_XGBoost_after_Fine_Tuning.csv"
df_results = pd.read_csv(results_path)

# Print the single-row results table
print(df_results)


             Model  Validation Accuracy  Validation Precision  \
0  XGBoost (Tuned)             0.955765              0.952124   

   Validation Recall  Validation F1  Test Accuracy  Test Precision  \
0           0.955765       0.951418       0.961262        0.958602   

   Test Recall   Test F1  
0     0.961262  0.958747  


Comparison of XGBoost Performance: Before and After Fine-Tuning

In [None]:
import pandas as pd

# Summary of all untuned models (pre-Fine-Tuning)
old_results_path = "/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_summary.csv"
df_old = pd.read_csv(old_results_path)

# Metrics of the tuned XGBoost model (post-Fine-Tuning)
new_results_path = "/content/drive/My Drive/Colab Notebooks/ML Project/model_comparison_results_Training_XGBoost_after_Fine_Tuning.csv"
df_new = pd.read_csv(new_results_path)

# Keep only the untuned XGBoost row from the summary
_is_xgb = df_old['Model'].eq('XGBoost')
df_xgb_old = df_old.loc[_is_xgb]

# Stack both frames, labelling each with an outer index level
_frames = [df_xgb_old, df_new]
_labels = ["Before Fine-Tuning", "After Fine-Tuning"]
df_comparison = pd.concat(_frames, keys=_labels)

# Show the side-by-side comparison
print(df_comparison)


                                Model  Validation Accuracy  \
Before Fine-Tuning 2          XGBoost             0.955190   
After Fine-Tuning  0  XGBoost (Tuned)             0.955765   

                      Validation Precision  Validation Recall  Validation F1  \
Before Fine-Tuning 2              0.708333           0.708333       0.708333   
After Fine-Tuning  0              0.952124           0.955765       0.951418   

                      Test Accuracy  Test Precision  Test Recall   Test F1  
Before Fine-Tuning 2       0.950183        0.672515     0.684524  0.678466  
After Fine-Tuning  0       0.961262        0.958602     0.961262  0.958747  
