In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer

In [2]:
# Shared data
shared_data_files = {
    'cities': "Cities.csv",
    'conferences': "Conferences.csv"
}

# Men's data
mens_data_files = {
    'm_conference_tourney_games': "MConferenceTourneyGames.csv",
    'm_game_cities': "MGameCities.csv",
    'm_massey_ordinals': "MMasseyOrdinals_thru_Season2023_Day128.csv",
    'm_ncaa_tourney_compact_results': "MNCAATourneyCompactResults.csv",
    'm_ncaa_tourney_detailed_results': "MNCAATourneyDetailedResults.csv",
    'm_ncaa_tourney_seed_round_slots': "MNCAATourneySeedRoundSlots.csv",
    'm_ncaa_tourney_seeds': "MNCAATourneySeeds.csv",
    'm_ncaa_tourney_slots': "MNCAATourneySlots.csv",
    'm_regular_season_compact_results': "MRegularSeasonCompactResults.csv",
    'm_regular_season_detailed_results': "MRegularSeasonDetailedResults.csv",
    'm_seasons': "MSeasons.csv",
    'm_secondary_tourney_compact_results': "MSecondaryTourneyCompactResults.csv",
    'm_secondary_tourney_teams': "MSecondaryTourneyTeams.csv",
    'm_team_coaches': "MTeamCoaches.csv",
    'm_team_conferences': "MTeamConferences.csv",
    'm_team_spellings': "MTeamSpellings.csv",
    'm_teams': "MTeams.csv"
}

# Women's data
womens_data_files = {
    'w_game_cities': "WGameCities.csv",
    'w_ncaa_tourney_compact_results': "WNCAATourneyCompactResults.csv",
    'w_ncaa_tourney_detailed_results': "WNCAATourneyDetailedResults.csv",
    'w_ncaa_tourney_seeds': "WNCAATourneySeeds.csv",
    'w_ncaa_tourney_slots': "WNCAATourneySlots.csv",
    'w_regular_season_compact_results': "WRegularSeasonCompactResults.csv",
    'w_regular_season_detailed_results': "WRegularSeasonDetailedResults.csv",
    'w_seasons': "WSeasons.csv",
    'w_team_conferences': "WTeamConferences.csv",
    'w_team_spellings': "WTeamSpellings.csv",
    'w_teams': "WTeams.csv"
}

# Loading shared data
shared_data = {}
for key, file_name in shared_data_files.items():
    shared_data[key] = pd.read_csv(f"/kaggle/input/march-machine-learning-mania-2023/{file_name}")

# Loading men's data
mens_data = {}
for key, file_name in mens_data_files.items():
    mens_data[key] = pd.read_csv(f"/kaggle/input/march-machine-learning-mania-2023/{file_name}", encoding = 'latin1')

# Loading women's data
womens_data = {}
for key, file_name in womens_data_files.items():
    womens_data[key] = pd.read_csv(f"/kaggle/input/march-machine-learning-mania-2023/{file_name}", encoding = 'latin1')

In [3]:
df = mens_data['m_regular_season_detailed_results'].copy()
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [4]:
# Merge team names for team 1 (Winning team)
df = df.merge(mens_data['m_teams'], left_on='WTeamID', right_on='TeamID', how='left')
df.rename(columns={'TeamName': 'WTeamName'}, inplace=True)
df.drop(columns=['TeamID'], inplace=True)

In [5]:
# Merge team names for team 2 (Losing team)
df = df.merge(mens_data['m_teams'], left_on='LTeamID', right_on='TeamID', how='left')
df.rename(columns={'TeamName': 'LTeamName'}, inplace=True)
df.drop(columns=['TeamID'], inplace=True)

In [6]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LTO,LStl,LBlk,LPF,WTeamName,FirstD1Season_x,LastD1Season_x,LTeamName,FirstD1Season_y,LastD1Season_y
0,2003,10,1104,68,1328,62,N,0,27,58,...,18,9,2,20,Alabama,1985,2023,Oklahoma,1985,2023
1,2003,10,1272,70,1393,63,N,0,26,62,...,12,8,6,16,Memphis,1985,2023,Syracuse,1985,2023
2,2003,11,1266,73,1437,61,N,0,24,58,...,12,2,5,23,Marquette,1985,2023,Villanova,1985,2023
3,2003,11,1296,56,1457,50,N,0,18,38,...,19,4,3,23,N Illinois,1985,2023,Winthrop,1987,2023
4,2003,11,1400,77,1208,71,N,0,30,61,...,10,7,1,14,Texas,1985,2023,Georgia,1985,2023


In [7]:
# Merge seed information for team 1 (Winning team)
df = df.merge(mens_data['m_ncaa_tourney_seeds'], left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'], how='left')
df.rename(columns={'Seed': 'WSeed'}, inplace=True)
df.drop(columns=['TeamID'], inplace=True)

In [8]:
# Merge seed information for team 2 (Losing team)
df = df.merge(mens_data['m_ncaa_tourney_seeds'], left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], how='left')
df.rename(columns={'Seed': 'LSeed'}, inplace=True)
df.drop(columns=['TeamID'], inplace=True)

In [9]:
# Function to extract region from seed
def extract_region(seed):
    if pd.isna(seed):
        return None
    return seed[0]

# Create new columns for regions
df['WRegion'] = df['WSeed'].apply(extract_region)
df['LRegion'] = df['LSeed'].apply(extract_region)


In [10]:
# Function to clean seed information and convert to integer
def clean_seed(seed):
    if pd.isna(seed):
        return None
    return int(seed[1:3])

# Clean seed information
df['WSeed'] = df['WSeed'].apply(clean_seed)
df['LSeed'] = df['LSeed'].apply(clean_seed)

In [11]:
# Calculate seed difference as a new feature
df['SeedDiff'] = df['WSeed'] - df['LSeed']

In [12]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,FirstD1Season_x,LastD1Season_x,LTeamName,FirstD1Season_y,LastD1Season_y,WSeed,LSeed,WRegion,LRegion,SeedDiff
0,2003,10,1104,68,1328,62,N,0,27,58,...,1985,2023,Oklahoma,1985,2023,10.0,1.0,Y,W,9.0
1,2003,10,1272,70,1393,63,N,0,26,62,...,1985,2023,Syracuse,1985,2023,7.0,3.0,Z,W,4.0
2,2003,11,1266,73,1437,61,N,0,24,58,...,1985,2023,Villanova,1985,2023,3.0,,Y,,
3,2003,11,1296,56,1457,50,N,0,18,38,...,1985,2023,Winthrop,1987,2023,,,,,
4,2003,11,1400,77,1208,71,N,0,30,61,...,1985,2023,Georgia,1985,2023,1.0,,X,,


In [13]:
# Replace 'None' with 'Unknown' in WRegion and LRegion
df['WRegion'].replace('None', 'Unknown', inplace=True)
df['LRegion'].replace('None', 'Unknown', inplace=True)

# Replace NaN with -1 in WSeed, LSeed, and SeedDiff
df['WSeed'].fillna(-1, inplace=True)
df['LSeed'].fillna(-1, inplace=True)
df['SeedDiff'].fillna(-1, inplace=True)

In [14]:
df.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,FirstD1Season_x,LastD1Season_x,LTeamName,FirstD1Season_y,LastD1Season_y,WSeed,LSeed,WRegion,LRegion,SeedDiff
0,2003,10,1104,68,1328,62,N,0,27,58,...,1985,2023,Oklahoma,1985,2023,10.0,1.0,Y,W,9.0
1,2003,10,1272,70,1393,63,N,0,26,62,...,1985,2023,Syracuse,1985,2023,7.0,3.0,Z,W,4.0
2,2003,11,1266,73,1437,61,N,0,24,58,...,1985,2023,Villanova,1985,2023,3.0,-1.0,Y,,-1.0
3,2003,11,1296,56,1457,50,N,0,18,38,...,1985,2023,Winthrop,1987,2023,-1.0,-1.0,,,-1.0
4,2003,11,1400,77,1208,71,N,0,30,61,...,1985,2023,Georgia,1985,2023,1.0,-1.0,X,,-1.0


In [15]:
# Replace 'None' (or np.nan) with 'Unknown' in WRegion and LRegion
df['WRegion'].replace({None: 'Unknown', np.nan: 'Unknown'}, inplace=True)
df['LRegion'].replace({None: 'Unknown', np.nan: 'Unknown'}, inplace=True)

In [16]:
numeric_columns = df.select_dtypes(include='number').columns
numeric_columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'NumOT',
       'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst',
       'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM',
       'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'FirstD1Season_x', 'LastD1Season_x', 'FirstD1Season_y',
       'LastD1Season_y', 'WSeed', 'LSeed', 'SeedDiff'],
      dtype='object')

In [None]:
#Checking distribution generally and for outliers too
n_cols = 2
n_rows = len(numeric_columns) // n_cols + (len(numeric_columns) % n_cols > 0)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, n_rows * 4))
for idx, col in enumerate(numeric_columns):
    row_idx = idx // n_cols
    col_idx = idx % n_cols
    sns.histplot(data=df, x=col, kde=True, ax=axes[row_idx, col_idx])
    axes[row_idx, col_idx].set_title(col)

# Adjust the layout and show the plots
plt.tight_layout()
plt.show()

In [18]:
# Calculate total points scored and allowed for each team in the regular season
team_points_scored = df.groupby(["Season", "WTeamID"]).agg({"WScore": "sum"}).reset_index().rename(columns={"WTeamID": "TeamID", "WScore": "PointsScored"})
team_points_allowed = df.groupby(["Season", "WTeamID"]).agg({"LScore": "sum"}).reset_index().rename(columns={"WTeamID": "TeamID", "LScore": "PointsAllowed"})

team_points_scored_losing = df.groupby(["Season", "LTeamID"]).agg({"LScore": "sum"}).reset_index().rename(columns={"LTeamID": "TeamID", "LScore": "PointsScored"})
team_points_allowed_losing = df.groupby(["Season", "LTeamID"]).agg({"WScore": "sum"}).reset_index().rename(columns={"LTeamID": "TeamID", "WScore": "PointsAllowed"})

# Merge scored and allowed points
team_points = team_points_scored.merge(team_points_allowed, on=["Season", "TeamID"])
team_points_losing = team_points_scored_losing.merge(team_points_allowed_losing, on=["Season", "TeamID"])

# Combine winning and losing team points data
team_points_combined = team_points.append(team_points_losing).groupby(["Season", "TeamID"]).agg({"PointsScored": "sum", "PointsAllowed": "sum"}).reset_index()


In [19]:
total_games_won = df.groupby(["Season", "WTeamID"]).size().reset_index(name="GamesWon")
total_games_lost = df.groupby(["Season", "LTeamID"]).size().reset_index(name="GamesLost")

total_games_won.rename(columns={"WTeamID": "TeamID"}, inplace=True)
total_games_lost.rename(columns={"LTeamID": "TeamID"}, inplace=True)

total_games = total_games_won.merge(total_games_lost, on=["Season", "TeamID"])
total_games["TotalGames"] = total_games["GamesWon"] + total_games["GamesLost"]


In [20]:
team_points_extended = team_points_combined.merge(total_games, on=["Season", "TeamID"])
team_points_extended["AvgPointsScored"] = team_points_extended["PointsScored"] / team_points_extended["TotalGames"]
team_points_extended["AvgPointsAllowed"] = team_points_extended["PointsAllowed"] / team_points_extended["TotalGames"]

In [21]:
df = df.merge(team_points_extended[["Season", "TeamID", "AvgPointsScored", "AvgPointsAllowed"]], left_on=["Season", "WTeamID"], right_on=["Season", "TeamID"], how="left")
df.rename(columns={"AvgPointsScored": "WAvgPointsScored", "AvgPointsAllowed": "WAvgPointsAllowed"}, inplace=True)
df.drop(columns=["TeamID"], inplace=True)

df = df.merge(team_points_extended[["Season", "TeamID", "AvgPointsScored", "AvgPointsAllowed"]], left_on=["Season", "LTeamID"], right_on=["Season", "TeamID"], how="left")
df.rename(columns={"AvgPointsScored": "LAvgPointsScored", "AvgPointsAllowed": "LAvgPointsAllowed"}, inplace=True)
df.drop(columns=["TeamID"], inplace=True)


In [22]:
#Grouping overtime periods as a categorical column
def group_overtime_periods(num_ot):
    if num_ot == 0:
        return "No Overtime"
    elif num_ot == 1:
        return "1 Overtime"
    elif num_ot == 2:
        return "2 Overtimes"
    else:
        return "3 or More Overtimes"

df["OTCategory"] = df["NumOT"].apply(group_overtime_periods)


In [23]:
# Create a binary target variable (1 if the winning team has a higher TeamID, 0 otherwise)
df['WinPercentage'] = (df['WTeamID'] > df['LTeamID']).astype(int)

# Drop unnecessary columns
df = df.drop(['WTeamID', 'LTeamID', 'WTeamName', 'LTeamName', 'FirstD1Season_x', 'LastD1Season_x', 'FirstD1Season_y', 'LastD1Season_y', 'WRegion', 'LRegion', 'Season', 'DayNum'], axis=1)

In [24]:
# Define additional features
df['WFGP'] = df['WFGM'] / df['WFGA']
df['LFGP'] = df['LFGM'] / df['LFGA']
df['WFGP3'] = df['WFGM3'] / df['WFGA3']
df['LFGP3'] = df['LFGM3'] / df['LFGA3']
df['WFTP'] = df['WFTM'] / df['WFTA']
df['LFTP'] = df['LFTM'] / df['LFTA']

In [44]:
df.head()

Unnamed: 0,WScore,LScore,NumOT,WOR,WDR,WAst,WTO,WStl,WBlk,WPF,...,LFGP3,WFTP,LFTP,WLoc_A,WLoc_H,WLoc_N,OTCategory_1 Overtime,OTCategory_2 Overtimes,OTCategory_3 or More Overtimes,OTCategory_No Overtime
0,68,62,0,14,24,13,23,7,1,22,...,0.2,0.611111,0.727273,0,0,1,0,0,0,1
1,70,63,0,15,28,16,13,4,4,18,...,0.25,0.526316,0.45,0,0,1,0,0,0,1
2,73,61,0,17,26,15,10,5,2,25,...,0.115385,0.586207,0.608696,0,0,1,0,0,0,1
3,56,50,0,6,19,11,12,14,2,18,...,0.272727,0.548387,0.533333,0,0,1,0,0,0,1
4,77,71,0,17,22,12,14,4,4,20,...,0.375,0.846154,0.62963,0,0,1,0,0,0,1


In [27]:
# Drop redundant columns
redundant_columns = ['WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA']
df = df.drop(columns=redundant_columns)


In [28]:
# One-hot encode the 'WLoc' and 'OTCategory' columns
df = pd.get_dummies(df, columns=['WLoc', 'OTCategory'])

In [46]:
# Handle missing values using median imputation
imputer = SimpleImputer(strategy='median')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [47]:
# Split the data into training and testing sets
X = df.drop('WinPercentage', axis=1)
y = df['WinPercentage']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
# Create a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

In [49]:
# Train the classifier
clf.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [50]:
# Make predictions and calculate accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5548


In [54]:
from sklearn.feature_selection import RFE

selector = RFE(estimator=clf, n_features_to_select=10, step=1)
selector = selector.fit(X_train, y_train)

X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

clf.fit(X_train_selected, y_train)


RandomForestClassifier(n_jobs=-1, random_state=42)

In [55]:
# Make predictions and calculate accuracy
y_pred = clf.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.6101


In [58]:
# Get the feature names from the DataFrame
feature_names = X_train.columns

# Get the boolean mask for selected features
selected_mask = selector.support_

# Get the names of the selected and non-selected features
selected_features = feature_names[selected_mask]
non_selected_features = feature_names[~selected_mask]

# Print the selected and non-selected features
print("Selected features:")
print(selected_features)

print("\nNon-selected features:")
print(non_selected_features)


Selected features:
Index(['WAvgPointsScored', 'WAvgPointsAllowed', 'LAvgPointsScored',
       'LAvgPointsAllowed', 'WFGP', 'LFGP', 'WFGP3', 'LFGP3', 'WFTP', 'LFTP'],
      dtype='object')

Non-selected features:
Index(['WScore', 'LScore', 'NumOT', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl',
       'WBlk', 'WPF', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'WSeed', 'LSeed', 'SeedDiff', 'WLoc_A', 'WLoc_H', 'WLoc_N',
       'OTCategory_1 Overtime', 'OTCategory_2 Overtimes',
       'OTCategory_3 or More Overtimes', 'OTCategory_No Overtime'],
      dtype='object')


In [59]:
from sklearn.metrics import classification_report

# Make predictions using the selected features
y_pred = clf.predict(X_test_selected)

# Calculate precision, recall, and F1-score
report = classification_report(y_test, y_pred)

print(report)


              precision    recall  f1-score   support

         0.0       0.60      0.57      0.59     10437
         1.0       0.62      0.64      0.63     11090

    accuracy                           0.61     21527
   macro avg       0.61      0.61      0.61     21527
weighted avg       0.61      0.61      0.61     21527



In [None]:
!pip install tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Preprocess the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Create a neural network model
model = Sequential([
    Dense(32, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy', Precision(), Recall()])

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=64, validation_split=0.2, verbose=2)

# Evaluate the model on the test set
_, test_acc, test_precision, test_recall = model.evaluate(X_test_scaled, y_test, verbose=0)

# Calculate F1-score
test_f1 = 2 * (test_precision * test_recall) / (test_precision + test_recall)

print(f"\nTest accuracy: {test_acc:.4f}")
print(f"Test precision: {test_precision:.4f}")
print(f"Test recall: {test_recall:.4f}")
print(f"Test F1-score: {test_f1:.4f}")

# Generate the classification report
y_pred = model.predict(X_test_scaled)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization

# Create a neural network model with more layers and neurons
model = Sequential([
    Dense(64, activation='relu', input_dim=X_train_scaled.shape[1]),
    BatchNormalization(),
    Dropout(0.2),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', Precision(), Recall()])

# Implement early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with a larger number of epochs
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=64, validation_split=0.2, callbacks=[early_stopping], verbose=2)

# Evaluate the model on the test set
_, test_acc, test_precision, test_recall = model.evaluate(X_test_scaled, y_test, verbose=0)

# Calculate F1-score
test_f1 = 2 * (test_precision * test_recall) / (test_precision + test_recall)

print(f"\nTest accuracy: {test_acc:.4f}")
print(f"Test precision: {test_precision:.4f}")
print(f"Test recall: {test_recall:.4f}")
print(f"Test F1-score: {test_f1:.4f}")

# Generate the classification report
y_pred_probs = model.predict(X_test_scaled)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()
print("\nClassification Report:\n", classification_report(y_test, y_pred))

