CSCE 4143: Data Mining

Group 7 Final Project

UFC Fight Outcome Prediction

https://www.kaggle.com/datasets/akshaysinghim/ufc-fight-data-1993-to-2023

Levi Crider, Spencer Smith, Caleb Holmes, and Ethan Weems

## Importing Dependencies and Loading Data

In [None]:
import pandas as pd
import numpy as np
# read the raw fight records from the CSV file into a DataFrame
DATA_FILE = "ufc_data.csv"
df = pd.read_csv(DATA_FILE)

In [None]:
# lift the column display limit so every column is shown, then preview the data
pd.options.display.max_columns = None
df

In [None]:
# Bring the column names the pipeline relies on to one upper-case scheme:
# trailing dots from the raw export are dropped and long labels shortened.
column_aliases = {
    'R_fighter': 'R_FIGHTER',
    'B_fighter': 'B_FIGHTER',
    'R_SIG_STR.': 'R_SIG_STR',
    'B_SIG_STR.': 'B_SIG_STR',
    'R_TOTAL_STR.': 'R_TOT_STR',
    'B_TOTAL_STR.': 'B_TOT_STR',
    'R_SIG_STR_pct': 'R_SIG_PCT',
    'B_SIG_STR_pct': 'B_SIG_PCT',
    'R_TD_pct': 'R_TD_PCT',
    'B_TD_pct': 'B_TD_PCT',
    'win_by': 'WIN_TYPE',
    'last_round': 'LAST_RD',
    'last_round_time': 'LAST_RD_TIME',
    'Format': 'NUM_RD',
    'Referee': 'REF',
    'date': 'DATE',
    'location': 'LOC',
    'Fight_type': 'TYPE',
    'Winner': 'WINNER',
}
df = df.rename(columns=column_aliases)

# Cleaning Data

In [None]:
# dropping all rows with missing values
# The percentage columns are skipped: their "---" placeholders are kept here.
attsWithPct = ["R_SIG_PCT", "B_SIG_PCT", "R_TD_PCT", "B_TD_PCT"]
colsToCheck = [attribute for attribute in df.columns if attribute not in attsWithPct]

# Calling replace()/dropna() with inplace=True on `df[column]` works on a
# column selection and never removes rows from `df` itself, so the "---"
# placeholders are marked as missing on the frame and the affected rows are
# dropped from the frame.
df[colsToCheck] = df[colsToCheck].replace("---", np.nan)
df = df.dropna(subset=colsToCheck)

# dropping all rows dated on or before April 1, 2001
# DATE is stored as an ISO "YYYY-MM-DD" string, so plain string comparison
# orders the values chronologically.
df = df.assign(DATE=pd.to_datetime(df['DATE'], format="%B %d, %Y").dt.strftime("%Y-%m-%d"))
limit_date = '2001-04-01'
df = df[df['DATE'] > limit_date]

In [None]:
# strip the literal percent sign out of every string value in the frame
df = df.replace(to_replace='%', value='', regex=True)

In [None]:
# Combine the final round number and the clock time inside that round into a
# single TOTAL_TIME column (seconds), and express control time in seconds too.

# time used up by the rounds before the final one, counting 5 minutes per round
minutes_before_last = (df['LAST_RD'].astype(int) * 5) - 5
elapsed_before_last = pd.to_timedelta(minutes_before_last, unit='m')

# "M:SS" clock strings get an hour prefix so to_timedelta accepts them
time_in_last = pd.to_timedelta('00:' + df['LAST_RD_TIME'].astype(str))

# total fight time in seconds
df['TOTAL_TIME'] = (elapsed_before_last + time_in_last).dt.total_seconds()

# control time for each corner, converted the same way
for ctrl_col in ('R_CTRL', 'B_CTRL'):
    ctrl_delta = pd.to_timedelta('00:' + df[ctrl_col].astype(str))
    df[ctrl_col] = ctrl_delta.dt.total_seconds()

# the two source columns are now folded into TOTAL_TIME
df = df.drop(columns=['LAST_RD', 'LAST_RD_TIME'])

In [None]:
# keep only the first character of the bout format string
# (presumably the scheduled number of rounds -- confirm against the raw data)
df["NUM_RD"] = df["NUM_RD"].str.slice(stop=1)

In [None]:
# display the frame to inspect the result of the cleaning steps so far
df

In [None]:
def makeTwoCols(dataFrame, origCol):
    """Split an "X of Y" column into two columns, modifying ``dataFrame`` in place.

    The values of ``origCol`` are split on the literal separator ' of '.
    The part before it is stored in ``<origCol>_SUC`` and the part after it
    in ``<origCol>_ATT``; both stay strings. ``origCol`` itself is removed.

    Parameters:
        dataFrame: the DataFrame to modify.
        origCol: name of the string column to split.

    Returns:
        None.
    """
    # Read from the frame that was passed in rather than a global `df`, so
    # the function works on any DataFrame.
    dataFrame[[origCol + "_SUC", origCol + "_ATT"]] = dataFrame[origCol].str.split(' of ', expand=True)
    dataFrame.drop(columns=[origCol], inplace=True)

In [None]:
# Every "X of Y" statistic exists once per corner; list them red first, then
# blue, for each statistic in turn.
statNames = ["SIG_STR", "TOT_STR", "TD", "HEAD", "BODY", "LEG", "DISTANCE", "CLINCH", "GROUND"]
attsToSplit = [corner + stat for stat in statNames for corner in ("R_", "B_")]

# replace each of those columns by its _SUC / _ATT pair
for colToSplit in attsToSplit:
    makeTwoCols(dataFrame=df, origCol=colToSplit)

In [None]:
# columns describing the red-corner fighter plus the shared bout attributes
R_subset = [
    'R_FIGHTER', 'R_KD', 'R_SIG_PCT', 'R_TD_PCT', 'R_SUB_ATT', 'R_GROUND_ATT',
    'WIN_TYPE', 'NUM_RD', 'TYPE',
    'R_SIG_STR_SUC', 'R_SIG_STR_ATT', 'R_TOT_STR_SUC', 'R_TOT_STR_ATT',
    'R_TD_SUC', 'R_TD_ATT', 'R_GROUND_SUC',
    'TOTAL_TIME', 'WINNER', 'LOC', 'REF', 'DATE',
]

# remove only the leading corner prefix, so names that merely contain "R_"
# (such as SIG_STR_SUC) come out intact
R_df = df[R_subset].rename(columns=lambda name: name[2:] if name.startswith('R_') else name)

# turn WINNER into a 0/1 label: 1 when the winner's name is this row's fighter
redWon = R_df['WINNER'] == R_df['FIGHTER']
R_df['WINNER'] = np.where(redWon, 1, 0)
R_df.head()

In [None]:
# columns describing the blue-corner fighter plus the shared bout attributes
B_subset = ['B_FIGHTER','B_KD','B_SIG_PCT','B_TD_PCT', 'B_SUB_ATT', 'B_GROUND_ATT', 'WIN_TYPE', 'NUM_RD', 'TYPE',
                'B_SIG_STR_SUC', 'B_SIG_STR_ATT', 'B_TOT_STR_SUC', 'B_TOT_STR_ATT', 'B_TD_SUC', 'B_TD_ATT',
                'B_GROUND_SUC', 'TOTAL_TIME','WINNER', 'LOC', 'REF', 'DATE']

# Remove only the leading corner prefix. Replacing "B_" everywhere would also
# damage names that merely contain it (SUB_ATT would become SUATT).
B_df = df[B_subset].rename(columns=lambda name: name[2:] if name.startswith('B_') else name)

# Turn WINNER into a 0/1 label: 1 when the winner's name is this row's fighter.
# The comparison must use the blue-corner frame itself; comparing against the
# red-corner frame would not label the blue fighters by their own results.
B_df['WINNER'] = np.where(B_df['WINNER'] == B_df['FIGHTER'], 1, 0)
B_df.head()

In [None]:
# stack the red-corner rows on top of the blue-corner rows
new_df = pd.concat([R_df, B_df])

# continue with the combined frame under a fresh 0..n-1 index
# (the rows are not re-sorted; they stay in concatenation order)
df = new_df.reset_index(drop=True)
df

In [None]:
# any "---" placeholder still present is treated as the number 0
df.replace(to_replace="---", value=0, inplace=True)

In [None]:
# list the distinct bout-type labels before they are normalised
df['TYPE'].unique()

In [None]:
# Remove event, series and country words from the bout-type label, then
# abbreviate "Women's" to "W". The tokens are removed in this order.
typeTokensToRemove = [
    'Bout', 'Title', 'Tournament', 'Ultimate Fighter', 'UFC', 'Interim',
    'Brazil', 'America', 'China', 'TUF', 'Australia', 'Nations', 'Canada',
    'vs.', 'UK', 'Latin',
]

# regex=False is passed explicitly: every token is meant literally, and the
# default of `regex` differs between pandas versions, which would otherwise
# make the "." in "vs." match any character on some installations.
for token in typeTokensToRemove:
    df['TYPE'] = df['TYPE'].str.replace(token, '', regex=False)
df['TYPE'] = df['TYPE'].str.replace("Women's", 'W', regex=False)
df['TYPE'].unique()

In [None]:
# Remove numbers first and trim afterwards. Trimming before the digits are
# removed leaves a leading space behind (e.g. "17 Middleweight" would become
# " Middleweight"), which would count as a separate category.
df['TYPE'] = df['TYPE'].str.replace(r'\d+', '', regex=True) # remove numbers
df['TYPE'] = df['TYPE'].str.replace(r'\s+', ' ', regex=True) # collapse runs of whitespace
df['TYPE'] = df['TYPE'].str.strip() # remove surrounding whitespace
df['TYPE'].unique()

In [None]:
# Cast every column whose first value starts with a digit to int16.
# DATE is excluded explicitly because its ISO date strings also start with a digit.
for colName in df.columns:
    firstValue = str(df[colName].iloc[0])
    if not firstValue[0].isnumeric() or colName == "DATE":
        continue
    df[colName] = pd.to_numeric(df[colName]).astype("int16")

In [None]:
from sklearn.preprocessing import OneHotEncoder
# categorical attributes to be one-hot encoded, in encoding order
attsToEncode = [
    'REF',
    'TYPE',
    'DATE',
    'WIN_TYPE',
    'LOC',
    'FIGHTER',
]

In [None]:
# One-hot encode the categorical attributes and swap them for indicator columns.

# encoder with default settings
encoder = OneHotEncoder()

# the categorical part of the frame
df_to_encode = df[attsToEncode]

# learn the categories and encode in one step
encoded_data = encoder.fit_transform(df_to_encode)

# densify the result and label each indicator column after its source value
encoded_columns = encoder.get_feature_names_out(attsToEncode)
encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoded_columns)

# append the indicator columns, then remove the original categorical columns
df_encoded = pd.concat([df, encoded_df], axis=1).drop(columns=attsToEncode)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# separate the WINNER label from the feature columns
y = df_encoded['WINNER']
X = df_encoded.drop(columns=['WINNER'])

# hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, test_size=0.2)

# fit a decision tree on the training part
clf = DecisionTreeClassifier(random_state=41).fit(X_train, y_train)

# mean accuracy on the held-out part
accuracy = clf.score(X_test, y_test)
print(f"Accuracy of the Decision Tree classifier is: {accuracy:.4f}")

# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# separate the WINNER label from the feature columns
y = df_encoded['WINNER']
X = df_encoded.drop(columns=['WINNER'])

# hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, test_size=0.2)

# fit a Gaussian naive Bayes model on the training part
gnb = GaussianNB().fit(X_train, y_train)

# predict the held-out part and compare against the true labels
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Naive Bayes classifier is: {accuracy:.4f}")


# KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# separate the WINNER label from the feature columns
y = df_encoded['WINNER']
X = df_encoded.drop(columns=['WINNER'])

# hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, test_size=0.2)

# fit a 5-nearest-neighbours model on the training part
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# mean accuracy on the held-out part
accuracy = knn.score(X_test, y_test)
print(f"Accuracy of the K-Nearest Neighbors classifier is: {accuracy:.4f}")

# SVM Classifier

In [None]:
from sklearn.svm import SVC

# separate the WINNER label from the feature columns
y = df_encoded['WINNER']
X = df_encoded.drop(columns=['WINNER'])

# hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, test_size=0.2)

# fit a support vector classifier on the training part
svc = SVC(random_state=41).fit(X_train, y_train)

# predict the held-out part and compare against the true labels
y_pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of the Support Vector Classifier is: {accuracy:.4f}")

# Neural Network Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# separate the WINNER label from the feature columns
y = df_encoded['WINNER']
X = df_encoded.drop(columns=['WINNER'])

# hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77, test_size=0.2)

# multi-layer perceptron with two hidden layers of 100 units each
clf = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    activation='relu',
    solver='adam',
    random_state=42,
).fit(X_train, y_train)

# predict the held-out part and compare against the true labels
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Neural Network classifier is: {accuracy:.4f}")
