In [1]:
import sqlite3
import pandas as pd



In [2]:
# Connect to the SQLite database
conn = sqlite3.connect(r"C:\Users\Sarthak\Documents\ML\sherdog-api\lib\mma_fighters.db")

def get_all_fighter_data():
    query = '''
    SELECT f.*, fh.* 
    FROM fighters f
    LEFT JOIN fights fh ON f.id = fh.fighter_id
    '''

    all_fighter_data = pd.read_sql_query(query, conn)
    return all_fighter_data

In [3]:
def preprocess_fighter_data(df):
    # Convert event_date to datetime and sort by fighter name and date
    df['event_date'] = pd.to_datetime(df['event_date'], errors='coerce')
    df = df.sort_values(by=['name', 'event_date'])
    
    # Calculate cumulative wins and losses
    df['wins_so_far'] = df.groupby('name', group_keys=False)['result'].apply(lambda x: (x == 'win').cumsum() - (x == 'win')).reset_index(drop=True)
    df['losses_so_far'] = df.groupby('name', group_keys=False)['result'].apply(lambda x: (x == 'loss').cumsum() - (x == 'loss')).reset_index(drop=True)

    # Calculate cumulative wins by specific methods
    df['wins_knockouts_so_far'] = df.groupby('name', group_keys=False).apply(
        lambda x: ((x['result'] == 'win') & (x['method'].str.contains("KO|TKO", na=False))).cumsum()
    ).reset_index(drop=True)
    df['wins_submissions_so_far'] = df.groupby('name', group_keys=False).apply(
        lambda x: ((x['result'] == 'win') & (x['method'].str.contains("Submission", na=False))).cumsum()
    ).reset_index(drop=True)
    df['wins_decisions_so_far'] = df.groupby('name', group_keys=False).apply(
        lambda x: ((x['result'] == 'win') & (x['method'].str.contains("Decision", na=False))).cumsum()
    ).reset_index(drop=True)
    df['wins_others_so_far'] = df['wins_so_far'] - (df['wins_knockouts_so_far'] + df['wins_submissions_so_far'] + df['wins_decisions_so_far'])

    # Calculate days since last fight
    df['days_since_last_fight'] = df.groupby('name')['event_date'].diff().dt.days.fillna(0)

    # Calculate recent win streak (last 3 fights)
    df['recent_win_streak'] = df.groupby('name')['result'].apply(
        lambda x: (x.eq('win').cumsum() - x.eq('loss').cumsum().where(x == 'loss').ffill().fillna(0)).astype(int)
    ).reset_index(drop=True)

    # Drop unnecessary columns
    cols_to_drop = ['nickname', 'locality', 'nationality', 'image_url', 'url', 'referee', 'id', 'fighter_id']
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

    return df


In [4]:

# Retrieve all fighter data and preprocess
all_fighters_data = get_all_fighter_data()
preprocessed_data = preprocess_fighter_data(all_fighters_data)

print(preprocessed_data.columns)
# Close the database connection
conn.close()


Index(['name', 'age', 'birthday', 'association', 'height', 'weight',
       'weight_class', 'wins_total', 'wins_knockouts', 'wins_submissions',
       'wins_decisions', 'wins_others', 'losses_total', 'losses_knockouts',
       'losses_submissions', 'losses_decisions', 'losses_others',
       'no_contests', 'event_name', 'event_date', 'result', 'method', 'round',
       'time', 'opponent', 'wins_so_far', 'losses_so_far',
       'wins_knockouts_so_far', 'wins_submissions_so_far',
       'wins_decisions_so_far', 'wins_others_so_far', 'days_since_last_fight',
       'recent_win_streak'],
      dtype='object')


In [5]:
preprocessed_data = preprocessed_data[:365]

In [6]:
preprocessed_data[40:50]

Unnamed: 0,name,age,birthday,association,height,weight,weight_class,wins_total,wins_knockouts,wins_submissions,...,time,opponent,wins_so_far,losses_so_far,wins_knockouts_so_far,wins_submissions_so_far,wins_decisions_so_far,wins_others_so_far,days_since_last_fight,recent_win_streak
340,Charles Oliveira,34,"Oct 17, 1989",Chute Boxe Diego LimaGold Team,"5'10""",155 lbs,Lightweight,34,10,21,...,0:19,Michael Chandler,32,3,21,6,6,-1,154.0,30
341,Charles Oliveira,34,"Oct 17, 1989",Chute Boxe Diego LimaGold Team,"5'10""",155 lbs,Lightweight,34,10,21,...,1:02,Dustin Poirier,33,3,21,7,6,-1,210.0,31
342,Charles Oliveira,34,"Oct 17, 1989",Chute Boxe Diego LimaGold Team,"5'10""",155 lbs,Lightweight,34,10,21,...,3:22,Justin Gaethje,34,3,21,8,6,-1,147.0,32
343,Charles Oliveira,34,"Oct 17, 1989",Chute Boxe Diego LimaGold Team,"5'10""",155 lbs,Lightweight,34,10,21,...,3:16,Islam Makhachev,35,3,21,9,6,-1,168.0,33
344,Charles Oliveira,34,"Oct 17, 1989",Chute Boxe Diego LimaGold Team,"5'10""",155 lbs,Lightweight,34,10,21,...,4:10,Beneil Dariush,36,3,22,9,6,-1,231.0,34
345,Charles Oliveira,34,"Oct 17, 1989",Chute Boxe Diego LimaGold Team,"5'10""",155 lbs,Lightweight,34,10,21,...,5:00,Arman Tsarukyan,37,3,23,9,6,-1,308.0,35
60,Ciryl Gane,34,"May 6, 1990",MMA Factory,"6'5""",250 lbs,Heavyweight,12,6,3,...,1:42,Bobby Sullivan,0,0,0,0,1,-1,0.0,1
58,Ciryl Gane,34,"May 6, 1990",MMA Factory,"6'5""",250 lbs,Heavyweight,12,6,3,...,4:57,Adam Dyczka,11,1,5,3,3,0,50.0,9
59,Ciryl Gane,34,"May 6, 1990",MMA Factory,"6'5""",250 lbs,Heavyweight,12,6,3,...,4:26,Roggers Souza,11,2,6,3,3,-1,245.0,10
65,Ciryl Gane,34,"May 6, 1990",MMA Factory,"6'5""",250 lbs,Heavyweight,12,6,3,...,4:12,Raphael Pessoa,3,2,1,1,1,0,78.0,0


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # or another classifier if preferred
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Optional: Standardizing data if needed
from sklearn.preprocessing import StandardScaler

In [None]:
features = preprocessed_data[['height', '','wins_so_far', 'losses_so_far', 'recent_win_streak', ]]  # include chosen features
labels = preprocessed_data['result']  #

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)


In [None]:
model = RandomForestClassifier()
model.fit(X_train, y_train)


In [11]:
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Model accuracy:", accuracy)


Model accuracy: 0.8767123287671232


In [15]:
features

Unnamed: 0,wins_so_far,losses_so_far,recent_win_streak
320,0,0,1
327,1,0,2
328,2,0,3
329,3,0,4
324,4,0,5
...,...,...,...
90,21,2,20
91,22,2,21
92,23,2,20
93,23,3,21
