In [None]:
#Run below once to install required packages
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install scipy
!pip install numpy
!pip install seaborn
!pip install xgboost

In [None]:
from decimal import Decimal
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
import sklearn.ensemble
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import numpy as np
import math
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Constants
TESTSIZE = 0.3 # Fraction of the rows held out for testing (0.3 == 30%).
# TODO 0.3 seems to produce better results than 0.2, overfitting?

# Import data
data = pd.read_csv('../dataset/masterdataframe.csv')
# Column labels and the first row of the dataset; filter_features() reads both.
feature_cols = data.columns
feature_firstdata = data.iloc[0]

# Only use float-valued columns as model features
def filter_features(columns=None, sample_row=None):
    """Return the names of the columns that can be used as model features.

    A column is kept when its value in ``sample_row`` is a ``float`` that is
    not NaN. The ``'result'`` column is always skipped.

    Args:
        columns: Column labels, aligned position-by-position with
            ``sample_row``. Defaults to the module-level ``feature_cols``.
        sample_row: A pandas Series holding one row of the dataset. Defaults
            to the module-level ``feature_firstdata``.

    Returns:
        list: The selected column names, in their original order.
    """
    if columns is None:
        columns = feature_cols
    if sample_row is None:
        sample_row = feature_firstdata

    valid_features = []
    for name, value in zip(columns, sample_row.values):
        if name == 'result':
            continue
        # isinstance() already excludes None, so no separate None check is
        # needed. The former `value is not pd.isna(value)` compared a float
        # with a bool by identity and was therefore always true.
        if isinstance(value, float) and not math.isnan(value):
            valid_features.append(name)
    return valid_features

# ['reach', 'height', 'age', 'knockdowns', 'takedowns_landed', 
#            'sig_strikes_landed', 'total_strikes_landed', 'head_strikes_landed', 
#            'body_strikes_landed', 'total_strikes_accuracy', 'head_strikes_accuracy', 
#            'height_differential', 'sub_attempts']


# Transform dates of birth to epoch seconds. NOTE: including DOB decreased accuracy by 0.11%
# data['dob'] = pd.to_datetime(data['dob'], errors='coerce')
# data['dob'] = data['dob'].apply(lambda x: (x - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s') if pd.notna(x) else np.nan)
# mean_dob = data['dob'].mean()
# data['dob'] = data['dob'].fillna(mean_dob)

# PREPARE DATA
# Keep only the rows without a missing value in any column
df = data.dropna()
filtered_features = filter_features()

# Feature matrix and target column
X = df[filtered_features]
Y = df['result']

# Train/test split
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TESTSIZE,
    random_state=44,
)

# Model under evaluation. Alternatives that can be swapped in:
#   GradientBoostingClassifier()
#   CatBoostClassifier()
#   XGBClassifier()
#   MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=200000)
#   AdaBoostClassifier()
#   DecisionTreeClassifier()
#   RandomForestClassifier()
#   KNeighborsClassifier()
#   SVC(kernel='rbf')
#   SVC(kernel='linear')
#   LinearRegression()
classifier = LGBMClassifier()


# 
# gb_model = GradientBoostingClassifier()
# 
# # Define the parameter grid to search in
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5]
# }
# 
# grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, scoring='accuracy')
# 
# # Execute grid search on data
# grid_search.fit(x_train, y_train)
# 
# # Retrieve the best parameters
# best_params = grid_search.best_params_
# 
# # Use model with the best parameters
# best_gb_model = grid_search.best_estimator_

# Fit the classifier on the training split
classifier.fit(x_train, y_train)

# Predict the outcome for the held-out test rows
y_pred = classifier.predict(x_test) # alternative when the grid search is enabled: best_gb_model.predict(x_test)

In [None]:
# Score the test-set predictions: overall accuracy and the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)

# Draw the confusion matrix as an annotated heatmap
tick_labels = ['Loss', 'Win']
fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

# Per-class precision/recall, plus the size of the feature set and the raw data
print(classification_report(y_test, y_pred))
print("Amount of features", len(filtered_features))

print(data.shape)
#sns.pairplot(data, hue = 'result')

In [None]:
#Check for class imbalance

# from mlxtend.plotting import plot_confusion_matrix
# 
# 
# plot_confusion_matrix(classifier, x_test, y_test)
# plt.title('Confusion Matrix')
# plt.show()
# 
# 
# plt.figure(figsize=(12, 10))
# for feature in features:
#     plt.subplot(4, 4, features.index(feature) + 1)
#     plt.hist(x_train[feature], bins=20, color='blue', alpha=0.7, label='Training Data')
#     plt.hist(x_test[feature], bins=20, color='red', alpha=0.7, label='Testing Data')
#     plt.title(f'Distribution of {feature}')
#     plt.xlabel(feature)
#     plt.ylabel('Frequency')
#     plt.legend()
# 
# plt.tight_layout()
# plt.show()

In [None]:
# Training vs. testing accuracy as a function of the number of estimators.
# NOTE: this graph scores against the test set, so it is not allowed as a
# basis for choosing n_estimators; it is kept for inspection only.
train_accuracy = []
test_accuracy = []
n_estimators_range = range(1, 101, 5)

# Fit once at the largest size and read the intermediate models back with
# staged_predict instead of refitting a separate model for every size.
# The seed makes the curve reproducible between runs.
clf = GradientBoostingClassifier(n_estimators=max(n_estimators_range), random_state=44)
clf.fit(x_train, y_train)

staged_predictions = zip(clf.staged_predict(x_train), clf.staged_predict(x_test))
for n_estimators, (train_pred, test_pred) in enumerate(staged_predictions, start=1):
    # Only keep the stages that are part of the plotted range
    if n_estimators in n_estimators_range:
        train_accuracy.append(accuracy_score(y_train, train_pred))
        test_accuracy.append(accuracy_score(y_test, test_pred))

plt.plot(n_estimators_range, train_accuracy, label='Training Accuracy')
plt.plot(n_estimators_range, test_accuracy, label='Testing Accuracy')
plt.xlabel('Number of Estimators')
plt.ylabel('Accuracy')
plt.title('Training and Testing Accuracy vs Number of Estimators')
plt.legend()
plt.show()

In [None]:
# Accuracy as a function of the number of estimators, built from the training
# data only: every setting is scored with 5-fold cross-validation on x_train,
# so the held-out test set is never used to shape this curve.

from sklearn.base import clone
from sklearn.model_selection import cross_val_score, cross_validate

#TODO Once the elbow is established from this curve, score the test data once at that point and plot it as a single dot.

#Training
NUM_ESTIMATORS = 100
train_accuracies = []
validation_accuracies = []

for epoch in range(1, NUM_ESTIMATORS + 1):
    # Work on an unfitted copy so the module-level `classifier` keeps its
    # own parameters and fitted state.
    model = clone(classifier).set_params(n_estimators=epoch)

    # One call yields both curves: accuracy on the folds the model was fitted
    # on (training) and on the fold that was held out (validation).
    cv_scores = cross_validate(
        model,
        x_train,
        y_train,
        cv=5,
        scoring='accuracy',
        return_train_score=True,
    )
    train_accuracies.append(np.mean(cv_scores['train_score']))
    validation_accuracies.append(np.mean(cv_scores['test_score']))

# Plotting the training and validation accuracies
epochs = np.arange(1, NUM_ESTIMATORS + 1)
plt.plot(epochs, train_accuracies, label='Training Accuracy')
plt.plot(epochs, validation_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracies over Epochs')
plt.legend()
plt.show()

# # Calculate accuracy at epoch 0
# y_train_pred_0 = classifier.staged_predict(x_train).__next__()
# y_test_pred_0 = classifier.staged_predict(x_test).__next__()
# train_accuracies.append(accuracy_score(y_train, y_train_pred_0))
# test_accuracies.append(accuracy_score(y_test, y_test_pred_0))
# 
# # Calculate accuracy for subsequent epochs
# for epoch, (y_train_pred, y_test_pred) in enumerate(zip(classifier.staged_predict(x_train), classifier.staged_predict(x_test)), start=1):
#     train_accuracies.append(accuracy_score(y_train, y_train_pred))
#     test_accuracies.append(accuracy_score(y_test, y_test_pred))
# 
# # Plotting the training and test accuracies
# epochs = np.arange(0, len(train_accuracies))  # Include epoch 0
# plt.plot(epochs, train_accuracies, label='Training Accuracy')
# plt.plot(epochs, test_accuracies, label='Test Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.title('Training and Test Accuracies over Epochs')
# plt.axvline(x=np.argmax(test_accuracies), color='r', linestyle='--', label='Best Test Accuracy')
# plt.legend()
# plt.show()

In [None]:
# Simple linear regression of 'result' on 'reach' alone.
# NOTE: this cell rebinds df, X, Y and y_pred, which earlier cells also define.
df = data.dropna()  # rows without any missing value

X = df[['reach']]
Y = df['result']

print(X.shape)
print(Y.shape)

regressor = LinearRegression()
regressor.fit(X, Y)
y_pred = regressor.predict(X)

# Observations in red, fitted line in blue; labels name the plotted columns
plt.scatter(X, Y, color = 'red')
plt.plot(X, y_pred, color = 'blue')
plt.title('Linear regression of result on reach')
plt.xlabel('reach')
plt.ylabel('result')
plt.show()

In [ ]:
# TODO 1. Do multiple takes of training using the various classifiers
# TODO 2. Make a graph with the average of the outcome of those tests
# TODO 3. Show the residuals (lines in between of average and outliers)

In [ ]:
# Graph 1: Training data 