In [None]:
import clf
import np
#Run below once to install required packages
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install scipy
!pip install numpy
!pip install seaborn
!pip install xgboost

In [None]:
#Run below once to import required packages
import math

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Split configuration; both values are passed as `test_size` to train_test_split.
TESTSIZE = 0.3        # fraction of the full dataset held out as the test set
VALIDSPLITSIZE = 0.1  # fraction of the remaining training rows held out for validation

# Load the raw dataset and remember its column names and first row.
data = pd.read_csv('../dataset/masterdataframe.csv')
feature_cols = data.columns
feature_firstdata = data.iloc[0]

# Columns that are never used as model inputs.
forbidden_features = ['result', 'dob']

# Keep every column that is neither forbidden nor a '*_url' column.
filtered_features = [
    feature
    for feature in feature_cols
    if feature not in forbidden_features and not feature.endswith('_url')
]

# Convert the 'date' column to seconds since the Unix epoch.
# Unparseable dates are coerced to NaT. Casting NaT straight to int64 turns it
# into a huge negative number that looks like a real timestamp, so those rows
# are masked back to NaN and stay recognisable as missing values.
parsed_dates = pd.to_datetime(data['date'], errors='coerce')
data['date'] = (parsed_dates.astype('int64') / 10**9).where(parsed_dates.notna())

# Convert the 'time' column from "mm:ss" to a number of seconds.
data[['minutes', 'seconds']] = data['time'].str.split(':', expand=True)
data['time'] = pd.to_numeric(data['minutes']) * 60 + pd.to_numeric(data['seconds'])

# Label-encode the categorical columns in place.
# The single encoder is refitted for every column, so after the loop `le`
# only holds the mapping of the last column ('time_format').
le = LabelEncoder()
categorical_columns = (
    'fighter', 'opponent', 'division', 'stance', 'method', 'referee', 'time_format',
)
for column in categorical_columns:
    data[column] = le.fit_transform(data[column])

# Model inputs are the allowed feature columns; the target is the 'result' column.
X, Y = data[filtered_features], data['result']

# Split 1: hold out TESTSIZE of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TESTSIZE,
    random_state=44,
)

# Split 2: carve the validation set out of the remaining training rows.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=VALIDSPLITSIZE,
    random_state=44,
)

# Fit a LightGBM classifier with default hyperparameters on the training split
# (fit returns the estimator itself, so construction and fitting can be chained).
classifier = LGBMClassifier().fit(x_train, y_train)

# Predicted class labels for the test split.
y_pred = classifier.predict(x_test)

In [None]:
# Share of missing values in the 'dob' column.
df = pd.DataFrame(data)

# Boolean mask of the rows whose 'dob' is missing; reused for both results below.
dob_is_missing = df['dob'].isna()
nan_values = df.loc[dob_is_missing, 'dob']
nan_percentage = (dob_is_missing.sum() / len(df['dob'])) * 100

print("NaN-values in column 'dob (date of birth)':")
print(nan_values)
print(f"Percentage NaN-values in column 'dob': {nan_percentage:.2f}%")

In [None]:
# Plot the 30 largest feature importances of the fitted classifier.
# The index is taken from x_train (the frame the classifier was fitted on)
# instead of X: the last cell rebinds X to a single-column frame, which makes
# this cell fail with a length mismatch when it is re-run afterwards.
feat_imp = pd.Series(classifier.feature_importances_, index=x_train.columns)
ax = feat_imp.nlargest(30).plot(kind='barh', figsize=(8, 10))
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Top 30 feature importances');

In [None]:
# Log loss plot

# LightGBM training parameters (binary objective, evaluated with binary log loss).
NUM_ITERATIONS = 750
LEARNING_RATE = 0.02
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_iterations=NUM_ITERATIONS,
    learning_rate=LEARNING_RATE,
)

# Wrap each split in a LightGBM Dataset; the validation and test Datasets
# reference the training Dataset.
train_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_val, reference=train_data, label=y_val)
test_data = lgb.Dataset(x_test, reference=train_data, label=y_test)

# Per-iteration metric histories, one list per split, filled by the callback.
evals_result_train, evals_result_val, evals_result_test = (
    {'train_acc': []},
    {'val_acc': []},
    {'test_acc': []},
)

# Custom callback function to collect evaluation results
def validation_callback(env):
    """Append the current iteration's metric value of each dataset to its history.

    Reads the third field (index 2) of the first three entries of
    ``env.evaluation_result_list`` and appends them, in that order, to the
    module-level train, validation and test history lists. All three values
    are read before anything is appended, so a result list with fewer than
    three entries raises IndexError without modifying the histories.

    Note: the history keys end in ``_acc``, but the stored value is whatever
    metric the training run reports.
    """
    results = env.evaluation_result_list
    train_metric, val_metric, test_metric = [results[slot][2] for slot in range(3)]

    histories = (
        (evals_result_train['train_acc'], train_metric),
        (evals_result_val['val_acc'], val_metric),
        (evals_result_test['test_acc'], test_metric),
    )
    for history, value in histories:
        history.append(value)

# Train the booster; the callback records the metric of every dataset in
# valid_sets (train, validation, test - in that order) after each iteration.
# NUM_ITERATIONS is passed explicitly: `params` also carries 'num_iterations',
# which LightGBM uses in preference to num_boost_round, so the previous
# literal 100 was never the number of rounds actually trained.
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data, test_data],
    num_boost_round=NUM_ITERATIONS,
    callbacks=[validation_callback],
)

# Predictions on the test set. With the binary objective these are
# probabilities, so they are rounded to obtain 0/1 class labels.
# NOTE: this rebinds y_pred to probabilities, not class labels.
y_pred = model.predict(x_test, num_iteration=model.best_iteration)
y_pred_class = np.round(y_pred)

# Accuracy on the test set
accuracy = accuracy_score(y_test, y_pred_class)
print(f'Accuracy on test set: {accuracy:.4f}')

# Slope of the recorded training curve. Despite the '*_acc' key names, the
# recorded values are the metric configured in `params` (binary log loss).
grad_train_acc = np.gradient(evals_result_train['train_acc'])

# A slope counts as "flat" when its magnitude is below this share of the steepest slope.
threshold_percentage = 0.05

# First iteration at which the training curve is flat. If the curve never
# flattens, fall back to the last iteration instead of raising IndexError.
flat_iterations = np.flatnonzero(
    np.abs(grad_train_acc) < threshold_percentage * np.max(np.abs(grad_train_acc))
)
cutoff_point_index = flat_iterations[0] if flat_iterations.size else len(grad_train_acc) - 1

# Recorded metric over iterations
plt.plot(evals_result_train['train_acc'], label='Train')
plt.plot(evals_result_val['val_acc'], label='Validation')
#plt.plot(evals_result_test['test_acc'], label='Test')
plt.axvline(x=cutoff_point_index, color='red', linestyle='--', label='Cutoff line')

# Marker for the cutoff point (value of the test curve at the cutoff iteration)
plt.scatter(cutoff_point_index, evals_result_test['test_acc'][cutoff_point_index], color='red', marker='o', label='Cutoff point (Test Set)')

plt.xlabel('Iterations')
plt.ylabel('Log loss')
plt.title('Learning Curve')
plt.legend()
plt.show()

In [None]:
# Training data graph: training error after each boosting round.
#
# The previous loop refitted the same classifier on the same data with the same
# hyperparameters 100 times and scored the full model each time, so every
# "epoch" measured an equivalent model and the curve showed no progress.
# The classifier is now fitted once and its training error is measured using
# only the first `epoch` boosting rounds.
classifier.fit(x_train, y_train)

train_errors = []
n_rounds = classifier.booster_.current_iteration()  # boosting rounds actually fitted

for epoch in range(1, n_rounds + 1):
    # Predict with the first `epoch` rounds only.
    partial_pred = classifier.predict(x_train, num_iteration=epoch)
    # Error rate = 1 - accuracy.
    train_errors.append(1 - accuracy_score(y_train, partial_pred))

# Plot training error against the number of boosting rounds used.
plt.plot(range(1, n_rounds + 1), train_errors, label='Training Error')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.title('Training Progress')
plt.legend()
plt.show()

In [None]:
# Evaluate the classifier on the test set.
# Predictions are recomputed here because the log-loss cell rebinds y_pred to
# the probabilities returned by the lgb.train booster; feeding those to
# accuracy_score / confusion_matrix fails (binary vs. continuous targets).
y_pred = classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)

# Confusion matrix heatmap.
# NOTE(review): the tick labels assume the sorted 'result' labels mean
# loss first, then win - confirm against the dataset.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Loss', 'Win'], yticklabels=['Loss', 'Win'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

print(classification_report(y_test, y_pred))
print("Amount of features", len(filtered_features))

print(data.shape)
#sns.pairplot(data, hue = 'result')

In [None]:
# Class imbalance

from mlxtend.plotting import plot_confusion_matrix

# mlxtend's plot_confusion_matrix draws a precomputed confusion matrix; it does
# not accept (estimator, X, y). Compute the matrix from the classifier's
# test-set predictions first.
plot_confusion_matrix(conf_mat=confusion_matrix(y_test, classifier.predict(x_test)))
plt.title('Confusion Matrix')
plt.show()

# Overlay the train and test distribution of every model feature.
# NOTE(review): `features` was never defined in the notebook; the columns of
# x_train are assumed to be what was meant - confirm.
features = list(x_train.columns)

# Size the subplot grid from the number of features (a fixed 4x4 grid
# overflows as soon as there are more than 16 features).
n_cols = 4
n_rows = max(1, math.ceil(len(features) / n_cols))

plt.figure(figsize=(12, 2.5 * n_rows))
for position, feature in enumerate(features, start=1):
    plt.subplot(n_rows, n_cols, position)
    # Drop NaN values: plt.hist cannot derive a bin range from data containing NaN.
    plt.hist(x_train[feature].dropna(), bins=20, color='blue', alpha=0.7, label='Training Data')
    plt.hist(x_test[feature].dropna(), bins=20, color='red', alpha=0.7, label='Testing Data')
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Non-allowed graph: it evaluates on the test set, which must not be used to
# choose the number of estimators. Kept for illustration only.
train_accuracy = []
test_accuracy = []
n_estimators_range = range(1, 101, 5)

# Fit a single model with the largest ensemble size and read off its
# predictions after each boosting stage, instead of refitting a new model for
# every ensemble size.
clf = GradientBoostingClassifier(n_estimators=max(n_estimators_range))
clf.fit(x_train, y_train)

wanted_sizes = set(n_estimators_range)
staged_predictions = zip(clf.staged_predict(x_train), clf.staged_predict(x_test))
for n_estimators, (train_pred, test_pred) in enumerate(staged_predictions, start=1):
    # Only record the ensemble sizes that are plotted.
    if n_estimators not in wanted_sizes:
        continue
    train_accuracy.append(accuracy_score(y_train, train_pred))
    test_accuracy.append(accuracy_score(y_test, test_pred))

plt.plot(n_estimators_range, train_accuracy, label='Training Accuracy')
plt.plot(n_estimators_range, test_accuracy, label='Testing Accuracy')
plt.xlabel('Number of Estimators')
plt.ylabel('Accuracy')
plt.title('Training and Testing Accuracy vs Number of Estimators')
plt.legend()
plt.show()

In [None]:
# Accuracy as a function of the number of boosting rounds: 5-fold
# cross-validated accuracy on the training split versus accuracy on the
# held-out validation split. (Earlier attempt to also plot epoch 0 at 50% is
# kept as commented-out code below.)

from sklearn.model_selection import cross_val_score

#Training
NUM_ESTIMATORS = 100
train_accuracies = []
test_accuracies = []
validation_accuracies = []

for epoch in range(1, NUM_ESTIMATORS + 1):
    classifier.set_params(n_estimators=epoch)

    # Cross-validation on training data
    train_acc = np.mean(cross_val_score(classifier, x_train, y_train, cv=5, scoring='accuracy'))
    train_accuracies.append(train_acc)

    # Validate on the held-out validation split. The previous code unpacked
    # train_test_split in the wrong order, which bound the *training* part of a
    # differently seeded split to x_test / y_test: that data overlaps x_train
    # (inflated scores) and clobbered the real test set for later cells.
    classifier.fit(x_train, y_train)
    y_val_pred = classifier.predict(x_val)
    validation_acc = accuracy_score(y_val, y_val_pred)
    validation_accuracies.append(validation_acc)

# Plotting the training and validation accuracies
epochs = np.arange(1, NUM_ESTIMATORS + 1)
plt.plot(epochs, train_accuracies, label='Training Accuracy')
plt.plot(epochs, validation_accuracies, label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracies over Epochs')
plt.legend()
plt.show()

# # Calculate accuracy at epoch 0
# y_train_pred_0 = classifier.staged_predict(x_train).__next__()
# y_test_pred_0 = classifier.staged_predict(x_test).__next__()
# train_accuracies.append(accuracy_score(y_train, y_train_pred_0))
# test_accuracies.append(accuracy_score(y_test, y_test_pred_0))
# 
# # Calculate accuracy for subsequent epochs
# for epoch, (y_train_pred, y_test_pred) in enumerate(zip(classifier.staged_predict(x_train), classifier.staged_predict(x_test)), start=1):
#     train_accuracies.append(accuracy_score(y_train, y_train_pred))
#     test_accuracies.append(accuracy_score(y_test, y_test_pred))
# 
# # Plotting the training and test accuracies
# epochs = np.arange(0, len(train_accuracies))  # Include epoch 0
# plt.plot(epochs, train_accuracies, label='Training Accuracy')
# plt.plot(epochs, test_accuracies, label='Test Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.title('Training and Test Accuracies over Epochs')
# plt.axvline(x=np.argmax(test_accuracies), color='r', linestyle='--', label='Best Test Accuracy')
# plt.legend()
# plt.show()

In [None]:
# Simple linear regression of 'result' on 'reach', using complete rows only.
# NOTE(review): dropna() removes rows with a missing value in ANY column,
# including columns this cell does not use - confirm that is intended.
df = data.dropna()

# NOTE: this rebinds the notebook-wide X and Y to the single-feature data.
X = df[['reach']]
Y = df['result']

print(X.shape)
print(Y.shape)

#for (columnName) in chosen_data:
    #print(chosen_data[columnName])

regressor = LinearRegression()
regressor.fit(X, Y)
y_pred = regressor.predict(X)

# Scatter of the data with the fitted line; labels name the plotted columns.
plt.scatter(X, Y, color='red')
plt.plot(X, y_pred, color='blue')
plt.title('reach vs result')
plt.xlabel('reach')
plt.ylabel('result')
plt.show()

#print(chosen_data.shape)