In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os


In [None]:
# Load the raw fight data from Resources/data.csv and preview the first five rows.
ufc_df = pd.read_csv(filepath_or_buffer="Resources/data.csv")
ufc_df.head(n=5)


In [None]:
# Remove the columns that are not going to be used as model inputs,
# then preview the slimmed-down frame.
ufc_df = ufc_df.drop(
    [
        "BPrev", "RPrev", "BStreak",
        "B_Location", "R_Location",
        "Event_ID", "Fight_ID", "B_ID", "R_ID",
        "B_HomeTown", "R_HomeTown",
        "Date",
    ],
    axis="columns",
)
ufc_df.head()


In [None]:
# Retain only the rows whose `winner` is one of the two corners ("blue" or "red");
# every other outcome value (and missing values) is filtered out.
ufc_df = ufc_df.loc[ufc_df["winner"].isin(["blue", "red"])]


## Logistic Regression


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression

# Target is the winning corner; every remaining column is a candidate feature.
y = ufc_df["winner"]
X = ufc_df.drop(columns="winner")

# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Numeric branch: fill missing values using the imputer's "constant" strategy,
# then standardize each feature (zero mean, unit variance) with `StandardScaler()`.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode; categories unseen during fit are ignored.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route columns to a branch by dtype: `object` columns are treated as
# categorical, everything else as numeric.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, selector(dtype_exclude="object")),
        ("cat", categorical_transformer, selector(dtype_include="object")),
    ]
)

# Full prediction pipeline: preprocessing followed by the classifier.
# The step names ("preprocessor", "classifier", "num", "imputer") are what the
# grid-search parameter keys refer to, so they must stay as they are.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=500, random_state=1)),
    ]
)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))


### Display Diagram of Pipeline


In [None]:
from sklearn import set_config

# Switch scikit-learn's estimator display to "diagram" mode, then show the
# pipeline by leaving it as the cell's last expression.
set_config(display="diagram")
clf


### Classification Report


In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out test split and print the classification
# report comparing them with the true labels.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


### Test Grid Search


Grid search can also be performed on the different preprocessing steps defined in the `ColumnTransformer` object, together with the classifier’s hyperparameters as part of the `Pipeline`.


Search for both the imputer strategy of the numeric preprocessing and the regularization parameter of the logistic regression using GridSearchCV.


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the grid search. Each key addresses a parameter of a nested
# pipeline step using the `<step>__<sub-step>__<parameter>` naming scheme.
param_grid = {
    # Strategy of the imputer inside the numeric preprocessing branch.
    "preprocessor__num__imputer__strategy": ["mean", "median", "most_frequent", "constant"],
    # `C` values tried for the logistic-regression step.
    "classifier__C": [0.1, 1.0, 10, 100],
    # Optimizers tried for the logistic-regression step.
    "classifier__solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
}


# Exhaustive search over `param_grid` using 3-fold cross-validation on the full
# pipeline. (Passing `n_jobs=-1` was considered to parallelize the search.)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3)
grid_search


Calling `grid_search.fit` triggers the cross-validated search for the best hyper-parameter combination:


In [None]:
# Run the cross-validated search on the training split, then report the
# best-scoring parameter combination.
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_, sep="\n")


In [None]:
# grid_search.get_params().keys()

# for parameter in clf.get_params():
#     print(parameter)

# clf.get_params()

# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# Solver: https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


The internal cross-validation score obtained with those parameters is:


In [None]:
# Report the search's `best_score_` (the cross-validated score of the best
# parameter combination), rounded to three decimals.
print(f"Internal CV score: {grid_search.best_score_:.3f}")


We can also introspect the top grid search results as a pandas dataframe:


In [None]:
# Tabulate every evaluated parameter combination, best mean test score first.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Show the 20 best combinations, limited to the scores and the searched parameters.
cv_results.loc[
    :,
    [
        "mean_test_score",
        "std_test_score",
        "param_preprocessor__num__imputer__strategy",
        "param_classifier__C",
        "param_classifier__solver",
    ],
].head(20)


The best hyper-parameters have been used to re-fit a final model on the full training set. Evaluate that final model on held-out test data that was not used for hyperparameter tuning.


In [None]:
# Score the refit best estimator on the held-out test split.
print("best logistic regression from grid search: %.3f" % grid_search.score(X_test, y_test))


## Regression without Column Transformer


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Feature matrix: dummy-encode the frame, remove the two dummy columns derived
# from the target, and replace any remaining missing values with 0.
dummies_df = pd.get_dummies(ufc_df)
X = dummies_df.drop(columns=["winner_blue", "winner_red"]).fillna(0)

# Target vector: integer-encode the string labels of the `winner` column.
le = LabelEncoder()
y = ufc_df["winner"]
y_encoded = le.fit_transform(y)


In [None]:
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=1, test_size=0.2
)

# Fit the scaler on the training split only, then apply it to both splits so
# no test-set statistics leak into training.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Train the logistic-regression model on the scaled training data
# (`fit` returns the estimator itself, so `clf` is the fitted model).
clf = LogisticRegression(solver="lbfgs", max_iter=200).fit(X_train_scaled, y_train)

# Predict the held-out split and report the accuracy.
y_pred = clf.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")


## Test: Logistic Regression with Subset of Features


In [None]:
# Columns used for the reduced-feature experiment, grouped by role.
numeric_features = [
    "Max_round",
    "B_Age", "B_Height", "B_Weight",
    "R_Age", "R_Height", "R_Weight",
]
categorical_features = ["winby"]
target_value = ["winner"]

# Every column to keep: numeric features, then categorical features, then the target.
subset_features = [*numeric_features, *categorical_features, *target_value]

# Keep only the selected columns and renumber the rows from 0.
# Rows containing nulls are kept at this stage; dropping them
# (`dropna(how="any", axis="rows")`) was considered but left disabled.
ufc_subset_df = ufc_df[subset_features].reset_index(drop=True)


In [None]:
# Feature matrix: dummy-encode the subset, remove the two dummy columns derived
# from the target, and replace any remaining missing values with 0.
dummies_df = pd.get_dummies(ufc_subset_df)
X = dummies_df.drop(columns=["winner_blue", "winner_red"]).fillna(0)

# Target vector: integer-encode the string labels of the `winner` column.
le = LabelEncoder()
y = ufc_subset_df["winner"]
y_encoded = le.fit_transform(y)


In [None]:
# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, random_state=1, test_size=0.2
)

# Learn the scaling statistics from the training split only and reuse them
# for the test split.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled = X_scaler.transform(X_train)

# Build the model, then train it on the scaled training data.
clf = LogisticRegression(max_iter=200, solver="lbfgs")
clf.fit(X_train_scaled, y_train)

# Predict the held-out split and report the accuracy.
y_pred = clf.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")


## Test: StandardScaler on get_dummies()


In [None]:
# TODO: Should StandardScaler() be applied to non-numerical categories that got encoded? (Ex. OneHotEncoder() or get_dummies())

# Count the columns per dtype. The `object` columns are the ones get_dummies()
# will encode (the author expects 4 of them; confirm against the output below).
# `columns=` -> Column names in the DataFrame to be encoded. If columns is None then all the columns with object or category dtype will be converted.
display(ufc_df.dtypes.value_counts())

# Number of missing values in each `object` column.
# Should `dummy_na=True`? -> Add a column to indicate NaNs, if False NaNs are ignored.
display(ufc_df.select_dtypes(include="object").isnull().sum())


# Encode the object columns as dummy/indicator columns and count the resulting dtypes.
# NOTE(review): the dummy columns were observed as uint8 here; newer pandas
# versions return bool instead — confirm for the installed version.
dummies_df = pd.get_dummies(ufc_df)
display(dummies_df.dtypes.value_counts())

# Open question: should StandardScaler() be applied to non-numerical categories that got encoded? (Ex. OneHotEncoder() or get_dummies())
# Helpers for splitting the encoded vs. original columns by dtype:
# dummies_df.select_dtypes(include="uint8").columns.tolist()
# dummies_df.select_dtypes(exclude="uint8").columns.tolist()

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html


## Test: tpot AutoML

Quick-and-dirty run to get `tpot` working with a subset of features.


In [None]:
# TODO: Try the following:
# TODO: Built-in TPOT configurations -> https://epistasislab.github.io/tpot/using/#built-in-tpot-configurations
# TODO: Customizing TPOT's operators and parameters -> https://epistasislab.github.io/tpot/using/#customizing-tpots-operators-and-parameters
# TODO: Neural Networks in TPOT -> https://epistasislab.github.io/tpot/using/#neural-networks-in-tpot-tpotnn
# TODO: Load without pre-selected subset of features

import numpy as np
import pandas as pd
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

# ufc_subset_df = ufc_df.convert_dtypes()

# Columns used for the TPOT experiment, grouped by role.
numeric_features = [
    "Max_round",
    "B_Age", "B_Height", "B_Weight",
    "R_Age", "R_Height", "R_Weight",
]
categorical_features = ["winby"]
target_value = ["winner"]

# Full column selection: numeric features, categorical features, then the target.
subset_features = [*numeric_features, *categorical_features, *target_value]

# Prepare the modelling frame in one pass:
#   1. keep only the selected columns,
#   2. drop every row that contains a null value,
#   3. dummy-encode the categorical feature columns,
#   4. renumber the rows from 0.
ufc_subset_df = pd.get_dummies(
    ufc_df[subset_features].dropna(axis=0, how="any"),
    columns=categorical_features,
).reset_index(drop=True)

# Encode the target as numbers: 0 where the winner is "red", 1 otherwise (blue).
ufc_subset_df["winner"] = np.where(ufc_subset_df["winner"].eq("red"), 0, 1)

# Features (X) are every column except the target (y).
y = ufc_subset_df["winner"]
X = ufc_subset_df.drop(columns="winner")

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Run the TPOT classifier search with the lightweight "TPOT light" configuration:
# 5 generations with a population of 20 candidate pipelines, progress printed
# at verbosity level 2.
# `random_state` is fixed (matching the seed used for every split and model
# elsewhere in this notebook) so the stochastic search, the reported score and
# the exported pipeline are reproducible after Restart Kernel -> Run All.
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    config_dict="TPOT light",
    random_state=1,
)

# Search on the training split, report the score on the held-out split, and
# write the best pipeline found to a standalone Python file.
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export("tpot_TPOTClassifier_light_pipeline.py")
