In [None]:
# this walk-through was adapted from the following two tutorials, check them out for more info:
# https://machinelearningmastery.com/machine-learning-in-python-step-by-step/
# https://towardsdatascience.com/exploring-classifiers-with-python-scikit-learn-iris-dataset-2bcb490d2e1b


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Import and examine data

In [None]:
# Load the iris measurements from "iris.csv" (path is relative to the working directory).
iris = pd.read_csv("iris.csv")
# Preview the first rows to confirm the columns loaded as expected.
iris.head()

In [None]:
# (number of rows, number of columns) of the loaded table
iris.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
iris.describe()

In [None]:
# How many rows carry each class label?
iris["class"].value_counts()

# Data formatting

In [None]:
train, test = train_test_split(iris, test_size = .2, stratify = iris['class'], random_state = 0)
# splits the data into a training and testing set
# test_size = .2 makes the test set 20% of the data
# stratify = iris['class'] keeps the class proportions the same in the training and testing sets
# setting a random state ensures that the data will be divided the same way each time

In [None]:
train["class"].value_counts()
# because the split was stratified, these counts mirror the class proportions of the full data set
# (only the training set is checked here; test["class"].value_counts() checks the testing set)

In [None]:
# Visualize the training set: one box plot per numeric column, laid out on
# a 2x2 grid with a shared y-axis so the panels use the same scale.
train.plot.box(subplots=True, layout=(2, 2), sharey=True)
plt.show()

In [None]:
# Histogram of every numeric column in the training set
train.hist()
plt.show()

In [None]:
# Pairwise plots of the training-set measurements, colored by class
sns.pairplot(train, hue="class", palette = 'colorblind')

# Build a model

In [None]:
# split your data sets into the predictive attributes (x)
# and the class you are trying to predict (y)

# list the feature columns once so the training and testing sets are
# guaranteed to use the same columns in the same order
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

x_train = train[feature_cols]
y_train = train['class']

x_test = test[feature_cols]
y_test = test['class']

# first model we will try is a decision tree
mod_dt = DecisionTreeClassifier(max_depth = 3, random_state = 1) # declare model "estimator"
mod_dt.fit(x_train, y_train) # fit the model to your data
y_pred = mod_dt.predict(x_test) # use your "fitted" model to predict labels for the test data
print(y_pred)

In [None]:
# how good was the model at predicting?
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
metrics.accuracy_score(y_test, y_pred)

In [None]:
# importance of each feature in the fitted tree, in the same order as the columns of x_train
mod_dt.feature_importances_

In [None]:
# feature names, for matching against the feature_importances_ values
print(x_train.columns)

In [None]:
# let's visualize the tree
plt.figure(figsize = (5,5), dpi = 200)
# Pass plain Python lists: some scikit-learn releases validate feature_names and
# class_names as `list` and raise an error for a pandas Index or a numpy array.
plot_tree(
    mod_dt,
    feature_names=list(x_train.columns),
    class_names=[str(label) for label in mod_dt.classes_],
    filled = True,
)
plt.show()

In [None]:
# let's look at precision, recall, and F1 for the decision tree's test-set predictions
print(metrics.classification_report(y_test, y_pred))

# Let's try a different model
Naive Bayes!

In [None]:
mod_nb = GaussianNB() # declare a Gaussian Naive Bayes estimator
mod_nb.fit(x_train, y_train) # fit the model to your data
y_pred2 = mod_nb.predict(x_test) # use your "fitted" model to predict labels for the test data
# how good was the model at predicting?
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
metrics.accuracy_score(y_test, y_pred2)

In [None]:
# fit a second Naive Bayes model using just the petal attributes
# (a fresh estimator is used so that mod_nb stays the four-feature model
# instead of being silently overwritten)
petal_cols = ["petal_length", "petal_width"]

mod_nb_petal = GaussianNB()
mod_nb_petal.fit(x_train[petal_cols], y_train)
y_pred3 = mod_nb_petal.predict(x_test[petal_cols])
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
print("accuracy: ", metrics.accuracy_score(y_test, y_pred3))
# let's look at precision, recall, and F1
print(metrics.classification_report(y_test, y_pred3))

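The model does **better** using only two features. This implies that the model may be "overfitting" when using all 4 features: the extra features let it fit quirks of the training data that do not carry over to the test data.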

# What if we randomly got a bad "split" of the data into training and testing sets?
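There are solutions to this problem; one of the most common is called **k-fold cross validation**. K-fold cross validation divides your data into k roughly equal subsets ("folds"), then trains the model k times, each time holding out a different fold for evaluation and training on the remaining k-1 folds. The k scores are then summarized (e.g., by their mean and standard deviation) to give a more reliable estimate of performance than a single split.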

Let's test out a few different models using k-fold cross validation

In [None]:
# Different models to test, as (short name, unfitted estimator) pairs.
# NOTE(review): `multi_class` is deprecated in scikit-learn >= 1.5 and emits a
# FutureWarning there; it is kept so the cell still behaves the same on older releases.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('KNN', KNeighborsClassifier()),
    # fixed seed: without it the tree's cross-validation scores can change between runs
    ('DT', DecisionTreeClassifier(random_state=1)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# One splitter shared by every model. With an integer random_state the folds are
# identical on each use, so all models are scored on exactly the same splits.
kfold = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

# evaluate each model in turn
results = []  # per-model arrays of fold accuracies
names = []    # model short names, in the same order as `results`
for name, model in models:
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # mean and standard deviation of the accuracy across the 5 folds
    print(name, round(cv_results.mean(),3), round(cv_results.std(),3))

In [None]:
# Box plot of the per-fold accuracies, one box per model.
# Tick labels are set on the axes instead of through boxplot's `labels`
# argument, which Matplotlib 3.9 deprecated in favor of `tick_labels`;
# setting the ticks directly works on both older and newer releases.
fig, ax = plt.subplots()
ax.boxplot(results)
ax.set_xticks(list(range(1, len(names) + 1)))  # boxes are drawn at positions 1..len(results)
ax.set_xticklabels(names)
ax.set_title('Algorithm Comparison')
ax.set_ylabel('Accuracy')
plt.show()

# Your turn: build a model to predict basketball games!
Download the basketballData.csv file from Canvas. Choose which features (columns) you think are most predictive, pick one of the models from above, and try to build the best model you can! Remember, the basic format for modeling in python is:

1) divide your data into a training and testing set

2) declare a model estimator

3) "fit" the model

4) evaluate the model

For information about what each column means, see: https://www.sports-reference.com/cbb/seasons/2021-school-stats.html

The first step has already been done for you below. The column you are interested in predicting is 'Team1Win?'. Don't worry about trying to do k-fold cross validation. Just choose parameters and try it out!

In [None]:
# Load the basketball data from "basketballData.csv" (path is relative to the working directory).
basketballData = pd.read_csv("basketballData.csv")
# Show every column when displaying wide tables. The option is spelled with an
# underscore ("display.max_columns"); a dotted spelling only resolves because
# pandas treats option names as regular-expression patterns.
pd.set_option("display.max_columns", None)

basketballData.head()

In [None]:
# NOTE: this cell reuses the names train/test/x_train/y_train/x_test/y_test from the
# iris section, so re-run the iris cells from the top before going back to them.

#divide into training and testing
train, test = train_test_split(basketballData, test_size = .2, stratify = basketballData['Team1Win?'], random_state = 0)

#divide into x and y
# fill in with the columns you want to use as predictors;
# the same list is used for the training and testing sets so they always match
bball_features = ["Team1SRS"]

x_train = train[bball_features]
y_train = train["Team1Win?"]

x_test = test[bball_features]
y_test = test["Team1Win?"]

#build your model
mod_bball = KNeighborsClassifier() # could use a different one instead, e.g., LogisticRegression(solver='liblinear', multi_class='ovr')
mod_bball.fit(x_train, y_train)
bball_pred = mod_bball.predict(x_test)
# accuracy_score expects (y_true, y_pred): true labels first, predictions second
print("accuracy: ", metrics.accuracy_score(y_test, bball_pred))
