In [27]:
#Importing libraries

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import warnings
import sklearn as skl
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import (NeighborhoodComponentsAnalysis, KNeighborsClassifier)

In [28]:
warnings.filterwarnings("ignore")
%matplotlib inline

## Features and Split


In [29]:
#Defining the dataframe that was used for the analyses.
df = pd.read_csv("AnalysisSet.csv")

In [30]:
#Removing pitcher's name since i don't use this variable

df_notused = df.loc[:, df.columns != "Pitcher"]
DV_PROBINV = df_notused.loc[:, df_notused.columns != "Ranking"]
DV_RANKING = df_notused.loc[:, df_notused.columns != "PROBINV"]

In [31]:
#Checking the distribution of scores on probability to invest.

df["PROBINV"].value_counts(ascending=False)

50    10
10    10
20     8
5      8
30     7
60     6
40     5
0      5
15     4
80     3
70     3
45     2
90     1
75     1
65     1
55     1
Name: PROBINV, dtype: int64

In [32]:
#Given that probability to invest is continous, we made two categories to help run the classifying analysis.
#The split is chosen based on the distribution above.

PROB3 = df.loc[:, "PROBINV"]
PROBLIST = []
for i in PROB3:
    if i <= 29:
        PROBLIST.append(1)
    if i >= 30 and i <= 100:
        PROBLIST.append(2)
        
DV_PROBINV["PROBCATEGORY"] = PROBLIST

In [33]:
#Making sure the classes are balanced

print("Amount of class 1:", PROBLIST.count(1))
print("Amount of class 2:", PROBLIST.count(2))

Amount of class 1: 35
Amount of class 2: 40
Amount of class 3: 0


In [34]:
#Here we have X, which are the action units. 
#The y variable, are 2 categories of Probability to invest.
X = DV_RANKING.loc[:, DV_RANKING.columns != "Ranking"]
y = DV_PROBINV.loc[:, "PROBCATEGORY"]

In [35]:
#Standardizing the features.

X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)

In [36]:
#Splitting the data into a training and testset.

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [37]:
#Checking the shape of the input and target values.

print("X_train shape is:", X_train.shape)
print("X_test shape is:", X_test.shape)
print("y_train shape is:", y_train.shape)
print("y_test shape is:", y_test.shape)

X_train shape is: (60, 3)
X_test shape is: (15, 3)
y_train shape is: (60,)
y_test shape is: (15,)


## Decision Tree Classifier

In [44]:
#Defining the decision tree classifier and searching for the best
#hyperparameters.

dtc = tree.DecisionTreeClassifier()

sample_split_range = list(range(1, 40))
grid = dict(min_samples_split=sample_split_range)

parameters={
    'min_samples_split' : range(1,20,1),
    'max_depth': range(2,20,1),
    'criterion': ["gini", "entropy"],
    'splitter': ["best", "random"],
}

dtcsearch = GridSearchCV(dtc, parameters, cv = 5, scoring = 'accuracy')
dtcsearch.fit(X_train, y_train)

print("Best parameters: ", dtcsearch.best_params_)

Best parameters:  {'min_samples_split': 4, 'splitter': 'random', 'max_depth': 7, 'criterion': 'entropy'}


In [45]:
#Checking the mean accuracy on the test set.

accuracy_scores = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    dtcb=tree.DecisionTreeClassifier(min_samples_split=4, max_depth=7, criterion='entropy', splitter="random")
    dtcb.fit(X_train, y_train)
    y_pred_class = dtcb.predict(X_test)
    accuracy_scores.append(sum(y_pred_class == y_test) / len(y_test))
np.mean(accuracy_scores)

0.49800000000000005

## K-Nearest Neighbours

In [46]:
#Defining the k-Nearest Neighours classifier and searching for the best
#hyperparameters.

knn = KNeighborsClassifier()

para_grid = {
    'n_neighbors': [3,4,5,7,9,11,13],
    'weights': ['uniform', 'distance'],
    'metric': ["euclidean", "manhattan"]
}

knngridsearch = GridSearchCV(knn, para_grid, verbose = 1, cv = 5, n_jobs = -1)
knnsearch = knngridsearch.fit(X_train, y_train)

print("The best parameters for knn are:", knnsearch.best_params_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.6s


The best parameters for knn are: {'n_neighbors': 9, 'metric': 'euclidean', 'weights': 'uniform'}


[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    4.0s finished


In [47]:
#Checking the mean accuracy on the test set.

accuracy_scores = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    knnb = KNeighborsClassifier(metric="euclidean", n_neighbors=9, weights="uniform")
    knnb.fit(X_train, y_train)
    y_predknn = knnb.predict(X_test)
    accuracy_scores.append(sum(y_predknn == y_test) / len(y_test))
np.mean(accuracy_scores)

0.5066666666666666

## Support Vector Machine

In [51]:
#Defining the Support Vector Machine classifier and searching for the best
#hyperparameters.

from sklearn import svm
SVMclassifier = svm.SVC()

para_grid2 = {
    'C': range(1,30, 2),
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ["scale", "auto"]
}

SVMsearch2 = GridSearchCV(svm.SVC(), para_grid2, refit=True, verbose=2)
SVMsearch2.fit(X_train, y_train)

print("The best parameters for knn are:", SVMsearch2.best_params_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] kernel=linear, gamma=scale, C=1 .................................
[CV] .................. kernel=linear, gamma=scale, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, C=1 .................................
[CV] .................. kernel=linear, gamma=scale, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, C=1 .................................
[CV] .................. kernel=linear, gamma=scale, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, C=1 .................................
[CV] .................. kernel=linear, gamma=scale, C=1, total=   0.0s
[CV] kernel=linear, gamma=scale, C=1 .................................
[CV] .................. kernel=linear, gamma=scale, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, C=1 ....................................
[CV] ..................... kernel=rbf, gamma=scale, C=1, total=   0.0s
[CV] kernel=rbf, gamma=scale, C=1 ....................................
[CV] .........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................. kernel=sigmoid, gamma=scale, C=1, total=   0.0s
[CV] kernel=linear, gamma=auto, C=1 ..................................
[CV] ................... kernel=linear, gamma=auto, C=1, total=   0.0s
[CV] kernel=linear, gamma=auto, C=1 ..................................
[CV] ................... kernel=linear, gamma=auto, C=1, total=   0.0s
[CV] kernel=linear, gamma=auto, C=1 ..................................
[CV] ................... kernel=linear, gamma=auto, C=1, total=   0.0s
[CV] kernel=linear, gamma=auto, C=1 ..................................
[CV] ................... kernel=linear, gamma=auto, C=1, total=   0.0s
[CV] kernel=linear, gamma=auto, C=1 ..................................
[CV] ................... kernel=linear, gamma=auto, C=1, total=   0.0s
[CV] kernel=rbf, gamma=auto, C=1 .....................................
[CV] ...................... kernel=rbf, gamma=auto, C=1, total=   0.0s
[CV] kernel=rbf, gamma=auto, C=1 .....................................
[CV] .

[CV] .................. kernel=sigmoid, gamma=auto, C=3, total=   0.0s
[CV] kernel=linear, gamma=scale, C=5 .................................
[CV] .................. kernel=linear, gamma=scale, C=5, total=   0.0s
[CV] kernel=linear, gamma=scale, C=5 .................................
[CV] .................. kernel=linear, gamma=scale, C=5, total=   0.0s
[CV] kernel=linear, gamma=scale, C=5 .................................
[CV] .................. kernel=linear, gamma=scale, C=5, total=   0.0s
[CV] kernel=linear, gamma=scale, C=5 .................................
[CV] .................. kernel=linear, gamma=scale, C=5, total=   0.1s
[CV] kernel=linear, gamma=scale, C=5 .................................
[CV] .................. kernel=linear, gamma=scale, C=5, total=   0.0s
[CV] kernel=rbf, gamma=scale, C=5 ....................................
[CV] ..................... kernel=rbf, gamma=scale, C=5, total=   0.0s
[CV] kernel=rbf, gamma=scale, C=5 ....................................
[CV] .

[CV] ................. kernel=sigmoid, gamma=scale, C=7, total=   0.1s
[CV] kernel=sigmoid, gamma=scale, C=7 ................................
[CV] ................. kernel=sigmoid, gamma=scale, C=7, total=   0.0s
[CV] kernel=linear, gamma=auto, C=7 ..................................
[CV] ................... kernel=linear, gamma=auto, C=7, total=   0.0s
[CV] kernel=linear, gamma=auto, C=7 ..................................
[CV] ................... kernel=linear, gamma=auto, C=7, total=   0.0s
[CV] kernel=linear, gamma=auto, C=7 ..................................
[CV] ................... kernel=linear, gamma=auto, C=7, total=   0.0s
[CV] kernel=linear, gamma=auto, C=7 ..................................
[CV] ................... kernel=linear, gamma=auto, C=7, total=   0.0s
[CV] kernel=linear, gamma=auto, C=7 ..................................
[CV] ................... kernel=linear, gamma=auto, C=7, total=   0.0s
[CV] kernel=rbf, gamma=auto, C=7 .....................................
[CV] .

[CV] .................. kernel=sigmoid, gamma=auto, C=9, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=9 .................................
[CV] .................. kernel=sigmoid, gamma=auto, C=9, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=9 .................................
[CV] .................. kernel=sigmoid, gamma=auto, C=9, total=   0.0s
[CV] kernel=linear, gamma=scale, C=11 ................................
[CV] ................. kernel=linear, gamma=scale, C=11, total=   0.0s
[CV] kernel=linear, gamma=scale, C=11 ................................
[CV] ................. kernel=linear, gamma=scale, C=11, total=   0.0s
[CV] kernel=linear, gamma=scale, C=11 ................................
[CV] ................. kernel=linear, gamma=scale, C=11, total=   0.1s
[CV] kernel=linear, gamma=scale, C=11 ................................
[CV] ................. kernel=linear, gamma=scale, C=11, total=   0.0s
[CV] kernel=linear, gamma=scale, C=11 ................................
[CV] .

[CV] .................... kernel=poly, gamma=auto, C=13, total=   0.0s
[CV] kernel=poly, gamma=auto, C=13 ...................................
[CV] .................... kernel=poly, gamma=auto, C=13, total=   0.1s
[CV] kernel=sigmoid, gamma=auto, C=13 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=13, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=13 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=13, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=13 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=13, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=13 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=13, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=13 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=13, total=   0.0s
[CV] kernel=linear, gamma=scale, C=15 ................................
[CV] .

[CV] ................... kernel=poly, gamma=scale, C=17, total=   0.0s
[CV] kernel=poly, gamma=scale, C=17 ..................................
[CV] ................... kernel=poly, gamma=scale, C=17, total=   0.0s
[CV] kernel=poly, gamma=scale, C=17 ..................................
[CV] ................... kernel=poly, gamma=scale, C=17, total=   0.0s
[CV] kernel=sigmoid, gamma=scale, C=17 ...............................
[CV] ................ kernel=sigmoid, gamma=scale, C=17, total=   0.0s
[CV] kernel=sigmoid, gamma=scale, C=17 ...............................
[CV] ................ kernel=sigmoid, gamma=scale, C=17, total=   0.0s
[CV] kernel=sigmoid, gamma=scale, C=17 ...............................
[CV] ................ kernel=sigmoid, gamma=scale, C=17, total=   0.0s
[CV] kernel=sigmoid, gamma=scale, C=17 ...............................
[CV] ................ kernel=sigmoid, gamma=scale, C=17, total=   0.0s
[CV] kernel=sigmoid, gamma=scale, C=17 ...............................
[CV] .

[CV] .................... kernel=poly, gamma=auto, C=19, total=   0.0s
[CV] kernel=poly, gamma=auto, C=19 ...................................
[CV] .................... kernel=poly, gamma=auto, C=19, total=   0.0s
[CV] kernel=poly, gamma=auto, C=19 ...................................
[CV] .................... kernel=poly, gamma=auto, C=19, total=   0.1s
[CV] kernel=poly, gamma=auto, C=19 ...................................
[CV] .................... kernel=poly, gamma=auto, C=19, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=19 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=19, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=19 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=19, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=19 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=19, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=19 ................................
[CV] .

[CV] .................. kernel=linear, gamma=auto, C=23, total=   0.0s
[CV] kernel=rbf, gamma=auto, C=23 ....................................
[CV] ..................... kernel=rbf, gamma=auto, C=23, total=   0.0s
[CV] kernel=rbf, gamma=auto, C=23 ....................................
[CV] ..................... kernel=rbf, gamma=auto, C=23, total=   0.0s
[CV] kernel=rbf, gamma=auto, C=23 ....................................
[CV] ..................... kernel=rbf, gamma=auto, C=23, total=   0.0s
[CV] kernel=rbf, gamma=auto, C=23 ....................................
[CV] ..................... kernel=rbf, gamma=auto, C=23, total=   0.0s
[CV] kernel=rbf, gamma=auto, C=23 ....................................
[CV] ..................... kernel=rbf, gamma=auto, C=23, total=   0.0s
[CV] kernel=poly, gamma=auto, C=23 ...................................
[CV] .................... kernel=poly, gamma=auto, C=23, total=   0.0s
[CV] kernel=poly, gamma=auto, C=23 ...................................
[CV] .

[CV] ................. kernel=linear, gamma=scale, C=27, total=   0.0s
[CV] kernel=linear, gamma=scale, C=27 ................................
[CV] ................. kernel=linear, gamma=scale, C=27, total=   0.0s
[CV] kernel=linear, gamma=scale, C=27 ................................
[CV] ................. kernel=linear, gamma=scale, C=27, total=   0.0s
[CV] kernel=rbf, gamma=scale, C=27 ...................................
[CV] .................... kernel=rbf, gamma=scale, C=27, total=   0.0s
[CV] kernel=rbf, gamma=scale, C=27 ...................................
[CV] .................... kernel=rbf, gamma=scale, C=27, total=   0.1s
[CV] kernel=rbf, gamma=scale, C=27 ...................................
[CV] .................... kernel=rbf, gamma=scale, C=27, total=   0.0s
[CV] kernel=rbf, gamma=scale, C=27 ...................................
[CV] .................... kernel=rbf, gamma=scale, C=27, total=   0.0s
[CV] kernel=rbf, gamma=scale, C=27 ...................................
[CV] .

[CV] ................. kernel=sigmoid, gamma=auto, C=29, total=   0.1s
[CV] kernel=sigmoid, gamma=auto, C=29 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=29, total=   0.0s
[CV] kernel=sigmoid, gamma=auto, C=29 ................................
[CV] ................. kernel=sigmoid, gamma=auto, C=29, total=   0.0s
The best parameters for knn are: {'kernel': 'sigmoid', 'gamma': 'scale', 'C': 5}


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:    6.6s finished


In [52]:
#Checking the mean accuracy on the test set.

accuracy_scores = []
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
    SVMbest = svm.SVC(gamma='scale', C=5, kernel="sigmoid", class_weight="balanced")
    SVMbest.fit(X_train, y_train)
    y_predsvm = SVMbest.predict(X_test)
    accuracy_scores.append(sum(y_predsvm == y_test) / len(y_test))
np.mean(accuracy_scores)

0.514