## Read in Packages

In [315]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Prediction question that can be answered with data and a machine learning model: Can we guess what position a player is in baseball using how he did in that season?

# 2. Collect data to answer your question via webscraping, APIs and/or combining several readily available dataset (i.e. kaggle, uci ML repo, etc.). 
### In this case I webscraped the MLB's website and used a loop to do it for the last 30 years which are all on different web-pages

In [90]:
page1 = pd.read_html("https://www.mlb.com/stats/regular-season")
url = "https://www.mlb.com/stats/regular-season?page=2"
mlb = pd.DataFrame()
while len(pd.read_html(url)[0])>0:
    mlb = pd.concat([mlb, pd.read_html(url)[0]])
    num = int(url[-1])
    num +=1
    url = url.replace(url[-1],str(num))
mlb = pd.concat([page1[0], mlb])
mlb["Year"] = 2020
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,41,28,6,2,0.351,0.490,0.695,1.185,2020
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,45,37,2,0,0.341,0.462,0.640,1.102,2020
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,38,60,0,0,0.338,0.431,0.636,1.067,2020
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,18,21,3,0,0.364,0.421,0.590,1.011,2020
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,31,43,10,3,0.292,0.386,0.607,0.993,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,15,41,1,0,0.212,0.270,0.335,0.605,2020
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,7,75,3,0,0.203,0.238,0.360,0.599,2020
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,18,84,1,2,0.176,0.252,0.346,0.599,2020
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,19,54,16,5,0.232,0.301,0.292,0.593,2020


In [91]:
"https://www.mlb.com/stats/2019/regular-season"

years = np.arange(1990,2020)

for year in years:
    page1 = pd.read_html("https://www.mlb.com/stats/"+str(year)+"/regular-season")
    page1 = page1[0]
    page1["Year"] = year
    url = "https://www.mlb.com/stats/"+str(year)+"?page=2"
    test = pd.DataFrame()
    while len(pd.read_html(url)[0])>0:
        test = pd.concat([test, pd.read_html(url)[0]])
        num = int(url[-1])
        num +=1
        url = url.replace(url[-1],str(num))
    test["Year"] = year
    test = pd.concat([page1,test])
    mlb = pd.concat([mlb,test])
    
    
#https://www.mlb.com/stats/2019?page=2

In [5]:
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,41,28,6,2,0.351,0.490,0.695,1.185,2020
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,45,37,2,0,0.341,0.462,0.640,1.102,2020
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,38,60,0,0,0.338,0.431,0.636,1.067,2020
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,18,21,3,0,0.364,0.421,0.590,1.011,2020
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,31,43,10,3,0.292,0.386,0.607,0.993,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,15,41,1,0,0.212,0.270,0.335,0.605,2019
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,7,75,3,0,0.203,0.238,0.360,0.599,2019
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,18,84,1,2,0.176,0.252,0.346,0.599,2019
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,19,54,16,5,0.232,0.301,0.292,0.593,2019


# 3. Clean / wrangle your data

In [6]:
mlb["PLAYER"]

0                  1JuanJ SotoSotoLF1‌‌‌
1         2FreddieF FreemanFreeman1B2‌‌‌
2             3MarcellM OzunaOzunaDH3‌‌‌
3            4DJD LeMahieuLeMahieu2B4‌‌‌
4            5JoseJ RamírezRamirez3B5‌‌‌
                     ...                
12    138EduardoE EscobarEscobar3B138‌‌‌
13           139JavierJ BáezBaezSS139‌‌‌
14           140EvanE WhiteWhite1B140‌‌‌
15     141JonathanJ VillarVillar2B141‌‌‌
16          142NickyN LopezLopez2B142‌‌‌
Name: PLAYER, Length: 4503, dtype: object

In [61]:
pattern = '(?<=[\d+])\w+.\w+(?=[A-Z]+)'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, '1JuanJ SotoSotoLF1\u200c\u200c\u200c')

['JuanJ SotoSotoL']

In [65]:
pattern = '(?<=[\d+])\w+.\w+(?=(?:[\d+]))'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, mlb['PLAYER'].iloc[0])

['JuanJ SotoSotoLF']

In [66]:
pattern = '[A-Z]+(?=\d+)'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, mlb['PLAYER'].iloc[0])

['LF']

In [76]:
pattern = '(?<=[\d+])\w+.[A-Z]\w+(?=[A-Z]+)'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, mlb['PLAYER'].iloc[4])

['JoseJ RamírezRamirez3']

In [123]:
pattern = '(?<=[a-z\.])[A-Z1-9][A-Z]|C(?=\d+)'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, mlb['PLAYER'].iloc[28])

['LF']

In [120]:
mlb['PLAYER'].iloc[38]

'39J.T.J RealmutoRealmutoC39\u200c\u200c\u200c'

In [93]:
player = []
for person in mlb["PLAYER"]:
    pattern = '(?<=[\d+])\D+(?=(?:[\d+]))'
    player.append(re.findall(pattern,person)[0])
    
mlb['PLAYER_updated'] = player

In [126]:
position = []
for person in mlb["PLAYER"]:
    pattern = '(?<=[a-z\.])[A-Z1-9][A-Z]|C(?=\d+)'
    position.append(re.findall(pattern,person)[0])
    
mlb['PLAYER_position'] = position

In [128]:
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year,PLAYER_position,PLAYER_updated
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,...,28,6,2,0.351,0.490,0.695,1.185,2020,LF,JuanJ SotoSotoLF
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,...,37,2,0,0.341,0.462,0.640,1.102,2020,1B,FreddieF FreemanFreeman
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,...,60,0,0,0.338,0.431,0.636,1.067,2020,DH,MarcellM OzunaOzunaDH
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,...,21,3,0,0.364,0.421,0.590,1.011,2020,2B,DJD LeMahieuLeMahieu
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,...,43,10,3,0.292,0.386,0.607,0.993,2020,3B,JoseJ RamírezRamirez
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,...,41,1,0,0.212,0.270,0.335,0.605,2019,3B,EduardoE EscobarEscobar
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,...,75,3,0,0.203,0.238,0.360,0.599,2019,SS,JavierJ BáezBaezSS
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,...,84,1,2,0.176,0.252,0.346,0.599,2019,1B,EvanE WhiteWhite
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,...,54,16,5,0.232,0.301,0.292,0.593,2019,2B,JonathanJ VillarVillar


In [130]:
#Creating the dependent variable class
factor = pd.factorize(mlb['PLAYER_position'])
mlb.PLAYER_position = factor[0]
definitions = factor[1]
print(mlb.PLAYER_position.head())
print(definitions)

0    0
1    1
2    2
3    3
4    4
Name: PLAYER_position, dtype: int64
Index(['LF', '1B', 'DH', '2B', '3B', 'CF', 'SS', 'RF', 'C', 'OF'], dtype='object')


In [131]:
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year,PLAYER_position,PLAYER_updated
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,...,28,6,2,0.351,0.490,0.695,1.185,2020,0,JuanJ SotoSotoLF
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,...,37,2,0,0.341,0.462,0.640,1.102,2020,1,FreddieF FreemanFreeman
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,...,60,0,0,0.338,0.431,0.636,1.067,2020,2,MarcellM OzunaOzunaDH
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,...,21,3,0,0.364,0.421,0.590,1.011,2020,3,DJD LeMahieuLeMahieu
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,...,43,10,3,0.292,0.386,0.607,0.993,2020,4,JoseJ RamírezRamirez
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,...,41,1,0,0.212,0.270,0.335,0.605,2019,4,EduardoE EscobarEscobar
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,...,75,3,0,0.203,0.238,0.360,0.599,2019,6,JavierJ BáezBaezSS
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,...,84,1,2,0.176,0.252,0.346,0.599,2019,1,EvanE WhiteWhite
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,...,54,16,5,0.232,0.301,0.292,0.593,2019,3,JonathanJ VillarVillar


In [184]:
#Splitting the data into independent and dependent variables
X = mlb.iloc[:,2:-2]
y = mlb.iloc[:,-2]
X["Year"] = pd.Categorical(X.Year)

In [219]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21, stratify = mlb.PLAYER_position)

In [332]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
#Convert to dictionary
#hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
#Create new KNN object
#knn_2 = KNeighborsClassifier()
#Use GridSearch
#clf = GridSearchCV(knn_2, hyperparameters, cv=10)
#Fit the model
#best_model = clf.fit(X_train,y_train)
#Print The value of best Hyperparameters
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

Best leaf_size: 1
Best p: 1
Best n_neighbors: 1


In [220]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [333]:
knn = KNeighborsClassifier(leaf_size = 1, n_neighbors = 1, p = 1)
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=1, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=1,
                     weights='uniform')

In [334]:
yhat_knn = knn.predict(X_test)
confusion_matrix(y_test, yhat_knn)
print(classification_report(y_test, yhat_knn))
#print(roc_auc_score(y_test.reset_index(drop = True), yhat_knn, multi_class = 'ovr'))

              precision    recall  f1-score   support

           0       0.63      0.63      0.63       122
           1       0.65      0.66      0.66       181
           2       0.44      0.50      0.47        60
           3       0.66      0.70      0.68       133
           4       0.64      0.63      0.64       160
           5       0.80      0.77      0.78       120
           6       0.80      0.76      0.78       131
           7       0.52      0.52      0.52       115
           8       0.44      0.48      0.46        42
           9       0.56      0.44      0.49        43

    accuracy                           0.64      1107
   macro avg       0.61      0.61      0.61      1107
weighted avg       0.65      0.64      0.64      1107



In [335]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_knn)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_knn)))
accuracy

0.6422764227642277

In [336]:
print(f1_score(y_test, yhat_knn, average="macro"))
print(precision_score(y_test, yhat_knn, average="macro"))
print(recall_score(y_test, yhat_knn, average="macro"))

0.610337296824461
0.614157581927372
0.6086811015946622


In [281]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 5)]

n_estimators = 100
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [284]:
#rf_pipe = Pipeline(
  #  [('rf', RandomForestClassifier())])
#param_grid = [{'rf__n_estimators': [100],
 #             'rf__max_features':max_features,
 #             'rf__max_depth':max_depth,
 #             'rf__min_samples_split':min_samples_split,
 #             'rf__min_samples_leaf':min_samples_leaf,
 #             'rf__bootstrap':bootstrap}]
#gs = GridSearchCV(rf_pipe, param_grid)
#gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                            

In [285]:
gs.best_params_

{'rf__bootstrap': False,
 'rf__max_depth': None,
 'rf__max_features': 'auto',
 'rf__min_samples_leaf': 2,
 'rf__min_samples_split': 5,
 'rf__n_estimators': 100}

In [292]:
# Fitting Random Forest Classification to the Training set
rf = RandomForestClassifier(n_estimators = 2000, random_state = 42, max_features = 'auto', max_depth = None, min_samples_leaf = 2, min_samples_split = 5)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [295]:
yhat_rf = rf.predict(X_test)
confusion_matrix(y_test, yhat_rf)

array([[ 71,  14,   2,   7,  15,   2,   1,   8,   0,   2],
       [  2, 149,   4,   0,  13,   0,   0,  10,   1,   2],
       [  0,  26,  26,   2,   4,   0,   0,   0,   2,   0],
       [  2,   4,   0,  98,   8,   5,   7,   7,   0,   2],
       [  3,  31,   0,   5, 106,   2,   1,  10,   2,   0],
       [  9,   3,   0,   1,   4,  91,   4,   7,   0,   1],
       [  5,   4,   0,  12,   2,   3, 100,   5,   0,   0],
       [ 17,  17,   0,   2,   6,   2,   4,  66,   0,   1],
       [  1,  16,   3,   3,   3,   0,   0,   1,  15,   0],
       [  1,   5,   0,  10,   4,   0,   2,   1,   0,  20]], dtype=int64)

In [294]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_rf)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_rf)))
accuracy

0.6702800361336947

In [296]:
print(f1_score(y_test, yhat_rf, average="macro"))
print(precision_score(y_test, yhat_rf, average="macro"))
print(recall_score(y_test, yhat_rf, average="macro"))

0.6419342295416326
0.7024025929530355
0.6155711363250921


In [225]:
# Predicting the Test set results
y_pred = rf.predict(X_test)
reversefactor = dict(zip(range(10),definitions))
y_test_vector = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test_vector, y_pred, rownames=['Actual Positions'], colnames=['Predicted Positions']))

Predicted Positions   1B  2B   3B   C  CF  DH  LF  OF  RF  SS
Actual Positions                                             
1B                   135   4   13   0   3   8   8   1   8   1
2B                     3  90    6   0   7   2   5   3   8   9
3B                    24   5  101   2   2   4  12   0   7   3
C                      8   3    2  16   0   7   4   0   2   0
CF                     0   5    2   0  83   1  12   2   9   6
DH                    15   2    4   3   0  28   6   1   0   1
LF                    20   2    8   1   4   2  74   3   6   2
OF                     1   3    6   0   4   0   5  18   3   3
RF                    12   4   11   0   3   2  13   2  63   5
SS                     1  10    6   0   4   1   9   1   2  97


In [297]:
rf.feature_importances_

array([0.05405674, 0.06602886, 0.06556681, 0.06488666, 0.06602847,
       0.03259342, 0.05629805, 0.07123276, 0.07490124, 0.05638086,
       0.07128701, 0.04562135, 0.0644728 , 0.06036188, 0.06107793,
       0.05936493, 0.02984024])

In [298]:
print(list(zip(X.columns, classifier.feature_importances_)))


[('G', 0.052993247625239014), ('AB', 0.06972686705326339), ('R', 0.06756122704994139), ('H', 0.06583686120116572), ('2B', 0.06373547716904004), ('3B', 0.02951502082296631), ('HR', 0.06149663688913407), ('RBI', 0.06498413292472377), ('BB', 0.07542693918518935), ('SO', 0.04854746162054352), ('SB', 0.08455703489375979), ('CS', 0.06007461946870311), ('AVG', 0.05883372571449804), ('OBP', 0.04912065582116712), ('SLG', 0.05196689236847161), ('caret-upcaret-downOPS', 0.05693703984965394), ('Year', 0.03868616034253983)]


In [299]:
test = pd.DataFrame(X.columns, classifier.feature_importances_).reset_index()
test.columns = ["Importance","Stat"]
test.sort_values(by = "Importance", ascending = False)

Unnamed: 0,Importance,Stat
10,0.084557,SB
8,0.075427,BB
1,0.069727,AB
2,0.067561,R
3,0.065837,H
7,0.064984,RBI
4,0.063735,2B
6,0.061497,HR
11,0.060075,CS
12,0.058834,AVG


In [300]:


# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

splitter = ['best','random']
max_features = ["auto", "sqrt", "log2"]

In [302]:
dt_pipe = Pipeline(
    [('dt', DecisionTreeClassifier())])
param_grid = [{'dt__max_features':max_features,
              'dt__max_depth':max_depth,
              'dt__min_samples_split':min_samples_split,
              'dt__min_samples_leaf':min_samples_leaf,
              'dt__splitter':splitter}]
gs = GridSearchCV(dt_pipe, param_grid)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('dt',
                                        DecisionTreeClassifier(ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features=None,
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                                               min_samples_leaf=1,
                                                               min_samples_split=2,
                                       

In [303]:
gs.best_params_

{'dt__max_depth': 80,
 'dt__max_features': 'auto',
 'dt__min_samples_leaf': 1,
 'dt__min_samples_split': 2,
 'dt__splitter': 'best'}

In [307]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [308]:
yhat_dt = dt.predict(X_test)
confusion_matrix(y_test, yhat_dt)

array([[ 71,   4,   5,   3,   8,  11,   3,  10,   1,   6],
       [  8, 120,  13,   4,  11,   3,   3,  11,   4,   4],
       [  3,  10,  31,   1,   5,   1,   1,   3,   5,   0],
       [ 11,   3,   3,  90,   2,  10,   3,   7,   2,   2],
       [  7,  16,   4,   2, 101,   6,   5,  15,   3,   1],
       [  6,   1,   3,   3,   1,  92,   2,  12,   0,   0],
       [  4,   2,   2,   5,   3,   6,  96,   8,   2,   3],
       [ 12,   8,   4,   6,  10,   2,   8,  58,   4,   3],
       [  3,   6,   5,   2,   2,   2,   1,   3,  18,   0],
       [  4,   3,   1,   2,   3,   2,   2,   4,   3,  19]], dtype=int64)

In [309]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_dt)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_dt)))
accuracy

0.6287262872628726

In [310]:
print(f1_score(y_test, yhat_dt, average="macro"))
print(precision_score(y_test, yhat_dt, average="macro"))
print(recall_score(y_test, yhat_dt, average="macro"))

0.5938009539702958
0.5962136170073842
0.594382984844134


In [311]:
ac = AdaBoostClassifier()
ac.fit(X_train, y_train)
yhat_ac = ac.predict(X_test)

In [312]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_ac)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_ac)))
accuracy

0.15718157181571815

In [313]:
print(f1_score(y_test, yhat_ac, average="macro"))
print(precision_score(y_test, yhat_ac, average="macro"))
print(recall_score(y_test, yhat_ac, average="macro"))

0.14319941685720342
0.19619299171777113
0.16818904741091495


In [314]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

yhat_nb = nb.predict(X_test)
confusion_matrix(y_test, yhat_nb)

ValueError: Negative values in data passed to MultinomialNB (input X)

In [240]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_nb)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_nb)))
accuracy

NameError: name 'yhat_nb' is not defined

In [241]:
print(f1_score(y_test, yhat_nb, average="macro"))
print(precision_score(y_test, yhat_nb, average="macro"))
print(recall_score(y_test, yhat_nb, average="macro"))

NameError: name 'yhat_nb' is not defined

In [339]:
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year,PLAYER_position,PLAYER_updated
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,...,28,6,2,0.351,0.490,0.695,1.185,2020,0,JuanJ SotoSotoLF
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,...,37,2,0,0.341,0.462,0.640,1.102,2020,1,FreddieF FreemanFreeman
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,...,60,0,0,0.338,0.431,0.636,1.067,2020,2,MarcellM OzunaOzunaDH
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,...,21,3,0,0.364,0.421,0.590,1.011,2020,3,DJD LeMahieuLeMahieu
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,...,43,10,3,0.292,0.386,0.607,0.993,2020,4,JoseJ RamírezRamirez
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,...,41,1,0,0.212,0.270,0.335,0.605,2019,4,EduardoE EscobarEscobar
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,...,75,3,0,0.203,0.238,0.360,0.599,2019,6,JavierJ BáezBaezSS
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,...,84,1,2,0.176,0.252,0.346,0.599,2019,1,EvanE WhiteWhite
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,...,54,16,5,0.232,0.301,0.292,0.593,2019,3,JonathanJ VillarVillar


In [359]:
position = []
for person in mlb["PLAYER"]:
    pattern = '(?<=[a-z\.])[A-Z1-9][A-Z]|C(?=\d+)'
    position.append(re.findall(pattern,person)[0])
    
mlb['PLAYER_position'] = position

In [360]:
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year,PLAYER_position,PLAYER_updated
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,...,28,6,2,0.351,0.490,0.695,1.185,2020,LF,JuanJ SotoSotoLF
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,...,37,2,0,0.341,0.462,0.640,1.102,2020,1B,FreddieF FreemanFreeman
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,...,60,0,0,0.338,0.431,0.636,1.067,2020,DH,MarcellM OzunaOzunaDH
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,...,21,3,0,0.364,0.421,0.590,1.011,2020,2B,DJD LeMahieuLeMahieu
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,...,43,10,3,0.292,0.386,0.607,0.993,2020,3B,JoseJ RamírezRamirez
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,...,41,1,0,0.212,0.270,0.335,0.605,2019,3B,EduardoE EscobarEscobar
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,...,75,3,0,0.203,0.238,0.360,0.599,2019,SS,JavierJ BáezBaezSS
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,...,84,1,2,0.176,0.252,0.346,0.599,2019,1B,EvanE WhiteWhite
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,...,54,16,5,0.232,0.301,0.292,0.593,2019,2B,JonathanJ VillarVillar


In [361]:
OF = ["LF","RF","CF","OF"]
IF = ["1B","2B","3B","SS","C"]
position = []
for player in mlb["PLAYER_position"]:
    if player in OF:
        position.append("OF")
    elif player in IF:
        position.append("IF")
    else:
        position.append("DH")
mlb["PLAYER_position"] = position

In [362]:
#Creating the dependent variable class
factor = pd.factorize(mlb['PLAYER_position'])
mlb.PLAYER_position = factor[0]
definitions = factor[1]
print(mlb.PLAYER_position.head())
print(definitions)

0    0
1    1
2    2
3    1
4    1
Name: PLAYER_position, dtype: int64
Index(['OF', 'IF', 'DH'], dtype='object')


In [363]:
#Splitting the data into independent and dependent variables
X = mlb.iloc[:,2:-2]
y = mlb.iloc[:,-2]
X["Year"] = pd.Categorical(X.Year)

In [364]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size = 0.25, random_state = 21, stratify = mlb.PLAYER_position)

In [365]:
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train2)
X_test2 = scaler.transform(X_test2)

In [367]:
knn = KNeighborsClassifier()
knn.fit(X_train2,y_train2)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [368]:
yhat_knn = knn.predict(X_test2)
confusion_matrix(y_test2, yhat_knn)
print(classification_report(y_test2, yhat_knn))
#print(roc_auc_score(y_test.reset_index(drop = True), yhat_knn, multi_class = 'ovr'))

              precision    recall  f1-score   support

           0       0.76      0.69      0.72       400
           1       0.79      0.86      0.82       647
           2       0.71      0.40      0.51        60

    accuracy                           0.78      1107
   macro avg       0.75      0.65      0.69      1107
weighted avg       0.77      0.78      0.77      1107



In [369]:
test = []
for index, value in enumerate(confusion_matrix(y_test2, yhat_knn)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test2, yhat_knn)))
accuracy

0.7768744354110207

In [370]:
print(f1_score(y_test2, yhat_knn, average="macro"))
print(precision_score(y_test2, yhat_knn, average="macro"))
print(recall_score(y_test2, yhat_knn, average="macro"))

0.686791405727488
0.7512479376355833
0.6524806800618238


In [378]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 5)]

n_estimators = 100
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [379]:
rf_pipe = Pipeline(
    [('rf', RandomForestClassifier())])
param_grid = [{'rf__n_estimators': [100],
              'rf__max_features':max_features,
              'rf__max_depth':max_depth,
              'rf__min_samples_split':min_samples_split,
              'rf__min_samples_leaf':min_samples_leaf,
              'rf__bootstrap':bootstrap}]
gs = GridSearchCV(rf_pipe, param_grid)
gs.fit(X_train2, y_train2)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                            

In [380]:
gs.best_params_

{'rf__bootstrap': True,
 'rf__max_depth': 70,
 'rf__max_features': 'auto',
 'rf__min_samples_leaf': 2,
 'rf__min_samples_split': 5,
 'rf__n_estimators': 100}

In [381]:
# Fitting Random Forest Classification to the Training set
rf = RandomForestClassifier(n_estimators = 2000, max_depth = 70, min_samples_leaf = 2, min_samples_split = 5)
rf.fit(X_train2, y_train2)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=70, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [382]:
yhat_rf = rf.predict(X_test2)
confusion_matrix(y_test2, yhat_rf)

array([[307,  92,   1],
       [ 62, 585,   0],
       [  1,  36,  23]], dtype=int64)

In [383]:
test = []
for index, value in enumerate(confusion_matrix(y_test2, yhat_rf)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test2, yhat_rf)))
accuracy

0.8265582655826558

In [374]:
print(f1_score(y_test2, yhat_rf, average="macro"))
print(precision_score(y_test2, yhat_rf, average="macro"))
print(recall_score(y_test2, yhat_rf, average="macro"))

0.734781798832521
0.85255971755532
0.687269448737764


In [384]:
# Predicting the Test set results
y_pred = rf.predict(X_test2)
reversefactor = dict(zip(range(10),definitions))
y_test_vector = np.vectorize(reversefactor.get)(y_test2)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test_vector, y_pred, rownames=['Actual Positions'], colnames=['Predicted Positions']))

Predicted Positions  DH   IF   OF
Actual Positions                 
DH                   23   36    1
IF                    0  585   62
OF                    1   92  307


In [385]:
rf.feature_importances_

array([0.05163922, 0.06619743, 0.062046  , 0.06372476, 0.06326442,
       0.03208878, 0.04820226, 0.05735079, 0.05507096, 0.05726525,
       0.09899894, 0.06463403, 0.06127846, 0.06163916, 0.06531742,
       0.06568072, 0.02560141])

In [386]:
print(list(zip(X.columns, classifier.feature_importances_)))


[('G', 0.052993247625239014), ('AB', 0.06972686705326339), ('R', 0.06756122704994139), ('H', 0.06583686120116572), ('2B', 0.06373547716904004), ('3B', 0.02951502082296631), ('HR', 0.06149663688913407), ('RBI', 0.06498413292472377), ('BB', 0.07542693918518935), ('SO', 0.04854746162054352), ('SB', 0.08455703489375979), ('CS', 0.06007461946870311), ('AVG', 0.05883372571449804), ('OBP', 0.04912065582116712), ('SLG', 0.05196689236847161), ('caret-upcaret-downOPS', 0.05693703984965394), ('Year', 0.03868616034253983)]


In [387]:
test = pd.DataFrame(X.columns, classifier.feature_importances_).reset_index()
test.columns = ["Importance","Stat"]
test.sort_values(by = "Importance", ascending = False)

Unnamed: 0,Importance,Stat
10,0.084557,SB
8,0.075427,BB
1,0.069727,AB
2,0.067561,R
3,0.065837,H
7,0.064984,RBI
4,0.063735,2B
6,0.061497,HR
11,0.060075,CS
12,0.058834,AVG


In [375]:
ac = AdaBoostClassifier()
ac.fit(X_train2, y_train2)
yhat_ac = ac.predict(X_test2)

In [376]:
test = []
for index, value in enumerate(confusion_matrix(y_test2, yhat_ac)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test2, yhat_ac)))
accuracy

0.6260162601626016

In [377]:
print(f1_score(y_test2, yhat_ac, average="macro"))
print(precision_score(y_test2, yhat_ac, average="macro"))
print(recall_score(y_test2, yhat_ac, average="macro"))

0.5553709852119114
0.5877842662488725
0.5346788596943156


In [390]:
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV(multi_class = 'ovr', solver = 'newton-cg')

In [392]:
lr.fit(X_train2, y_train2)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='ovr', n_jobs=None, penalty='l2',
                     random_state=None, refit=True, scoring=None,
                     solver='newton-cg', tol=0.0001, verbose=0)

In [394]:
yhat_lr = lr.predict(X_test2)

In [395]:
test = []
for index, value in enumerate(confusion_matrix(y_test2, yhat_lr)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test2, yhat_lr)))
accuracy

0.6242095754290876

In [400]:
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB()

In [402]:
gb.fit(X_train2, y_train2)
yhat_gb = gb.predict(X_test2)

In [403]:
test = []
for index, value in enumerate(confusion_matrix(y_test2, yhat_gb)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test2, yhat_gb)))
accuracy

0.5329719963866305

In [404]:
 from sklearn.svm import LinearSVC
    

In [405]:
lsvc = LinearSVC()
lsvc.fit(X_train2, y_train2)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
yhat_lsvc