## Read in Packages

In [229]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# 1. Prediction question that can be answered with data and a machine learning model: Can we guess what position a player is in baseball using how he did in that season?

# 2. Collect data to answer your question via webscraping, APIs and/or combining several readily available dataset (i.e. kaggle, uci ML repo, etc.). 
### In this case I webscraped the MLB's website and used a loop to do it for the last 30 years which are all on different web-pages

In [90]:
page1 = pd.read_html("https://www.mlb.com/stats/regular-season")
url = "https://www.mlb.com/stats/regular-season?page=2"
mlb = pd.DataFrame()
while len(pd.read_html(url)[0])>0:
    mlb = pd.concat([mlb, pd.read_html(url)[0]])
    num = int(url[-1])
    num +=1
    url = url.replace(url[-1],str(num))
mlb = pd.concat([page1[0], mlb])
mlb["Year"] = 2020
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,41,28,6,2,0.351,0.490,0.695,1.185,2020
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,45,37,2,0,0.341,0.462,0.640,1.102,2020
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,38,60,0,0,0.338,0.431,0.636,1.067,2020
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,18,21,3,0,0.364,0.421,0.590,1.011,2020
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,31,43,10,3,0.292,0.386,0.607,0.993,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,15,41,1,0,0.212,0.270,0.335,0.605,2020
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,7,75,3,0,0.203,0.238,0.360,0.599,2020
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,18,84,1,2,0.176,0.252,0.346,0.599,2020
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,19,54,16,5,0.232,0.301,0.292,0.593,2020


In [91]:
"https://www.mlb.com/stats/2019/regular-season"

years = np.arange(1990,2020)

for year in years:
    page1 = pd.read_html("https://www.mlb.com/stats/"+str(year)+"/regular-season")
    page1 = page1[0]
    page1["Year"] = year
    url = "https://www.mlb.com/stats/"+str(year)+"?page=2"
    test = pd.DataFrame()
    while len(pd.read_html(url)[0])>0:
        test = pd.concat([test, pd.read_html(url)[0]])
        num = int(url[-1])
        num +=1
        url = url.replace(url[-1],str(num))
    test["Year"] = year
    test = pd.concat([page1,test])
    mlb = pd.concat([mlb,test])
    
    
#https://www.mlb.com/stats/2019?page=2

In [5]:
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,41,28,6,2,0.351,0.490,0.695,1.185,2020
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,45,37,2,0,0.341,0.462,0.640,1.102,2020
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,38,60,0,0,0.338,0.431,0.636,1.067,2020
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,18,21,3,0,0.364,0.421,0.590,1.011,2020
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,31,43,10,3,0.292,0.386,0.607,0.993,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,15,41,1,0,0.212,0.270,0.335,0.605,2019
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,7,75,3,0,0.203,0.238,0.360,0.599,2019
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,18,84,1,2,0.176,0.252,0.346,0.599,2019
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,19,54,16,5,0.232,0.301,0.292,0.593,2019


# 3. Clean / wrangle your data

In [6]:
mlb["PLAYER"]

0                  1JuanJ SotoSotoLF1‌‌‌
1         2FreddieF FreemanFreeman1B2‌‌‌
2             3MarcellM OzunaOzunaDH3‌‌‌
3            4DJD LeMahieuLeMahieu2B4‌‌‌
4            5JoseJ RamírezRamirez3B5‌‌‌
                     ...                
12    138EduardoE EscobarEscobar3B138‌‌‌
13           139JavierJ BáezBaezSS139‌‌‌
14           140EvanE WhiteWhite1B140‌‌‌
15     141JonathanJ VillarVillar2B141‌‌‌
16          142NickyN LopezLopez2B142‌‌‌
Name: PLAYER, Length: 4503, dtype: object

In [61]:
pattern = '(?<=[\d+])\w+.\w+(?=[A-Z]+)'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, '1JuanJ SotoSotoLF1\u200c\u200c\u200c')

['JuanJ SotoSotoL']

In [65]:
pattern = '(?<=[\d+])\w+.\w+(?=(?:[\d+]))'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, mlb['PLAYER'].iloc[0])

['JuanJ SotoSotoLF']

In [66]:
pattern = '[A-Z]+(?=\d+)'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, mlb['PLAYER'].iloc[0])

['LF']

In [76]:
pattern = '(?<=[\d+])\w+.[A-Z]\w+(?=[A-Z]+)'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, mlb['PLAYER'].iloc[4])

['JoseJ RamírezRamirez3']

In [123]:
pattern = '(?<=[a-z\.])[A-Z1-9][A-Z]|C(?=\d+)'
#pattern = '[a-z][A-Z](?=[A-Z]+)'
#'(?<=\d+)\w+'
re.findall(pattern, mlb['PLAYER'].iloc[28])

['LF']

In [120]:
mlb['PLAYER'].iloc[38]

'39J.T.J RealmutoRealmutoC39\u200c\u200c\u200c'

In [93]:
player = []
for person in mlb["PLAYER"]:
    pattern = '(?<=[\d+])\D+(?=(?:[\d+]))'
    player.append(re.findall(pattern,person)[0])
    
mlb['PLAYER_updated'] = player

In [126]:
position = []
for person in mlb["PLAYER"]:
    pattern = '(?<=[a-z\.])[A-Z1-9][A-Z]|C(?=\d+)'
    position.append(re.findall(pattern,person)[0])
    
mlb['PLAYER_position'] = position

In [128]:
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year,PLAYER_position,PLAYER_updated
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,...,28,6,2,0.351,0.490,0.695,1.185,2020,LF,JuanJ SotoSotoLF
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,...,37,2,0,0.341,0.462,0.640,1.102,2020,1B,FreddieF FreemanFreeman
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,...,60,0,0,0.338,0.431,0.636,1.067,2020,DH,MarcellM OzunaOzunaDH
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,...,21,3,0,0.364,0.421,0.590,1.011,2020,2B,DJD LeMahieuLeMahieu
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,...,43,10,3,0.292,0.386,0.607,0.993,2020,3B,JoseJ RamírezRamirez
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,...,41,1,0,0.212,0.270,0.335,0.605,2019,3B,EduardoE EscobarEscobar
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,...,75,3,0,0.203,0.238,0.360,0.599,2019,SS,JavierJ BáezBaezSS
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,...,84,1,2,0.176,0.252,0.346,0.599,2019,1B,EvanE WhiteWhite
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,...,54,16,5,0.232,0.301,0.292,0.593,2019,2B,JonathanJ VillarVillar


In [130]:
#Creating the dependent variable class
factor = pd.factorize(mlb['PLAYER_position'])
mlb.PLAYER_position = factor[0]
definitions = factor[1]
print(mlb.PLAYER_position.head())
print(definitions)

0    0
1    1
2    2
3    3
4    4
Name: PLAYER_position, dtype: int64
Index(['LF', '1B', 'DH', '2B', '3B', 'CF', 'SS', 'RF', 'C', 'OF'], dtype='object')


In [131]:
mlb

Unnamed: 0,PLAYER,TEAM,G,AB,R,H,2B,3B,HR,RBI,...,SO,SB,CS,AVG,OBP,SLG,caret-upcaret-downOPS,Year,PLAYER_position,PLAYER_updated
0,1JuanJ SotoSotoLF1‌‌‌,WSH,47,154,39,54,14,0,13,37,...,28,6,2,0.351,0.490,0.695,1.185,2020,0,JuanJ SotoSotoLF
1,2FreddieF FreemanFreeman1B2‌‌‌,ATL,60,214,51,73,23,1,13,53,...,37,2,0,0.341,0.462,0.640,1.102,2020,1,FreddieF FreemanFreeman
2,3MarcellM OzunaOzunaDH3‌‌‌,ATL,60,228,38,77,14,0,18,56,...,60,0,0,0.338,0.431,0.636,1.067,2020,2,MarcellM OzunaOzunaDH
3,4DJD LeMahieuLeMahieu2B4‌‌‌,NYY,50,195,41,71,10,2,10,27,...,21,3,0,0.364,0.421,0.590,1.011,2020,3,DJD LeMahieuLeMahieu
4,5JoseJ RamírezRamirez3B5‌‌‌,CLE,58,219,45,64,16,1,17,46,...,43,10,3,0.292,0.386,0.607,0.993,2020,4,JoseJ RamírezRamirez
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12,138EduardoE EscobarEscobar3B138‌‌‌,ARI,54,203,22,43,7,3,4,20,...,41,1,0,0.212,0.270,0.335,0.605,2019,4,EduardoE EscobarEscobar
13,139JavierJ BáezBaezSS139‌‌‌,CHC,59,222,27,45,9,1,8,24,...,75,3,0,0.203,0.238,0.360,0.599,2019,6,JavierJ BáezBaezSS
14,140EvanE WhiteWhite1B140‌‌‌,SEA,54,182,19,32,7,0,8,26,...,84,1,2,0.176,0.252,0.346,0.599,2019,1,EvanE WhiteWhite
15,141JonathanJ VillarVillar2B141‌‌‌,TOR,52,185,13,43,5,0,2,15,...,54,16,5,0.232,0.301,0.292,0.593,2019,3,JonathanJ VillarVillar


In [184]:
#Splitting the data into independent and dependent variables
X = mlb.iloc[:,2:-2]
y = mlb.iloc[:,-2]
X["Year"] = pd.Categorical(X.Year)

In [219]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21, stratify = mlb.PLAYER_position)

In [220]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [230]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [231]:
yhat_knn = knn.predict(X_test)
confusion_matrix(y_test, yhat_knn)

array([[ 77,  15,   1,   5,   9,   3,   4,   6,   0,   2],
       [ 17, 132,   8,   0,  11,   1,   2,   5,   3,   2],
       [  5,  19,  28,   1,   3,   0,   0,   3,   1,   0],
       [  8,   4,   1,  90,  10,   6,   9,   5,   0,   0],
       [  7,  25,   6,   7,  99,   3,   1,  10,   1,   1],
       [  5,   4,   3,   4,   3,  88,   4,   7,   0,   2],
       [  2,   5,   0,   5,   4,   9,  99,   6,   1,   0],
       [ 15,  22,   2,   3,  13,   3,   3,  53,   1,   0],
       [  1,  12,   3,   3,   1,   0,   0,   2,  20,   0],
       [  8,   7,   2,   4,   5,   5,   4,   0,   1,   7]], dtype=int64)

In [232]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_knn)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_knn)))
accuracy

0.6260162601626016

In [233]:
print(f1_score(y_test, yhat_knn, average="macro"))
print(precision_score(y_test, yhat_knn, average="macro"))
print(recall_score(y_test, yhat_knn, average="macro"))

0.5843215689026782
0.6244770172436748
0.5711446968184697


In [234]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [236]:
yhat_dt = dt.predict(X_test)
confusion_matrix(y_test, yhat_dt)
sum(sum(confusion_matrix(y_test, yhat_dt)))

1107

In [237]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_dt)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_dt)))
accuracy

0.6214995483288166

In [238]:
print(f1_score(y_test, yhat_dt, average="macro"))
print(precision_score(y_test, yhat_dt, average="macro"))
print(recall_score(y_test, yhat_dt, average="macro"))

0.5911995699972052
0.5953568773695456
0.5889999424249622


In [221]:
# Fitting Random Forest Classification to the Training set
rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [222]:
yhat_rf = rf.predict(X_test)
confusion_matrix(y_test, yhat_rf)
sum(sum(confusion_matrix(y_test, yhat_rf)))

1107

In [223]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_rf)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_rf)))
accuracy

0.6368563685636857

In [224]:
print(f1_score(y_test, yhat_rf, average="macro"))
print(precision_score(y_test, yhat_rf, average="macro"))
print(recall_score(y_test, yhat_rf, average="macro"))

0.605817470536852
0.6373450595042349
0.5906529927635851


In [225]:
# Predicting the Test set results
y_pred = rf.predict(X_test)
reversefactor = dict(zip(range(10),definitions))
y_test_vector = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test_vector, y_pred, rownames=['Actual Positions'], colnames=['Predicted Positions']))

Predicted Positions   1B  2B   3B   C  CF  DH  LF  OF  RF  SS
Actual Positions                                             
1B                   135   4   13   0   3   8   8   1   8   1
2B                     3  90    6   0   7   2   5   3   8   9
3B                    24   5  101   2   2   4  12   0   7   3
C                      8   3    2  16   0   7   4   0   2   0
CF                     0   5    2   0  83   1  12   2   9   6
DH                    15   2    4   3   0  28   6   1   0   1
LF                    20   2    8   1   4   2  74   3   6   2
OF                     1   3    6   0   4   0   5  18   3   3
RF                    12   4   11   0   3   2  13   2  63   5
SS                     1  10    6   0   4   1   9   1   2  97


In [226]:
rf.feature_importances_

array([0.05357534, 0.06590779, 0.0633963 , 0.06298683, 0.05757098,
       0.0305421 , 0.05850467, 0.06509777, 0.08014582, 0.05172646,
       0.08833672, 0.05142844, 0.06362651, 0.05197244, 0.0585361 ,
       0.05621334, 0.04043239])

In [227]:
print(list(zip(X.columns, classifier.feature_importances_)))


[('G', 0.052993247625239014), ('AB', 0.06972686705326339), ('R', 0.06756122704994139), ('H', 0.06583686120116572), ('2B', 0.06373547716904004), ('3B', 0.02951502082296631), ('HR', 0.06149663688913407), ('RBI', 0.06498413292472377), ('BB', 0.07542693918518935), ('SO', 0.04854746162054352), ('SB', 0.08455703489375979), ('CS', 0.06007461946870311), ('AVG', 0.05883372571449804), ('OBP', 0.04912065582116712), ('SLG', 0.05196689236847161), ('caret-upcaret-downOPS', 0.05693703984965394), ('Year', 0.03868616034253983)]


In [228]:
test = pd.DataFrame(X.columns, classifier.feature_importances_).reset_index()
test.columns = ["Importance","Stat"]
test.sort_values(by = "Importance", ascending = False)

Unnamed: 0,Importance,Stat
10,0.084557,SB
8,0.075427,BB
1,0.069727,AB
2,0.067561,R
3,0.065837,H
7,0.064984,RBI
4,0.063735,2B
6,0.061497,HR
11,0.060075,CS
12,0.058834,AVG


In [234]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [236]:
yhat_dt = dt.predict(X_test)
confusion_matrix(y_test, yhat_dt)
sum(sum(confusion_matrix(y_test, yhat_dt)))

1107

In [237]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_dt)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_dt)))
accuracy

0.6214995483288166

In [238]:
print(f1_score(y_test, yhat_dt, average="macro"))
print(precision_score(y_test, yhat_dt, average="macro"))
print(recall_score(y_test, yhat_dt, average="macro"))

0.5911995699972052
0.5953568773695456
0.5889999424249622


In [None]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

yhat_nb = nb.predict(X_test)
confusion_matrix(y_test, yhat_nb)

In [None]:
test = []
for index, value in enumerate(confusion_matrix(y_test, yhat_nb)):

    test.append(value[index])
    
accuracy = sum(test)/sum(sum(confusion_matrix(y_test, yhat_nb)))
accuracy

In [None]:

print(f1_score(y_test, yhat_dt, average="macro"))
print(precision_score(y_test, yhat_dt, average="macro"))
print(recall_score(y_test, yhat_dt, average="macro"))