# Logistic Regression in Python

Learning Python

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import RepeatedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import preprocessing
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
os.chdir("C:\\Users\\Matt\\Documents\\Python_Projects")

In [3]:
pwd

'C:\\Users\\Matt\\Documents\\Python_Projects'

In [4]:
baseball_train = pd.read_csv(r"baseball_train.csv",index_col=0,
                             dtype={'Opp': 'category', 'Result': 'category', 'Name': 'category'}, header=0)
baseball_test = pd.read_csv(r"baseball_test.csv",index_col=0,
                            dtype={'Opp': 'category', 'Result': 'category', 'Name': 'category'}, header=0)
print(baseball_test.head())
encoded_categories = dict(enumerate(baseball_test.Name.cat.categories))
print(encoded_categories)

      Opp  DR   IP   H  R  ER  BB  SO  HR  HBP  ...  CS  PO  X2B  X3B  IBB  \
788   DET   4  4.2   3  5   5   7   5   2    0  ...   0   0    0    0    0   
1463  DET   5  6.0   7  2   2   1   5   0    0  ...   1   0    1    0    0   
1272  TOR   4  9.0  10  5   5   6   6   0    0  ...   1   0    1    0    0   
639   PIT   5  7.0   2  0   0   5   6   0    0  ...   1   0    0    0    0   
41    ATL   4  2.1   0  0   0   1   0   0    0  ...   0   0    0    0    0   

      GDP  SF  ROE  Result   Name  
788     0   0    0       L  Nolan  
1463    1   0    0       L  Tommy  
1272    1   0    0       L  Tommy  
639     1   0    0       W  Nolan  
41      0   0    1       W  Nolan  

[5 rows x 28 columns]
{0: 'Nolan', 1: 'Tommy'}


In [5]:
X = baseball_train.iloc[:,:-1]
X = X.drop(['Opp','Result'],axis=1)
X_scaled = preprocessing.scale(X)
y = baseball_train.iloc[:,-1]

# Create logistic regression
logit = LogisticRegression(fit_intercept=True)

# Create repeated kfold
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=21191)

# Do repeated k-fold cross-validation
cv_results = cross_val_score(logit,
                             X_scaled,
                             y,
                             cv=rkf,
                             scoring="roc_auc")

  This is separate from the ipykernel package so we can avoid doing imports until


# Repeated K-Fold Cross Validation

In [6]:
print(cv_results.min())
print(np.percentile(cv_results, 25))
print(cv_results.mean())
print(np.percentile(cv_results, 50))
print(np.percentile(cv_results, 75))
print(cv_results.max())

0.9528705724357899
0.9615656446540881
0.9669098554298791
0.966001901675526
0.9724200712028797
0.9801243802000168


In [7]:
model = logit.fit(X_scaled,y)

intercept = model.intercept_[0]

print("intercept = {}".format(intercept))
for idx, col_name in enumerate(X.columns):
    print("{} = {}".format(col_name, model.coef_[0][idx]))

intercept = -0.5442528190372741
DR = 0.19303169134961257
IP = -0.3217307281537315
H = 0.62027917893443
R = -0.6302400324747851
ER = -0.03378545218684243
BB = -1.0099327019247948
SO = -1.966148484593929
HR = 0.05372705984100189
HBP = -0.05660186321333444
ERA = -0.2131987636961451
BF = -0.13783343362321548
GB = 2.1982709392611306
FB = -1.0837530975652592
LD = 0.43992433562689254
PU = -0.5245051439159379
Unk = 0.8226412145312731
SB = -0.535448754974902
CS = 0.1305539222199926
PO = -0.22076071468841496
X2B = -0.00047088924832914755
X3B = -0.01630430881607756
IBB = 0.21234604265601845
GDP = 0.1509776805643962
SF = -0.029957463266641927
ROE = -0.007653828997020928


In [8]:
Xnew = baseball_test.iloc[:,:-1]
Xnew = Xnew.drop(['Opp','Result'],axis=1)
Xnew_scaled = preprocessing.scale(Xnew)
yTrue = baseball_test.iloc[:,-1]

# make a prediction
ynew = model.predict(Xnew_scaled)
# show the inputs and predicted outputs
#for i in range(len(Xnew)):
#    print("Predicted=%s" % (ynew[i]))
    
baseball = {'predicted': ynew, 'truth': yTrue}
pd.DataFrame(data=baseball)
metrics.accuracy_score(yTrue, ynew)

  This is separate from the ipykernel package so we can avoid doing imports until


0.8