In [1]:
import pandas as pd
import numpy as np
import os
from acquire import get_hwyrail, get_equiprail
from prepare import prep_hwy_df, prep_equip_df
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from scipy import stats
from sklearn.model_selection import train_test_split

from datetime import date, datetime
from explore import train_validate_test_split, freq_table


# modeling methods
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz
from graphviz import Graph
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

from preprocessing import get_object_cols, get_numeric_X_cols, train_validate_test, min_max_scale, get_dummies

In [2]:
# Load the highway-rail data through the project's `acquire` helper.
# NOTE(review): the data source (file vs. database) is decided inside get_hwyrail — confirm there.
df = get_hwyrail()

In [3]:
# Run the project's `prepare` step on the loaded frame; `df` is rebound to the result,
# so re-running this cell feeds already-prepared data back into prep_hwy_df.
df = prep_hwy_df(df)

In [4]:
# `get_dummies` here is the project helper imported from `preprocessing`, not `pd.get_dummies`.
# NOTE(review): which columns it encodes is defined in that module — confirm there.
df = get_dummies(df)

In [5]:
# Inspect column names, non-null counts and dtypes after preparation and encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12181 entries, 0 to 12180
Data columns (total 55 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   railroad_company    12181 non-null  object        
 1   station             12181 non-null  object        
 2   county              12181 non-null  object        
 3   state_fips          12181 non-null  int64         
 4   region              12181 non-null  int64         
 5   city                12181 non-null  object        
 6   vehicle_speed       12181 non-null  float64       
 7   vehicle_type        12181 non-null  object        
 8   vehicle_direction   12181 non-null  object        
 9   position            12181 non-null  object        
 10  accident_type       12181 non-null  int64         
 11  hazmat_entity       12181 non-null  object        
 12  temp                12181 non-null  int64         
 13  visibility          12181 non-null  int64     

In [6]:
# Keep only the target (railroad_company) and the three features used for modeling,
# then confirm the dtypes and non-null counts of the narrowed frame.
model_columns = ['railroad_company', 'fips', 'railcar_quantity', 'vehicle_damage']
hwydf = df[model_columns]
hwydf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12181 entries, 0 to 12180
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   railroad_company  12181 non-null  object 
 1   fips              12181 non-null  int64  
 2   railcar_quantity  12181 non-null  int64  
 3   vehicle_damage    12181 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 475.8+ KB


In [7]:
# Split hwydf into train / validate / test with 'railroad_company' as the target.
# The project helper returns eight objects: the train and validate frames plus an
# X/y pair for each of the three splits.
# NOTE(review): split proportions, stratification and seed live inside the helper — confirm there.
train, validate, X_train, y_train, X_validate, y_validate, X_test, y_test = train_validate_test(hwydf, 'railroad_company')

In [8]:
# Sanity-check the (rows, columns) of each feature split: train, validate, test.
tuple(split.shape for split in (X_train, X_validate, X_test))

((6820, 3), (2924, 3), (2437, 3))

In [9]:
# Collect column names from hwydf via the project helper.
# Presumably these are the object-dtype (non-numeric) columns — verify in `preprocessing`.
object_cols = get_object_cols(hwydf)

In [10]:
# Derive the feature columns to scale from X_train, given the object columns found above.
# Presumably this returns the numeric columns of X_train not in object_cols — verify in `preprocessing`.
numeric_cols = get_numeric_X_cols(X_train, object_cols)

In [11]:
# Scale the selected numeric columns of all three feature splits with the project helper.
# NOTE(review): confirm the scaler is fit on X_train only and merely applied to
# validate/test, otherwise information leaks from the held-out splits.
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

In [12]:
# Display the scaled training features to eyeball the value ranges after scaling.
X_train_scaled

Unnamed: 0,fips,railcar_quantity,vehicle_damage
2108,0.290363,0.342776,0.001333
1914,0.145181,0.410765,0.008667
6197,0.002503,0.246459,0.015467
7030,0.195244,0.110482,0.002267
420,0.035044,0.169972,0.002667
...,...,...,...
9383,0.122653,0.025496,0.001333
2434,0.120150,0.147309,0.004000
6351,0.165207,0.260623,0.020000
3450,0.042553,0.161473,0.013333


## Baseline

In [13]:
# Class distribution of the target across the full dataset (most frequent first).
df['railroad_company'].value_counts()

UP      2959
CSX     2825
NS      2715
BNSF    2274
ATK      956
KCS      452
Name: railroad_company, dtype: int64

In [14]:
# Baseline = always predict the most common railroad company; its accuracy is
# that class's share of all rows, reported as a whole-number percentage.
company_counts = df['railroad_company'].value_counts()
baseline_pct = round(max(company_counts) / len(df) * 100)
print(f'Baseline Accuracy: {baseline_pct}%')

Baseline Accuracy: 24%


--------

## Logistic Regression

-----------

In [15]:
# Logistic regression with default hyperparameters, trained on the scaled
# training features (fit returns the estimator itself, so the call is chained).
lm = LogisticRegression().fit(X_train_scaled, y_train)

# One frame per split holding the true label next to the model's prediction.
# The test frame is predicted here as well but is not reported in this cell.
train = pd.DataFrame({'actual': y_train}).assign(predicted=lm.predict(X_train_scaled))
validate = pd.DataFrame({'actual': y_validate}).assign(predicted=lm.predict(X_validate_scaled))
test = pd.DataFrame({'actual': y_test}).assign(predicted=lm.predict(X_test_scaled))

# Report on the data the model was trained on ...
print('In-sample data model performance:')
print(classification_report(train['actual'], train['predicted'], zero_division=0))
print('----------------')
# ... and on the validate split it has not seen.
print('Out-of-sample data model performance:')
print(classification_report(validate['actual'], validate['predicted'], zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.02      0.01      0.01       536
        BNSF       0.00      0.00      0.00      1246
        CSX        0.24      0.11      0.15      1553
        KCS        0.00      0.00      0.00       251
        NS         0.26      0.45      0.33      1513
        UP         0.32      0.62      0.42      1721

    accuracy                           0.28      6820
   macro avg       0.14      0.20      0.15      6820
weighted avg       0.20      0.28      0.22      6820

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.05      0.01      0.02       241
        BNSF       0.00      0.00      0.00       567
        CSX        0.27      0.13      0.18       705
        KCS        0.00      0.00      0.00       104
        NS         0.26      0.47      0.33       647
        UP         0.29      0.61      0.40

-------

## KNN

----------

In [16]:
# k-nearest neighbours with k = 5 and distance-weighted votes, trained on the
# scaled training features (fit returns the estimator itself).
knn = KNeighborsClassifier(weights='distance', n_neighbors=5).fit(X_train_scaled, y_train)

# True labels per split; predictions are attached for train and validate only,
# the test frame keeps just the actual labels.
train = pd.DataFrame({'actual': y_train}).assign(predicted=knn.predict(X_train_scaled))
validate = pd.DataFrame({'actual': y_validate}).assign(predicted=knn.predict(X_validate_scaled))
test = pd.DataFrame({'actual': y_test})

# Report on the data the model was trained on ...
print('In-sample data model performance:')
print(classification_report(train['actual'], train['predicted'], zero_division=0))
print('----------------')
# ... and on the validate split it has not seen.
print('Out-of-sample data model performance:')
print(classification_report(validate['actual'], validate['predicted'], zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.97      1.00      0.99       536
        BNSF       0.97      1.00      0.99      1246
        CSX        0.99      0.99      0.99      1553
        KCS        0.99      0.99      0.99       251
        NS         0.99      0.98      0.99      1513
        UP         1.00      0.98      0.99      1721

    accuracy                           0.99      6820
   macro avg       0.99      0.99      0.99      6820
weighted avg       0.99      0.99      0.99      6820

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.58      0.68      0.63       241
        BNSF       0.40      0.40      0.40       567
        CSX        0.41      0.41      0.41       705
        KCS        0.09      0.03      0.04       104
        NS         0.33      0.30      0.31       647
        UP         0.41      0.46      0.43

--------

## Random Forest

--------

In [17]:
# Random forest: 100 gini trees, depth capped at 10, at least 2 samples per leaf.
# random_state is pinned so the fitted forest is reproducible across runs.
rf = RandomForestClassifier(
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    min_samples_leaf=2,
    n_estimators=100,
    max_depth=10,
    random_state=123,
)

# Fit on the scaled training features
rf.fit(X_train_scaled, y_train)

# In-sample predictions
y_pred = rf.predict(X_train_scaled)

# Evaluate on train. zero_division=0 mirrors the validate report below and the
# logistic-regression / KNN cells: a class that is never predicted scores 0
# instead of raising UndefinedMetricWarning.
print('In-sample data model performance:')
print(classification_report(y_train, y_pred, zero_division=0))


# Out-of-sample predictions on the validate split (y_pred is rebound)
y_pred = rf.predict(X_validate_scaled)
print('----------------')
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.67      0.89      0.77       536
        BNSF       0.70      0.76      0.73      1246
        CSX        0.90      0.69      0.78      1553
        KCS        0.95      0.08      0.15       251
        NS         0.59      0.67      0.63      1513
        UP         0.67      0.72      0.69      1721

    accuracy                           0.70      6820
   macro avg       0.75      0.64      0.63      6820
weighted avg       0.72      0.70      0.69      6820

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.58      0.79      0.67       241
        BNSF       0.62      0.66      0.64       567
        CSX        0.79      0.58      0.67       705
        KCS        0.00      0.00      0.00       104
        NS         0.49      0.53      0.51       647
        UP         0.51      0.60      0.55

---------

## Decision Tree

-----------

In [18]:
# Shallow decision tree: entropy splits, depth capped at 5, fixed seed for reproducibility
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5, random_state=123)

# Fit on the scaled training features
clf = clf.fit(X_train_scaled, y_train)

# In-sample predictions
y_pred = clf.predict(X_train_scaled)

# Evaluate on train. The tree never predicts KCS, so that class's precision is
# undefined; zero_division=0 reports it as 0 without an UndefinedMetricWarning,
# matching the validate report below.
print('In-sample data model performance:')
print(classification_report(y_train, y_pred, zero_division=0))

# Out-of-sample predictions on the validate split (y_pred is rebound)
y_pred = clf.predict(X_validate_scaled)

# Evaluate on validate
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.45      0.95      0.61       536
        BNSF       0.63      0.65      0.64      1246
        CSX        0.89      0.48      0.62      1553
        KCS        0.00      0.00      0.00       251
        NS         0.41      0.52      0.45      1513
        UP         0.49      0.46      0.48      1721

    accuracy                           0.53      6820
   macro avg       0.48      0.51      0.47      6820
weighted avg       0.57      0.53      0.53      6820

Out-of-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.45      0.92      0.60       241
        BNSF       0.63      0.64      0.64       567
        CSX        0.88      0.47      0.61       705
        KCS        0.00      0.00      0.00       104
        NS         0.41      0.53      0.46       647
        UP         0.48      0.46      0.47       660

    a

In [19]:
# Score the fitted random forest (rf) on the held-out test split
y_pred = rf.predict(X_test_scaled)

# These are test-set results, so label them out-of-sample (the heading
# previously said "In-sample"). zero_division=0 keeps the report free of
# warnings if a class is never predicted.
print('Out-of-sample (test) data model performance:')
print(classification_report(y_test, y_pred, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.59      0.83      0.69       179
        BNSF       0.63      0.70      0.67       461
        CSX        0.80      0.60      0.69       567
        KCS        0.33      0.01      0.02        97
        NS         0.47      0.53      0.49       555
        UP         0.50      0.54      0.52       578

    accuracy                           0.58      2437
   macro avg       0.55      0.54      0.51      2437
weighted avg       0.59      0.58      0.57      2437

