In [1]:
import pandas as pd
import numpy as np

import acquire
import prepare
import preprocessing

#Visualization Imports
import matplotlib.pyplot as plt
import seaborn as sns

#Sklearn Tools and Modules
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import sklearn.preprocessing

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Acquire the Data
# Load the dataset through the project-local `acquire.get_hwyrail()` helper.
df = acquire.get_hwyrail()
# Show (rows, columns) of the freshly acquired frame as the cell output.
df.shape

(18995, 104)

In [3]:
#Prepare the data
# NOTE: this rebinds `df` to its prepared version, so the cell is not idempotent --
# re-running it alone feeds the already-prepared frame back into prep_hwy_df.
df = prepare.prep_hwy_df(df)
# Show (rows, columns) after preparation.
df.shape

(12181, 49)

In [30]:
# Keep only the columns used for modeling: four numeric measurements,
# the railroad company, and the driver's gender.
hwy_df = df.loc[:, ['temp', 'total_killed', 'railcar_quantity', 'vehicle_speed',
                    'railroad_company', 'driver_gender']]
# Summarize dtypes and non-null counts of the selected columns.
hwy_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12181 entries, 0312RS009 to 193825
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   temp              12181 non-null  int64  
 1   total_killed      12181 non-null  int64  
 2   railcar_quantity  12181 non-null  int64  
 3   vehicle_speed     12181 non-null  float64
 4   railroad_company  12181 non-null  object 
 5   driver_gender     12181 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 666.1+ KB


In [31]:
#Split the Data
# Pass the selected columns and the column name 'railroad_company' to the
# project-local splitter and unpack the eight objects it returns.
# NOTE(review): presumably 'railroad_company' is the prediction target and the
# X_*/y_* pairs are the feature/target halves of each split -- confirm in preprocessing.
train, validate, X_train, y_train, X_validate, y_validate, X_test, y_test = preprocessing.train_validate_test(hwy_df, 'railroad_company')

In [32]:
# Sanity check: (rows, columns) of X_train after the split.
X_train.shape

(6820, 5)

In [33]:
# Sanity check: y_train should have the same number of rows as X_train.
y_train.shape

(6820,)

In [34]:
# Sanity check: (rows, columns) of X_validate after the split.
X_validate.shape

(2924, 5)

In [35]:
# Build two column-name lists with the project-local helpers: the first is derived
# from hwy_df, the second from X_train given the first list.
# NOTE(review): presumably object_cols holds the object-dtype columns and numeric_cols
# the remaining numeric feature columns -- confirm against preprocessing.
object_cols = preprocessing.get_object_cols(hwy_df)
numeric_cols = preprocessing.get_numeric_X_cols(X_train, object_cols)

In [36]:
def min_max_scale(X_train, X_validate, X_test, numeric_cols):
    '''
    Min-max scale the numeric columns of three dataframes with one shared scaler.

    The scaler is fit on X_train only (its per-column min and max are learned there)
    and then applied to all three dataframes, so no information from the validate
    or test rows influences the scaling.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Dataframe the scaler is fit on.
    X_validate, X_test : pandas.DataFrame
        Dataframes transformed with the scaler fit on X_train.
    numeric_cols : list of str
        Names of the numeric columns to scale; they must exist in all three
        dataframes (the scaler can only work with numeric columns).

    Returns
    -------
    tuple of pandas.DataFrame
        (X_train_scaled, X_validate_scaled, X_test_scaled). Each contains only
        `numeric_cols` and carries the index of the corresponding input.
        Validate/test values can fall outside [0, 1] when they lie beyond the
        range seen in X_train.
    '''
    # fit on the training rows only: this is where the column mins and maxes are learned
    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(X_train[numeric_cols])

    def _scale(frame):
        # transform() returns a bare array; rebuild a dataframe on the original
        # index so row labels (and the index name) survive the round trip
        return pd.DataFrame(scaler.transform(frame[numeric_cols]),
                            columns=numeric_cols,
                            index=frame.index)

    return _scale(X_train), _scale(X_validate), _scale(X_test)

In [37]:
# Scale the three feature sets; min_max_scale fits its scaler on X_train only and
# returns frames holding just the numeric_cols columns.
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

In [38]:
# Display the scaled training features to eyeball the result of the scaling step.
X_train_scaled

Unnamed: 0,temp,total_killed,railcar_quantity,vehicle_speed
1113FW028,0.636364,0.0,0.000000,0.049505
TX0813202,0.757576,0.0,0.181303,0.000000
0316SA020,0.719697,0.0,0.000000,0.297030
0217LK004,0.606061,0.0,0.138810,0.000000
97867,0.492424,0.0,0.025496,0.000000
...,...,...,...,...
155511,0.878788,0.0,0.019830,0.247525
KS0513202,0.628788,0.2,0.283286,0.099010
NE0716200,0.909091,0.0,0.382436,0.049505
1014PD018,0.583333,0.0,0.308782,0.000000


### Creating the baseline

In [39]:
# Count rows per railroad company in the full prepared frame to see the class balance.
df.railroad_company.value_counts()

UP      2959
CSX     2825
NS      2715
BNSF    2274
ATK      956
KCS      452
Name: railroad_company, dtype: int64

In [40]:
# Baseline = always predict the most frequent railroad company.
# It is measured on the training target only, so validate/test rows never
# inform the benchmark the models are compared against.
baseline_accuracy = pd.Series(y_train).value_counts(normalize=True).max()
print(f'Baseline Accuracy: {round(baseline_accuracy * 100)}%')

Baseline Accuracy: 24%


### Logistic Regression Model

In [41]:
#Using the scaled data, make a logistic regression model
lm = LogisticRegression()

#Fit the lm object to the scaled training features
lm.fit(X_train_scaled, y_train)

#Create actual-vs-predicted dataframes for the in-sample and out-of-sample sets
#NOTE: `train` and `validate` are rebound here and no longer refer to the
#frames returned by the split cell.
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))

#Form predictions using the lm model
#The test set is deliberately not predicted here: it stays untouched until a
#final model has been chosen.
train['predicted'] = lm.predict(X_train_scaled)
validate["predicted"] = lm.predict(X_validate_scaled)

#Review how the lm model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted, zero_division=0))
print('----------------')
#Review how the lm model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.37      0.25      0.29       541
        BNSF       0.33      0.11      0.17      1310
        CSX        0.30      0.12      0.17      1537
        KCS        0.00      0.00      0.00       257
        NS         0.32      0.55      0.40      1525
        UP         0.32      0.53      0.40      1650

    accuracy                           0.32      6820
   macro avg       0.27      0.26      0.24      6820
weighted avg       0.31      0.32      0.28      6820

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.38      0.26      0.31       233
        BNSF       0.26      0.10      0.14       518
        CSX        0.29      0.10      0.15       700
        KCS        0.00      0.00      0.00       111
        NS         0.29      0.53      0.38       645
        UP         0.32      0.52      0.39

### Takeaway
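- My LR model beat my baseline (.24) by about .08 in-sample (accuracy .32); it never predicted KCS correctly (precision and recall of 0.00 on both train and validate)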

### Decision Tree Model

In [42]:
#Make the CLF object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=9, random_state=123)

#Fit the model on the training set (fit() trains the object in place, so no reassignment is needed)
clf.fit(X_train_scaled, y_train)

#Make predictions on the same rows the tree was trained on
y_pred = clf.predict(X_train_scaled)

#Evaluate model performance on training data
#zero_division=0 matches the other reports: a class that is never predicted scores 0
print('In-sample data model performance:')
print(classification_report(y_train, y_pred, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.69      0.82      0.75       541
        BNSF       0.54      0.29      0.38      1310
        CSX        0.41      0.40      0.41      1537
        KCS        0.74      0.13      0.22       257
        NS         0.46      0.54      0.49      1525
        UP         0.41      0.53      0.46      1650

    accuracy                           0.46      6820
   macro avg       0.54      0.45      0.45      6820
weighted avg       0.48      0.46      0.45      6820



In [43]:
# Score the fitted tree on the validate features
y_pred = clf.predict(X_validate_scaled)

# Build the out-of-sample report, then print it under its heading
validate_report = classification_report(y_validate, y_pred, zero_division=0)
print('Out-of-sample data model performance:')
print(validate_report)

Out-of-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.66      0.85      0.74       233
        BNSF       0.28      0.16      0.21       518
        CSX        0.34      0.30      0.32       700
        KCS        0.11      0.02      0.03       111
        NS         0.37      0.43      0.40       645
        UP         0.35      0.45      0.39       717

    accuracy                           0.38      2924
   macro avg       0.35      0.37      0.35      2924
weighted avg       0.35      0.38      0.36      2924



### Takeaway
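- My dt model performed reasonably well on train: 46% overall accuracy (the UP f1-score was also 0.46), though KCS recall was only 0.13
- On validate, the performance dropped to 38% accuracy (UP f1-score 0.39), so the tree is overfitting the training data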

### Random Forest Model

In [47]:
#Create the RF object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

#Fit the RF object to the training data
rf.fit(X_train_scaled, y_train)

#Predict on the training features (in-sample)
y_pred = rf.predict(X_train_scaled)

#Evaluate
#zero_division=0 matches the other reports: a class that is never predicted scores 0
print('In-sample data model performance:')
print(classification_report(y_train, y_pred, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.66      0.88      0.76       541
        BNSF       0.65      0.42      0.51      1310
        CSX        0.56      0.48      0.52      1537
        KCS        1.00      0.07      0.13       257
        NS         0.51      0.65      0.57      1525
        UP         0.51      0.62      0.56      1650

    accuracy                           0.55      6820
   macro avg       0.65      0.52      0.51      6820
weighted avg       0.58      0.55      0.54      6820



In [48]:
#Score the fitted forest on the validate features
y_pred = rf.predict(X_validate_scaled)

#Print the heading and the out-of-sample report, one per line
print('Out-of-sample data model performance:',
      classification_report(y_validate, y_pred, zero_division=0),
      sep='\n')

Out-of-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.62      0.88      0.73       233
        BNSF       0.36      0.23      0.28       518
        CSX        0.37      0.28      0.32       700
        KCS        0.00      0.00      0.00       111
        NS         0.39      0.51      0.44       645
        UP         0.35      0.43      0.38       717

    accuracy                           0.40      2924
   macro avg       0.35      0.39      0.36      2924
weighted avg       0.37      0.40      0.38      2924



### Takeaway
- RF performed well on train data (UP f1-score 0.56, overall accuracy 0.55), but dropped off on validate (UP f1-score 0.38, overall accuracy 0.40)

### KNN Model

In [58]:
#Create the KNN object: 5 neighbors, each vote weighted by inverse distance
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')

#Train it on the scaled training features
knn.fit(X_train_scaled, y_train)

#Actual-vs-predicted frames for train and validate; test holds the actual labels only
train = pd.DataFrame({'actual': y_train}).assign(predicted=knn.predict(X_train_scaled))
validate = pd.DataFrame({'actual': y_validate}).assign(predicted=knn.predict(X_validate_scaled))
test = pd.DataFrame({'actual': y_test})

#Build both reports, then print them separated by a rule
in_sample_report = classification_report(train.actual, train.predicted, zero_division=0)
out_of_sample_report = classification_report(validate.actual, validate.predicted, zero_division=0)

print('In-sample data model performance:')
print(in_sample_report)
print('----------------')
print('Out-of-sample data model performance:')
print(out_of_sample_report)

In-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.90      0.99      0.95       541
        BNSF       0.90      0.98      0.94      1310
        CSX        0.93      0.95      0.94      1537
        KCS        0.93      0.92      0.93       257
        NS         0.96      0.92      0.94      1525
        UP         0.99      0.90      0.94      1650

    accuracy                           0.94      6820
   macro avg       0.93      0.94      0.94      6820
weighted avg       0.94      0.94      0.94      6820

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

        ATK        0.55      0.75      0.63       233
        BNSF       0.25      0.27      0.26       518
        CSX        0.29      0.27      0.28       700
        KCS        0.09      0.05      0.07       111
        NS         0.34      0.34      0.34       645
        UP         0.30      0.29      0.30

### Takeaway
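- No matter the n_neighbors value, the training data performed exceptionally well and the validate data dropped dramatically. With `weights='distance'`, every training row is its own zero-distance neighbor, so the in-sample score is inflated and says little about how the model generalizes.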