In [1]:
import pandas as pd
import numpy as np

import acquire
import prepare
import preprocessing

#Visualization Imports
import matplotlib.pyplot as plt
import seaborn as sns

#Sklearn Tools and Modules
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import sklearn.preprocessing

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Acquire the Data
df = acquire.get_equiprail()
df.shape

(22402, 146)

In [3]:
#Prepare the data
df = prepare.prep_equip_df(df)
df.shape

(11504, 35)

In [4]:
equip_df = df[['track_damage', 'total_damage', 'weather', 'equip_damage', 'railroad_company']]
equip_df.info()           

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11504 entries, 0 to 11503
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   track_damage      11504 non-null  int64 
 1   total_damage      11504 non-null  int64 
 2   weather           11504 non-null  int64 
 3   equip_damage      11504 non-null  int64 
 4   railroad_company  11504 non-null  object
dtypes: int64(4), object(1)
memory usage: 539.2+ KB


In [5]:
#Split the Data
train, validate, X_train, y_train, X_validate, y_validate, X_test, y_test = preprocessing.train_validate_test(equip_df, 'railroad_company')

In [6]:
#X_train.shape

In [7]:
#y_train.shape

In [8]:
object_cols = preprocessing.get_object_cols(equip_df)
numeric_cols = preprocessing.get_numeric_X_cols(X_train, object_cols)

In [9]:
#X_train_scaled, X_validate_scaled, X_test_scaled = preprocessing.min_max_scale(X_train, X_validate, X_test, numeric_cols)

In [10]:
def min_max_scale(X_train, X_validate, X_test, numeric_cols):
    '''
    this function takes in 3 dataframes with the same columns, 
    a list of numeric column names (because the scaler can only work with numeric columns),
    and fits a min-max scaler to the first dataframe and transforms all
    3 dataframes using that scaler. 
    it returns 3 dataframes with the same column names and scaled values. 
    '''
    # create the scaler object and fit it to X_train (i.e. identify min and max)
    # if copy = false, inplace row normalization happens and avoids a copy (if the input is already a numpy array).

    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(X_train[numeric_cols])

    #scale X_train, X_validate, X_test using the mins and maxes stored in the scaler derived from X_train. 
    # 
    X_train_scaled_array = scaler.transform(X_train[numeric_cols])
    X_validate_scaled_array = scaler.transform(X_validate[numeric_cols])
    X_test_scaled_array = scaler.transform(X_test[numeric_cols])

    # convert arrays to dataframes
    X_train_scaled = pd.DataFrame(X_train_scaled_array, 
                                  columns=numeric_cols).\
                                  set_index([X_train.index.values])

    X_validate_scaled = pd.DataFrame(X_validate_scaled_array, 
                                     columns=numeric_cols).\
                                     set_index([X_validate.index.values])

    X_test_scaled = pd.DataFrame(X_test_scaled_array, 
                                 columns=numeric_cols).\
                                 set_index([X_test.index.values])

    
    return X_train_scaled, X_validate_scaled, X_test_scaled

In [11]:
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

In [12]:
X_train_scaled

Unnamed: 0,track_damage,total_damage,weather,equip_damage
8116,0.003655,0.000850,0.0,0.000004
2539,0.000005,0.002467,0.0,0.002511
10506,0.000033,0.000571,0.2,0.000576
6836,0.013576,0.071481,0.0,0.069471
7873,0.000033,0.001157,0.2,0.001173
...,...,...,...,...
4466,0.001582,0.000807,0.2,0.000451
5250,0.002074,0.000910,0.0,0.000439
9997,0.005115,0.006726,0.2,0.005633
10320,0.000000,0.000887,0.2,0.000905


---

## Modeling

**Baseline Model**

In [13]:
df.railroad_company.value_counts()

UP      4345
BNSF    3160
NS      1774
CSX     1482
ATK      743
Name: railroad_company, dtype: int64

In [14]:
print(f'Baseline Accuracy: {round(max(df.railroad_company.value_counts()) / df.shape[0] *100)}%')

Baseline Accuracy: 38%


**Logistic Regression Model**

In [15]:
#Using the scaled data, make a logistic regression model
lm = LogisticRegression()

#Fit the lm object to the vectorized data
lm.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the lm model
train['predicted'] = lm.predict(X_train_scaled)
validate["predicted"] = lm.predict(X_validate_scaled)
test['predicted'] = lm.predict(X_test_scaled)

#Review how the lm model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted, zero_division=0))
print('----------------')
#Review how the lm model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.00      0.00      0.00       436
        BNSF       0.66      0.04      0.07      1769
         CSX       0.00      0.00      0.00       851
          NS       0.00      0.00      0.00       975
          UP       0.38      0.99      0.55      2411

    accuracy                           0.38      6442
   macro avg       0.21      0.21      0.12      6442
weighted avg       0.32      0.38      0.22      6442

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.00      0.00      0.00       181
        BNSF       0.47      0.02      0.05       748
         CSX       0.00      0.00      0.00       339
          NS       0.00      0.00      0.00       432
          UP       0.38      0.99      0.55      1061

    accuracy                           0.39      2761
   macro avg       0.17      0.20      0.1

**Takeaways:**
- The LM model does not improve upon the baseline accuracy. 39% on validate.
- What does this mean for the features we selected?

---

**KNN**

In [16]:
#Create the KNN object with a k = 7
knn = KNeighborsClassifier(n_neighbors=6, weights='distance')

#Fit the object to the scaled training data
knn.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the knn model
train['predicted'] = knn.predict(X_train_scaled)
validate["predicted"] = knn.predict(X_validate_scaled)

#Review how the knn model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted,  zero_division=0))
print('----------------')
#Review how the knn model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.99      0.99      0.99       436
        BNSF       0.97      1.00      0.99      1769
         CSX       1.00      0.99      1.00       851
          NS       1.00      0.97      0.99       975
          UP       1.00      0.99      1.00      2411

    accuracy                           0.99      6442
   macro avg       0.99      0.99      0.99      6442
weighted avg       0.99      0.99      0.99      6442

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.19      0.14      0.16       181
        BNSF       0.35      0.38      0.36       748
         CSX       0.20      0.15      0.17       339
          NS       0.17      0.13      0.15       432
          UP       0.45      0.51      0.48      1061

    accuracy                           0.35      2761
   macro avg       0.27      0.26      0.2

**Takeaways:**
   - Using a K=6 returns the best accuracy score for predicting railroad company. 34% on accuracy.
   - Using weights=distance parameter results in overfitting on the train dataset but retains a similar accuracy score on the validate data as the default weights hyperparamter of 'uniform'

---

**Random Forest**

In [17]:
#Create the RF object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

#Fit the RF object to the training data
rf.fit(X_train_scaled, y_train)

#Predict on y
y_pred = rf.predict(X_train_scaled)

#Evaluate
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.69      0.37      0.48       436
        BNSF       0.60      0.66      0.63      1769
         CSX       0.76      0.32      0.45       851
          NS       0.84      0.19      0.31       975
          UP       0.58      0.88      0.70      2411

    accuracy                           0.61      6442
   macro avg       0.69      0.48      0.51      6442
weighted avg       0.66      0.61      0.57      6442



In [18]:
#Predict using out of sample data
y_pred = rf.predict(X_validate_scaled)
print('----------------')
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.31      0.14      0.19       181
        BNSF       0.36      0.39      0.38       748
         CSX       0.36      0.13      0.19       339
          NS       0.24      0.04      0.07       432
          UP       0.44      0.69      0.54      1061

    accuracy                           0.41      2761
   macro avg       0.34      0.28      0.27      2761
weighted avg       0.37      0.41      0.36      2761



**Takeaways:**
   - Random Forest achieves the highest accuracy score thus far. 42% on validate

---

**Decision Tree Modeling**

In [19]:
#Make the CLF object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=9, random_state=123)

#Fit the model on the training set 
clf = clf.fit(X_train_scaled, y_train)

#Make predictions
y_pred = clf.predict(X_train_scaled)

#Evaluate model performance on training data
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.59      0.20      0.30       436
        BNSF       0.46      0.51      0.48      1769
         CSX       0.56      0.30      0.39       851
          NS       0.71      0.14      0.23       975
          UP       0.51      0.77      0.61      2411

    accuracy                           0.50      6442
   macro avg       0.57      0.38      0.40      6442
weighted avg       0.54      0.50      0.47      6442



In [20]:
# Predict on validate
y_pred = clf.predict(X_validate_scaled)

# Evaluate model performance on out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.30      0.09      0.14       181
        BNSF       0.36      0.39      0.37       748
         CSX       0.30      0.14      0.19       339
          NS       0.18      0.03      0.06       432
          UP       0.44      0.69      0.54      1061

    accuracy                           0.40      2761
   macro avg       0.32      0.27      0.26      2761
weighted avg       0.35      0.40      0.35      2761



**Takeaways:**
   - Random Forest algorithm achieves a 38% accuracy on validate data set
   - The Decision Tree algoithm also performs best at being able to predict the individual railroad companies and this is evident from the precision scores. 

---

**Evaluate on Test**

In [21]:
#Predict on y
y_pred = rf.predict(X_test_scaled)

#Evaluate
print('In-sample data model performance:')
print(classification_report(y_test, y_pred))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.27      0.14      0.19       126
        BNSF       0.39      0.41      0.40       643
         CSX       0.35      0.15      0.21       292
          NS       0.36      0.08      0.13       367
          UP       0.45      0.71      0.55       873

    accuracy                           0.42      2301
   macro avg       0.37      0.30      0.29      2301
weighted avg       0.40      0.42      0.38      2301



In [22]:
equip_rail_class_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T
equip_rail_class_report

Unnamed: 0,precision,recall,f1-score,support
ATK,0.272727,0.142857,0.1875,126.0
BNSF,0.388393,0.40591,0.396958,643.0
CSX,0.352459,0.14726,0.207729,292.0
NS,0.358974,0.076294,0.125843,367.0
UP,0.453412,0.707904,0.552773,873.0
accuracy,0.420687,0.420687,0.420687,0.420687
macro avg,0.365193,0.296045,0.294161,2301.0
weighted avg,0.397475,0.420687,0.377349,2301.0


In [30]:
equip_rail_class_report.to_csv('equip_rail_class_report.csv')

----

----

# Modeling Equipment Rail with Different Features

In [23]:
df.columns.to_list()

['railroad_company',
 'accident_type',
 'state_fips',
 'temp',
 'visibility',
 'weather',
 'train_speed',
 'train_direction',
 'train_weight',
 'train_type',
 'track_type',
 'front_engines',
 'loadfrght_cars',
 'loadpass_cars',
 'emptyfrght_cars',
 'emptypass_cars',
 'equip_damage',
 'track_damage',
 'cause',
 'total_killed',
 'total_injured',
 'max_speed',
 'total_damage',
 'engineers_onduty',
 'conductors_onduty',
 'brakemen_onduty',
 'region',
 'typrr',
 'lat',
 'long',
 'signal_type',
 'date',
 'season',
 'state',
 'year']

In [24]:
equip_df = df[['total_killed','total_injured','max_speed','railroad_company', 'track_damage', 'total_damage', 'equip_damage', ]]

In [25]:
object_cols = preprocessing.get_object_cols(equip_df)
numeric_cols = preprocessing.get_numeric_X_cols(X_train, object_cols)

In [26]:
#Split the Data
train, validate, X_train, y_train, X_validate, y_validate, X_test, y_test = preprocessing.train_validate_test(equip_df, 'railroad_company')

In [27]:
train.head(1)

Unnamed: 0,total_killed,total_injured,max_speed,railroad_company,track_damage,total_damage,equip_damage
8116,0,0,2,BNSF,21949,22049,100


In [28]:
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

KeyError: "['weather'] not in index"

In [None]:
X_train_scaled.head(1)

---

**Logistic Regression**

In [None]:
#Using the scaled data, make a logistic regression model
lm = LogisticRegression()

#Fit the lm object to the vectorized data
lm.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the lm model
train['predicted'] = lm.predict(X_train_scaled)
validate["predicted"] = lm.predict(X_validate_scaled)
test['predicted'] = lm.predict(X_test_scaled)

#Review how the lm model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted, zero_division=0))
print('----------------')
#Review how the lm model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

**Takeaways:**

---

**KNN**

In [None]:
#Create the KNN object with a k = 7
knn = KNeighborsClassifier(n_neighbors=15, weights='distance')

#Fit the object to the scaled training data
knn.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the knn model
train['predicted'] = knn.predict(X_train_scaled)
validate["predicted"] = knn.predict(X_validate_scaled)

#Review how the knn model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted,  zero_division=0))
print('----------------')
#Review how the knn model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

**Takeaways:**

---

**Random Forest**

In [None]:
#Create the RF object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=12, 
                            random_state=123)

#Fit the RF object to the training data
rf.fit(X_train_scaled, y_train)

#Predict on y
y_pred = rf.predict(X_train_scaled)

#Evaluate
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In [None]:
#Predict using out of sample data
y_pred = rf.predict(X_validate_scaled)
print('----------------')
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

**Takeaways:**

---

**Decision Tree Modeling**

In [None]:
#Make the CLF object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=8, random_state=123)

#Fit the model on the training set 
clf = clf.fit(X_train_scaled, y_train)

#Make predictions
y_pred = clf.predict(X_train_scaled)

#Evaluate model performance on training data
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In [None]:
# Predict on validate
y_pred = clf.predict(X_validate_scaled)

# Evaluate model performance on out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

---

# Modeling After Removing Outliers in Total Damages

In [43]:
#Aquire the data
df = acquire.get_equiprail()
df.shape

(22402, 146)

In [44]:
#Prepare the data
df = prepare.prep_equip_df(df)
df.shape

(11504, 35)

In [45]:
#Remove Outliers from the dataframe
df = df[df.total_damage < 234898.75]
df.shape

(10100, 35)

In [46]:
#Subset the Data Based on Features Selected
equip_df = df[['total_damage', 'railroad_company']]
equip_df.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10100 entries, 0 to 11503
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   total_damage      10100 non-null  int64 
 1   railroad_company  10100 non-null  object
dtypes: int64(1), object(1)
memory usage: 236.7+ KB


In [47]:
#Split the Data
train, validate, X_train, y_train, X_validate, y_validate, X_test, y_test = preprocessing.train_validate_test(equip_df, 'railroad_company')

In [48]:
#Define Object and Numeric Columns
object_cols = preprocessing.get_object_cols(equip_df)
numeric_cols = preprocessing.get_numeric_X_cols(X_train, object_cols)

In [49]:
#Scale the Data
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

---

**Baseline Accuracy**

In [50]:
print(f'Baseline Accuracy: {round(max(df.railroad_company.value_counts()) / df.shape[0] *100)}%')

Baseline Accuracy: 37%


**Logistic Regression**

In [51]:
#Using the scaled data, make a logistic regression model
lm = LogisticRegression()

#Fit the lm object to the vectorized data
lm.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the lm model
train['predicted'] = lm.predict(X_train_scaled)
validate["predicted"] = lm.predict(X_validate_scaled)
test['predicted'] = lm.predict(X_test_scaled)

#Review how the lm model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted, zero_division=0))
print('----------------')
#Review how the lm model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.00      0.00      0.00       402
        BNSF       0.00      0.00      0.00      1547
         CSX       0.00      0.00      0.00       737
          NS       0.00      0.00      0.00       903
          UP       0.37      1.00      0.54      2067

    accuracy                           0.37      5656
   macro avg       0.07      0.20      0.11      5656
weighted avg       0.13      0.37      0.20      5656

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.00      0.00      0.00       162
        BNSF       0.00      0.00      0.00       592
         CSX       0.00      0.00      0.00       325
          NS       0.00      0.00      0.00       394
          UP       0.39      1.00      0.56       951

    accuracy                           0.39      2424
   macro avg       0.08      0.20      0.1

**Random Forest**

In [52]:
#Create the RF object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

#Fit the RF object to the training data
rf.fit(X_train_scaled, y_train)

#Predict on y
y_pred = rf.predict(X_train_scaled)

#Evaluate
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.00      0.00      0.00       402
        BNSF       0.56      0.37      0.45      1547
         CSX       0.86      0.02      0.03       737
          NS       0.80      0.06      0.11       903
          UP       0.43      0.94      0.59      2067

    accuracy                           0.46      5656
   macro avg       0.53      0.28      0.24      5656
weighted avg       0.55      0.46      0.36      5656



In [53]:
#Predict using out of sample data
y_pred = rf.predict(X_validate_scaled)
print('----------------')
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.00      0.00      0.00       162
        BNSF       0.32      0.23      0.27       592
         CSX       0.09      0.00      0.01       325
          NS       0.32      0.02      0.03       394
          UP       0.41      0.84      0.55       951

    accuracy                           0.39      2424
   macro avg       0.23      0.22      0.17      2424
weighted avg       0.30      0.39      0.29      2424



**Takeaways:**
   - Even after removing outliers, the classification models do not make significant imporvements in predicting Railroad involved in an equipment accident.