In [1]:
import pandas as pd
import numpy as np

import acquire
import prepare
import preprocessing

#Visualization Imports
import matplotlib.pyplot as plt
import seaborn as sns

#Sklearn Tools and Modules
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import sklearn.preprocessing

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Acquire the Data
df = acquire.get_equiprail()
df.shape

(22402, 146)

In [3]:
#Prepare the data
df = prepare.prep_equip_df(df)
df.shape

(10550, 37)

In [4]:
equip_df = df[['track_damage', 'total_damage', 'weather', 'equip_damage', 'train_speed', 'railroad_company']]
equip_df.info()           

<class 'pandas.core.frame.DataFrame'>
Index: 10550 entries, GC0512102 to 137065
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   track_damage      10550 non-null  int64 
 1   total_damage      10550 non-null  int64 
 2   weather           10550 non-null  int64 
 3   equip_damage      10550 non-null  int64 
 4   train_speed       10550 non-null  int64 
 5   railroad_company  10550 non-null  object
dtypes: int64(5), object(1)
memory usage: 577.0+ KB


In [5]:
#Split the Data
train, validate, X_train, y_train, X_validate, y_validate, X_test, y_test = preprocessing.train_validate_test(equip_df, 'railroad_company')

In [6]:
#X_train.shape

In [7]:
#y_train.shape

In [8]:
object_cols = preprocessing.get_object_cols(equip_df)
numeric_cols = preprocessing.get_numeric_X_cols(X_train, object_cols)

In [9]:
#X_train_scaled, X_validate_scaled, X_test_scaled = preprocessing.min_max_scale(X_train, X_validate, X_test, numeric_cols)

In [10]:
def min_max_scale(X_train, X_validate, X_test, numeric_cols):
    '''
    this function takes in 3 dataframes with the same columns, 
    a list of numeric column names (because the scaler can only work with numeric columns),
    and fits a min-max scaler to the first dataframe and transforms all
    3 dataframes using that scaler. 
    it returns 3 dataframes with the same column names and scaled values. 
    '''
    # create the scaler object and fit it to X_train (i.e. identify min and max)
    # if copy = false, inplace row normalization happens and avoids a copy (if the input is already a numpy array).

    scaler = sklearn.preprocessing.MinMaxScaler()
    scaler.fit(X_train[numeric_cols])

    #scale X_train, X_validate, X_test using the mins and maxes stored in the scaler derived from X_train. 
    # 
    X_train_scaled_array = scaler.transform(X_train[numeric_cols])
    X_validate_scaled_array = scaler.transform(X_validate[numeric_cols])
    X_test_scaled_array = scaler.transform(X_test[numeric_cols])

    # convert arrays to dataframes
    X_train_scaled = pd.DataFrame(X_train_scaled_array, 
                                  columns=numeric_cols).\
                                  set_index([X_train.index.values])

    X_validate_scaled = pd.DataFrame(X_validate_scaled_array, 
                                     columns=numeric_cols).\
                                     set_index([X_validate.index.values])

    X_test_scaled = pd.DataFrame(X_test_scaled_array, 
                                 columns=numeric_cols).\
                                 set_index([X_test.index.values])

    
    return X_train_scaled, X_validate_scaled, X_test_scaled

In [11]:
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

In [12]:
X_train_scaled

Unnamed: 0,track_damage,total_damage,weather,equip_damage,train_speed
124392,0.002771,0.001453,0.0,0.000071,0.009174
0416HO015,0.000160,0.001685,0.0,0.001646,0.014679
RD0418125,0.000941,0.003499,0.0,0.003099,0.016514
CA0413100,0.001515,0.000754,0.0,0.000000,0.091743
0512PR005,0.000992,0.001399,0.2,0.000926,0.007339
...,...,...,...,...,...
0120HL013,0.000008,0.001170,0.0,0.001197,0.075229
111711,0.012628,0.027468,0.2,0.021600,0.062385
SF0712112,0.004625,0.002512,0.0,0.000203,0.001835
NE0312100,0.000402,0.000651,0.0,0.000465,0.014679


---

## Modeling

**Baseline Model**

In [13]:
df.railroad_company.value_counts()

UP      4090
BNSF    2833
NS      1577
CSX     1352
ATK      698
Name: railroad_company, dtype: int64

In [14]:
print(f'Baseline Accuracy: {round(max(df.railroad_company.value_counts()) / df.shape[0] *100)}%')

Baseline Accuracy: 39%


**Logistic Regression Model**

In [15]:
#Using the scaled data, make a logistic regression model
lm = LogisticRegression()

#Fit the lm object to the vectorized data
lm.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the lm model
train['predicted'] = lm.predict(X_train_scaled)
validate["predicted"] = lm.predict(X_validate_scaled)
test['predicted'] = lm.predict(X_test_scaled)

#Review how the lm model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted, zero_division=0))
print('----------------')
#Review how the lm model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.95      0.05      0.10       377
        BNSF       0.50      0.01      0.01      1578
         CSX       0.00      0.00      0.00       735
          NS       0.00      0.00      0.00       910
          UP       0.39      1.00      0.56      2308

    accuracy                           0.39      5908
   macro avg       0.37      0.21      0.13      5908
weighted avg       0.35      0.39      0.23      5908

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       1.00      0.08      0.15       177
        BNSF       0.75      0.01      0.02       680
         CSX       0.00      0.00      0.00       332
          NS       0.00      0.00      0.00       360
          UP       0.39      1.00      0.56       983

    accuracy                           0.40      2532
   macro avg       0.43      0.22      0.1

**Takeaways:**
- The LM model does not improve upon the baseline accuracy. 39% on validate.
- What does this mean for the features we selected?

---

**KNN**

In [16]:
#Create the KNN object with a k = 7
knn = KNeighborsClassifier(n_neighbors=6, weights='distance')

#Fit the object to the scaled training data
knn.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the knn model
train['predicted'] = knn.predict(X_train_scaled)
validate["predicted"] = knn.predict(X_validate_scaled)

#Review how the knn model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted,  zero_division=0))
print('----------------')
#Review how the knn model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       1.00      1.00      1.00       377
        BNSF       1.00      1.00      1.00      1578
         CSX       1.00      1.00      1.00       735
          NS       1.00      1.00      1.00       910
          UP       1.00      1.00      1.00      2308

    accuracy                           1.00      5908
   macro avg       1.00      1.00      1.00      5908
weighted avg       1.00      1.00      1.00      5908

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.62      0.51      0.56       177
        BNSF       0.38      0.36      0.37       680
         CSX       0.22      0.16      0.18       332
          NS       0.21      0.19      0.20       360
          UP       0.46      0.55      0.50       983

    accuracy                           0.39      2532
   macro avg       0.38      0.35      0.3

**Takeaways:**
   - Using a K=6 returns the best accuracy score for predicting railroad company. 34% on accuracy.
   - Using weights=distance parameter results in overfitting on the train dataset but retains a similar accuracy score on the validate data as the default weights hyperparamter of 'uniform'

---

**Random Forest**

In [17]:
#Create the RF object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

#Fit the RF object to the training data
rf.fit(X_train_scaled, y_train)

#Predict on y
y_pred = rf.predict(X_train_scaled)

#Evaluate
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.94      0.52      0.67       377
        BNSF       0.62      0.61      0.61      1578
         CSX       0.72      0.27      0.39       735
          NS       0.80      0.18      0.30       910
          UP       0.57      0.91      0.70      2308

    accuracy                           0.61      5908
   macro avg       0.73      0.50      0.53      5908
weighted avg       0.66      0.61      0.58      5908



In [18]:
#Predict using out of sample data
y_pred = rf.predict(X_validate_scaled)
print('----------------')
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

----------------
Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.92      0.56      0.70       177
        BNSF       0.42      0.38      0.40       680
         CSX       0.35      0.09      0.14       332
          NS       0.33      0.04      0.08       360
          UP       0.47      0.81      0.59       983

    accuracy                           0.47      2532
   macro avg       0.50      0.38      0.38      2532
weighted avg       0.45      0.47      0.42      2532



**Takeaways:**
   - Random Forest achieves the highest accuracy score thus far. 42% on validate

---

**Decision Tree Modeling**

In [19]:
#Make the CLF object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=9, random_state=123)

#Fit the model on the training set 
clf = clf.fit(X_train_scaled, y_train)

#Make predictions
y_pred = clf.predict(X_train_scaled)

#Evaluate model performance on training data
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.88      0.55      0.68       377
        BNSF       0.46      0.53      0.50      1578
         CSX       0.42      0.31      0.35       735
          NS       0.51      0.16      0.24       910
          UP       0.55      0.72      0.62      2308

    accuracy                           0.52      5908
   macro avg       0.57      0.45      0.48      5908
weighted avg       0.53      0.52      0.50      5908



In [20]:
# Predict on validate
y_pred = clf.predict(X_validate_scaled)

# Evaluate model performance on out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

Out-of-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.81      0.54      0.64       177
        BNSF       0.38      0.43      0.41       680
         CSX       0.30      0.20      0.24       332
          NS       0.31      0.09      0.14       360
          UP       0.48      0.64      0.55       983

    accuracy                           0.44      2532
   macro avg       0.45      0.38      0.39      2532
weighted avg       0.43      0.44      0.42      2532



**Takeaways:**
   - Random Forest algorithm achieves a 38% accuracy on validate data set
   - The Decision Tree algoithm also performs best at being able to predict the individual railroad companies and this is evident from the precision scores. 

---

**Evaluate on Test**

In [21]:
#Predict on y
y_pred = rf.predict(X_test_scaled)

#Evaluate
print('In-sample data model performance:')
print(classification_report(y_test, y_pred))

In-sample data model performance:
              precision    recall  f1-score   support

         ATK       0.90      0.50      0.64       144
        BNSF       0.45      0.41      0.43       575
         CSX       0.33      0.07      0.12       285
          NS       0.35      0.07      0.12       307
          UP       0.46      0.79      0.58       799

    accuracy                           0.46      2110
   macro avg       0.50      0.37      0.38      2110
weighted avg       0.45      0.46      0.41      2110



In [22]:
equip_rail_class_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T
equip_rail_class_report

Unnamed: 0,precision,recall,f1-score,support
ATK,0.9,0.5,0.642857,144.0
BNSF,0.448669,0.410435,0.428701,575.0
CSX,0.328125,0.073684,0.120344,285.0
NS,0.349206,0.071661,0.118919,307.0
UP,0.457516,0.788486,0.579044,799.0
accuracy,0.464929,0.464929,0.464929,0.464929
macro avg,0.496703,0.368853,0.377973,2110.0
weighted avg,0.452067,0.464929,0.413525,2110.0


In [31]:
equip_rail_class_report.to_excel('equip_rail_model_report.xlsx')

----

----

# Modeling Equipment Rail with Different Features

In [24]:
df.columns.to_list()

['state_fips',
 'railroad_company',
 'accident_type',
 'state_x',
 'temp',
 'visibility',
 'weather',
 'train_speed',
 'train_direction',
 'train_weight',
 'train_type',
 'track_type',
 'front_engines',
 'loadfrght_cars',
 'loadpass_cars',
 'emptyfrght_cars',
 'emptypass_cars',
 'equip_damage',
 'track_damage',
 'cause',
 'total_killed',
 'total_injured',
 'max_speed',
 'total_damage',
 'engineers_onduty',
 'conductors_onduty',
 'brakemen_onduty',
 'region',
 'typrr',
 'lat',
 'long',
 'signal_type',
 'date',
 'season',
 'state_y',
 'state',
 'year']

In [25]:
equip_df = df[['total_killed','total_injured','max_speed','railroad_company', 'track_damage', 'total_damage', 'equip_damage', ]]

In [26]:
object_cols = preprocessing.get_object_cols(equip_df)
numeric_cols = preprocessing.get_numeric_X_cols(X_train, object_cols)

In [27]:
#Split the Data
train, validate, X_train, y_train, X_validate, y_validate, X_test, y_test = preprocessing.train_validate_test(equip_df, 'railroad_company')

In [28]:
train.head(1)

Unnamed: 0_level_0,total_killed,total_injured,max_speed,railroad_company,track_damage,total_damage,equip_damage
incdtno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
124392,0,0,5,NS,24135,25335,1200


In [29]:
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

KeyError: "['train_speed', 'weather'] not in index"

In [None]:
X_train_scaled.head(1)

---

**Logistic Regression**

In [None]:
#Using the scaled data, make a logistic regression model
lm = LogisticRegression()

#Fit the lm object to the vectorized data
lm.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the lm model
train['predicted'] = lm.predict(X_train_scaled)
validate["predicted"] = lm.predict(X_validate_scaled)
test['predicted'] = lm.predict(X_test_scaled)

#Review how the lm model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted, zero_division=0))
print('----------------')
#Review how the lm model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

**Takeaways:**

---

**KNN**

In [None]:
#Create the KNN object with a k = 7
knn = KNeighborsClassifier(n_neighbors=15, weights='distance')

#Fit the object to the scaled training data
knn.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the knn model
train['predicted'] = knn.predict(X_train_scaled)
validate["predicted"] = knn.predict(X_validate_scaled)

#Review how the knn model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted,  zero_division=0))
print('----------------')
#Review how the knn model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

**Takeaways:**

---

**Random Forest**

In [None]:
#Create the RF object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=12, 
                            random_state=123)

#Fit the RF object to the training data
rf.fit(X_train_scaled, y_train)

#Predict on y
y_pred = rf.predict(X_train_scaled)

#Evaluate
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In [None]:
#Predict using out of sample data
y_pred = rf.predict(X_validate_scaled)
print('----------------')
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

**Takeaways:**

---

**Decision Tree Modeling**

In [None]:
#Make the CLF object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=8, random_state=123)

#Fit the model on the training set 
clf = clf.fit(X_train_scaled, y_train)

#Make predictions
y_pred = clf.predict(X_train_scaled)

#Evaluate model performance on training data
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In [None]:
# Predict on validate
y_pred = clf.predict(X_validate_scaled)

# Evaluate model performance on out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

---

# Modeling After Removing Outliers in Total Damages

In [None]:
#Aquire the data
df = acquire.get_equiprail()
df.shape

In [None]:
#Prepare the data
df = prepare.prep_equip_df(df)
df.shape

In [None]:
#Remove Outliers from the dataframe
df = df[df.total_damage < 234898.75]
df.shape

In [None]:
#Subset the Data Based on Features Selected
equip_df = df[['total_damage', 'railroad_company']]
equip_df.info() 

In [None]:
#Split the Data
train, validate, X_train, y_train, X_validate, y_validate, X_test, y_test = preprocessing.train_validate_test(equip_df, 'railroad_company')

In [None]:
#Define Object and Numeric Columns
object_cols = preprocessing.get_object_cols(equip_df)
numeric_cols = preprocessing.get_numeric_X_cols(X_train, object_cols)

In [None]:
#Scale the Data
X_train_scaled, X_validate_scaled, X_test_scaled = min_max_scale(X_train, X_validate, X_test, numeric_cols)

---

**Baseline Accuracy**

In [None]:
print(f'Baseline Accuracy: {round(max(df.railroad_company.value_counts()) / df.shape[0] *100)}%')

**Logistic Regression**

In [None]:
#Using the scaled data, make a logistic regression model
lm = LogisticRegression()

#Fit the lm object to the vectorized data
lm.fit(X_train_scaled, y_train)

#Create corresponding dataframes 
train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

#Form predictions uisng the lm model
train['predicted'] = lm.predict(X_train_scaled)
validate["predicted"] = lm.predict(X_validate_scaled)
test['predicted'] = lm.predict(X_test_scaled)

#Review how the lm model performed on the in-sample data
print('In-sample data model performance:')
print(classification_report(train.actual, train.predicted, zero_division=0))
print('----------------')
#Review how the lm model performed on the out-of-sample data
print('Out-of-sample data model performance:')
print(classification_report(validate.actual, validate.predicted, zero_division=0))

**Random Forest**

In [None]:
#Create the RF object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=10, 
                            random_state=123)

#Fit the RF object to the training data
rf.fit(X_train_scaled, y_train)

#Predict on y
y_pred = rf.predict(X_train_scaled)

#Evaluate
print('In-sample data model performance:')
print(classification_report(y_train, y_pred))

In [None]:
#Predict using out of sample data
y_pred = rf.predict(X_validate_scaled)
print('----------------')
print('Out-of-sample data model performance:')
print(classification_report(y_validate, y_pred, zero_division=0))

**Takeaways:**
   - Even after removing outliers, the classification models do not make significant imporvements in predicting Railroad involved in an equipment accident.