## Importing Data

In [199]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn import tree
from sklearn import preprocessing

In [200]:
oil_spills=pd.read_csv('../data/clean_data.csv').iloc[:, 1:]
oil_spills.head(6)

Unnamed: 0,date,cause,source,location,substance,volume,year_quarter
0,1/28/2013,Operator Error,well,WAINWRIGHT,oil,1,1
1,11/10/2012,Equipment Failure,battery,BONNYVILLE,water,2,4
2,3/29/2007,Equipment Failure,pipeline,MEDICINE HAT,gas,2,1
3,3/21/2008,Equipment Failure,pipeline,MEDICINE HAT,gas,2,1
4,4/3/2008,Equipment Failure,pipeline,MEDICINE HAT,gas,2,2
5,11/2/2005,Operator Error,pipeline,MEDICINE HAT,oil,1,4


## Updating categorical data to numerical values

### 1. Record the mapping from categorical data to numerical values

In [201]:

from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()
categories = ['cause','source','location','substance']
for c in categories:
    number.fit(oil_spills[c])
    mapping = dict(zip(number.transform(number.classes_),number.classes_))
    mdf=pd.DataFrame.from_dict(mapping, orient='index',columns=[c])
    mdf.to_csv(output_file_path + c + ".csv")

### 2. update correspondingly in the data frame

In [202]:
oil_spills['cause']=number.fit_transform(oil_spills['cause'].astype('str'))
oil_spills['source']=number.fit_transform(oil_spills['source'].astype('str'))
oil_spills['location']=number.fit_transform(oil_spills['location'].astype('str'))
oil_spills['substance']=number.fit_transform(oil_spills['substance'].astype('str'))
oil_spills.head(6)

Unnamed: 0,date,cause,source,location,substance,volume,year_quarter
0,1/28/2013,1,2,8,1,1,1
1,11/10/2012,0,0,0,2,2,4
2,3/29/2007,0,1,4,0,2,1
3,3/21/2008,0,1,4,0,2,1
4,4/3/2008,0,1,4,0,2,2
5,11/2/2005,1,1,4,1,1,4


## Fit a decision tree model using sklearn 

In [203]:
# Create X (i.e., features) and y (i.e., target)
X = oil_spills.iloc[:,2:]
y = oil_spills.iloc[:,1]

#Split the training data (X,y) into two sets: train set and test set
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

In [204]:
# find the optimal hyperparameter by comparing the scores from cross validation.

depth_range = range(1,10)
train_cv = []
test_cv = []

for d in depth_range:
    model = DecisionTreeClassifier(max_depth=d)
    train_cv.append(np.mean(cross_val_score(model, X_train, y_train, cv=10)))
    test_cv.append(np.mean(cross_val_score(model, X_test, y_test, cv=10)))

max_cv = max(train_cv)
opt_d = train_cv.index(max_cv)
print("Optimal Training Accuracy: ", max_cv)
print("Optimal Training Accuracy has max depth of: ",opt_d)
plt.plot(depth_range, train_cv, label="train")
plt.plot(depth_range, test_cv, label="test")
plt.xlabel("depth values")
plt.ylabel("10-fold cross validation scores")
plt.legend()
plt.savefig("depth_compare.png")
plt.close()

Optimal Training Accuracy:  0.8323843799819587
Optimal Training Accuracy has max depth of:  4


In [205]:
# Set train model to have max_depth of 4
model = DecisionTreeClassifier(max_depth=4)
model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Export data to csv

### 1. Export mapping dictionaries to csv

In [192]:
#pd.DataFrame([cause_num_name_mapping, source_num_name_mapping, loc_num_name_mapping, sub_num_name_mapping])
cause=pd.DataFrame.from_dict(cause_num_name_mapping, orient='index',columns=['cause'])
source=pd.DataFrame.from_dict(source_num_name_mapping, orient='index',columns=['source'])

### 2. Importance of features

In [206]:
feat_importance = model.tree_.compute_feature_importances(normalize=False)
feature_df = pd.DataFrame([fea_importance], columns = feature_cols)
feature_df.to_csv(output_file_path + "feature_compare.csv")

In [207]:
feature_df

Unnamed: 0,source,location,substance,volume,year_quarter
0,0.032832,0.004671,0.000727,0.00248,0.000223


In [208]:
feature_cols

['source', 'location', 'substance', 'volume', 'year_quarter']

### 3. Export model accuracies to csv

In [209]:
cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))

In [210]:
print("We choose the max_depth of", opt_d, ", and the testing score is ", cv_score, " by cross validation")

We choose the max_depth of 4 , and the testing score is  0.831016859648912  by cross validation


### 4. Export the model

In [211]:
import pickle

In [212]:
output_file_path='../results/'
filename = output_file_path + 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))