In [29]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import argparse
import pickle

from sklearn import tree
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
state = 9

    # Get command line arguments
input_file_path = "../data/clean_data.csv"
output_file_path = "../results/"

    # 1. Data Importing
oil_spills = pd.read_csv(input_file_path).iloc[:, 1:]


    # 2. Categorical data mapping
    # 2.1 Record the mapping from categorical data to numerical values in dataframes, then export to CSV
number = LabelEncoder()
categories = ['cause','source','location','substance']
for c in categories:
    number.fit(oil_spills[c])
    mapping = dict(zip(number.transform(number.classes_),number.classes_))
    mdf=pd.DataFrame.from_dict(mapping, orient='index',columns=[c])
    mdf.to_csv(output_file_path + c + ".csv")

In [30]:
oil_spills['cause']=number.fit_transform(oil_spills['cause'].astype('str'))
oil_spills['source']=number.fit_transform(oil_spills['source'].astype('str'))
oil_spills['location']=number.fit_transform(oil_spills['location'].astype('str'))
oil_spills['substance']=number.fit_transform(oil_spills['substance'].astype('str'))

In [31]:
X = oil_spills.iloc[:,2:]
y = oil_spills.iloc[:,1]

In [32]:

oil_spills.head()

Unnamed: 0,date,cause,source,location,substance,volume,year_quarter
0,1/28/2013,1,2,8,1,1,1
1,11/10/2012,0,0,0,2,2,4
2,3/29/2007,0,1,4,0,2,1
3,3/21/2008,0,1,4,0,2,1
4,4/3/2008,0,1,4,0,2,2


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=state)
feature_cols=X.columns.values.tolist()

In [34]:
depth_range = range(1,7)
train_cv = []
test_cv = []
for d in depth_range:
    model = DecisionTreeClassifier(max_depth=d)
    train_cv.append(np.mean(cross_val_score(model, X_train, y_train, cv=10)))
    test_cv.append(np.mean(cross_val_score(model, X_test, y_test, cv=10)))
max_cv = max(train_cv)
opt_d = train_cv.index(max_cv)

In [35]:
    plt.plot(depth_range, train_cv, label="train")
    plt.plot(depth_range, test_cv, label="test")
    plt.xlabel("depth values")
    plt.ylabel("10-fold cross validation scores")
    plt.legend()
    plt.savefig(output_file_path+"depth_compare.png")
    plt.close()

In [36]:
    # 3.5 Set train model with the optimal max_depth
    model = DecisionTreeClassifier(max_depth=opt_d)
    model.fit(X_train,y_train)
    cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=10))
    print("We choose the max_depth of", opt_d, ", and testing score is ", cv_score, " by cross validation")


    # 4 Exporting
    # 4.1 Export the model to file
    filename = output_file_path + 'finalized_model.sav'
    pickle.dump(model, open(filename, 'wb'))

    # 4.2 Find the most important features, export the features comparison to CSV
    fea_importance = model.tree_.compute_feature_importances(normalize=False)
    feature_df = pd.DataFrame([fea_importance], columns = feature_cols)
    feature_df.to_csv(output_file_path + "feature_compare.csv")

We choose the max_depth of 4 , and the testing score is  0.831016859648912  by cross validation


In [52]:
d = {"The CV score for the model is": [cv_score]}
df = pd.DataFrame(d)
df.to_csv(output_file_path + "model_score.csv")