# Retuned classification models, without the Resolution_Code column

### Based on features from Omnidian database 101, we compare K-Nearest Neighbors, Gradient Boosting, Random Forest, Bagging, and Logistic Regression.

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')

In [10]:
import pickle

Get Data

Note to self: handle those duplicates!

In [11]:
data101 = pd.read_csv('data/eda101.csv', index_col='Ticket_Id')
data101.head()

Unnamed: 0_level_0,Asset_Id,Resolution_Code,Root_Cause,Ticket_Creation_Reason,asset_type,latitude,longitude,tilt,azimuth,Ticket_Status,Ticket_Origin,Service_Type,Service_Partner,Ticket_Tags,Date_Ticket_Initially_Assigned,Date_Ticket_Closed,installed_by,installation_date
Ticket_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
23947,101112604,inspect system,root_cause_normal_wear_and_tear,communication offline,Residential Solar PV,38.944642,-121.248833,37.0,190.0,Closed,origin_omnidian_customer,Field Service,SunUp STS Service Team,"comment_from_another_agent,communication_statu...",2018-10-22 17:25:27,2018-11-29,Williams Lifetime Builders Inc. DBA Lifetime S...,2013-04-04 00:00:00
23947,101112604,inspect system,root_cause_normal_wear_and_tear,communication offline,Residential Solar PV,38.944642,-121.248833,37.0,190.0,Closed,origin_omnidian_customer,Field Service,SunUp STS Service Team,"comment_from_another_agent,communication_statu...",2018-10-22 17:25:27,2018-11-29,Williams Lifetime Builders Inc. DBA Lifetime S...,2013-04-04 00:00:00
27384,101113056,inspect system,root_cause_normal_wear_and_tear,system inspection,Residential Solar PV,34.101697,-118.146646,18.0,180.0,Closed,origin_homeowner,Field Service,IndaSpec Solar Service Team,"comment_from_another_agent,comment_from_end-us...",2018-12-20 06:49:02,2019-02-09,Green Tech Solutions Inc.,2014-04-08 00:00:00
27384,101113056,inspect system,root_cause_normal_wear_and_tear,system inspection,Residential Solar PV,34.101697,-118.146646,18.0,180.0,Closed,origin_homeowner,Field Service,IndaSpec Solar Service Team,"comment_from_another_agent,comment_from_end-us...",2018-12-20 06:49:02,2019-02-09,Green Tech Solutions Inc.,2014-04-08 00:00:00
27384,101113056,inspect system,root_cause_normal_wear_and_tear,system inspection,Residential Solar PV,34.101697,-118.146646,18.0,90.0,Closed,origin_homeowner,Field Service,IndaSpec Solar Service Team,"comment_from_another_agent,comment_from_end-us...",2018-12-20 06:49:02,2019-02-09,Green Tech Solutions Inc.,2014-04-08 00:00:00


Drop Resolution_Code, because that is highly correlated with Root_Cause

In [12]:
data101 = data101.drop('Resolution_Code', axis=1)

In [13]:
data101.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 861 entries, 23947 to 35379
Data columns (total 17 columns):
Asset_Id                          861 non-null int64
Root_Cause                        861 non-null object
Ticket_Creation_Reason            861 non-null object
asset_type                        861 non-null object
latitude                          861 non-null float64
longitude                         861 non-null float64
tilt                              861 non-null float64
azimuth                           861 non-null float64
Ticket_Status                     861 non-null object
Ticket_Origin                     861 non-null object
Service_Type                      861 non-null object
Service_Partner                   861 non-null object
Ticket_Tags                       861 non-null object
Date_Ticket_Initially_Assigned    861 non-null object
Date_Ticket_Closed                861 non-null object
installed_by                      861 non-null object
installation_date     

In [14]:
data101.asset_type.value_counts()

Residential Solar PV    861
Name: asset_type, dtype: int64

Assign target to Root_Cause and Train-Test-Split

Consider stratifying when train-test-split

In [15]:
X = data101.drop('Root_Cause', axis=1)
y = data101['Root_Cause']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

One hot encoding: since our data is largely categorical but our models require numeric inputs, we one hot encode it.


In [16]:
categoricals = ['Asset_Id', 'Ticket_Creation_Reason', 'asset_type', 
                'Ticket_Status', 'Ticket_Origin', 
                'Service_Partner', 'Ticket_Tags', 'installed_by']

In [17]:
data101.columns

Index(['Asset_Id', 'Root_Cause', 'Ticket_Creation_Reason', 'asset_type',
       'latitude', 'longitude', 'tilt', 'azimuth', 'Ticket_Status',
       'Ticket_Origin', 'Service_Type', 'Service_Partner', 'Ticket_Tags',
       'Date_Ticket_Initially_Assigned', 'Date_Ticket_Closed', 'installed_by',
       'installation_date'],
      dtype='object')

In [18]:
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
encoder.fit(X_train[categoricals])

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [19]:
X_train = pd.DataFrame(encoder.transform(X_train).toarray, index_col='Ticket_Id',
                         columns=encoder.get_feature_names())

  mask &= (ar1 != a)
  mask |= (ar1 == a)


ValueError: could not convert string to float: 'Closed'

In [27]:
X_train = pd.DataFrame(encoder.transform(X_train).toarray(), index_col='Ticket_Id',
                         columns=encoder.get_feature_names())
X_test = pd.DataFrame(encoder.transform(X_test).toarray(), index_col='Ticket_Id',
                        columns=encoder.get_feature_names())

  mask &= (ar1 != a)
  mask |= (ar1 == a)


ValueError: could not convert string to float: 'Closed'

Training our models.

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
filename = '101_knn.pkl'
pickle.dump(knn, open(filename, 'wb'))

In [None]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
filename = '101_lr.pkl'
pickle.dump(lr, open(filename, 'wb'))

In [None]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train, y_train)

In [None]:
filename = '101_gb.pkl'
pickle.dump(gb, open(filename, 'wb'))

In [None]:
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30)
dt.fit(X_train, y_train)

In [None]:
filename = '101_dt.pkl'
pickle.dump(dt, open(filename, 'wb'))

In [None]:
bg = BaggingClassifier(random_state=25565)
bg.fit(X_train, y_train)

In [None]:
filename = '101_bg.pkl'
pickle.dump(bg, open(filename, 'wb'))

In [None]:
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train);

In [None]:
filename = '101_rf.pkl'
pickle.dump(rf, open(filename, 'wb'))

Let's show our results

In [None]:
knn.score(X_test, y_test)

In [None]:
lr.score(X_test, y_test)

In [None]:
gb.score(X_test, y_test)

In [None]:
dt.score(X_test, y_test)

In [None]:
bg.score(X_test, y_test)

In [None]:
rf.score(X_test, y_test)

## Build a function to take in Ticket_Id and return root cause probabilities.

In [None]:
X_test

In [None]:
y_test

In [None]:
y_test.loc[y_test.index == 13043]

In [None]:
rf.predict_proba(X_test)

In [None]:
rf.predict(X_test)[0:5]

In [None]:
l_props = lr.predict_proba(X_test)[1]

In [None]:
zipp = list(zip(lr.classes_, l_props))

In [None]:
zipp

In [None]:
#want a dictionary that gives probability for each class, 
#prediction, and ground truth
def display_preds_truth(model, Ticket_Id, X_test, y_test):
    probs = model.predict_proba(X_test.loc[X_test.index == Ticket_Id])
    classes = model.classes_
    display = dict(zip(classes, probs))
    display['prediction'] = model.predict(X_test.loc[X_test.index == Ticket_Id])
    display['ground truth'] = y_test.loc[y_test.index == Ticket_Id]
    return display
        
    

In [None]:
lr.predict_proba(X_test[8541])

In [None]:
lr.predict_proba(X_test.loc[X_test.index == 8541])

In [None]:
display_preds_truth(lr, 8541, X_test, y_test)

In [None]:
#want a dictionary that gives probability for each class,
def display_probas(model, obs, X_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
    display['prediction'] = model.predict(X_test)[obs]
    return display

In [None]:
display_probas(lr, 1, X_test)

In [None]:
X_test.info()

In [None]:
y_test