# Inaugural classification models.

### Based on features from Omnidian database 101, we compare K-Nearest Neighbors, Gradient Boosting, Random Forest, Bagging, and Logistic Regression.

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')



In [2]:
from omnotes import get_preds, score_model, human_readify

In [3]:
import pickle

Get Data

In [4]:
data101 = pd.read_csv('data/eda101.csv')
data101.head()

Unnamed: 0,Ticket_Id,Asset_Id,Resolution_Code,Root_Cause,Ticket_Creation_Reason,asset_type,latitude,longitude,tilt,azimuth,Ticket_Status,Ticket_Origin,Service_Type,Service_Partner,Ticket_Tags,Date_Ticket_Initially_Assigned,Date_Ticket_Closed,installed_by,installation_date
0,23947,101112604,inspect system,root_cause_normal_wear_and_tear,communication offline,Residential Solar PV,38.944642,-121.248833,37.0,190.0,Closed,origin_omnidian_customer,Field Service,SunUp STS Service Team,"comment_from_another_agent,communication_statu...",2018-10-22 17:25:27,2018-11-29,Williams Lifetime Builders Inc. DBA Lifetime S...,2013-04-04 00:00:00
1,23947,101112604,inspect system,root_cause_normal_wear_and_tear,communication offline,Residential Solar PV,38.944642,-121.248833,37.0,190.0,Closed,origin_omnidian_customer,Field Service,SunUp STS Service Team,"comment_from_another_agent,communication_statu...",2018-10-22 17:25:27,2018-11-29,Williams Lifetime Builders Inc. DBA Lifetime S...,2013-04-04 00:00:00
2,27384,101113056,inspect system,root_cause_normal_wear_and_tear,system inspection,Residential Solar PV,34.101697,-118.146646,18.0,180.0,Closed,origin_homeowner,Field Service,IndaSpec Solar Service Team,"comment_from_another_agent,comment_from_end-us...",2018-12-20 06:49:02,2019-02-09,Green Tech Solutions Inc.,2014-04-08 00:00:00
3,27384,101113056,inspect system,root_cause_normal_wear_and_tear,system inspection,Residential Solar PV,34.101697,-118.146646,18.0,180.0,Closed,origin_homeowner,Field Service,IndaSpec Solar Service Team,"comment_from_another_agent,comment_from_end-us...",2018-12-20 06:49:02,2019-02-09,Green Tech Solutions Inc.,2014-04-08 00:00:00
4,27384,101113056,inspect system,root_cause_normal_wear_and_tear,system inspection,Residential Solar PV,34.101697,-118.146646,18.0,90.0,Closed,origin_homeowner,Field Service,IndaSpec Solar Service Team,"comment_from_another_agent,comment_from_end-us...",2018-12-20 06:49:02,2019-02-09,Green Tech Solutions Inc.,2014-04-08 00:00:00


Let's start by converting everything to lowercase to minimze confusion and redundancy.

In [24]:
dfeda101 = dfeda101.astype(str).apply(lambda x: x.str.lower())
dfeda101.head()

Unnamed: 0,Ticket_Id,Asset_Id,Root_Cause,Ticket_Creation_Reason,asset_type,latitude,longitude,tilt,azimuth,Ticket_Origin,Service_Type,Date_Ticket_Initially_Assigned,Date_Ticket_Closed,installed_by,installation_date
0,23947,101112604,root_cause_normal_wear_and_tear,communication offline,residential solar pv,38.944642,-121.248833,37.0,190.0,origin_omnidian_customer,field service,2018-10-22 17:25:27,2018-11-29,williams lifetime builders inc. dba lifetime s...,2013-04-04 00:00:00
1,23947,101112604,root_cause_normal_wear_and_tear,communication offline,residential solar pv,38.944642,-121.248833,37.0,190.0,origin_omnidian_customer,field service,2018-10-22 17:25:27,2018-11-29,williams lifetime builders inc. dba lifetime s...,2013-04-04 00:00:00
2,27384,101113056,root_cause_normal_wear_and_tear,system inspection,residential solar pv,34.101697,-118.146646,18.0,180.0,origin_homeowner,field service,2018-12-20 06:49:02,2019-02-09,green tech solutions inc.,2014-04-08 00:00:00
3,27384,101113056,root_cause_normal_wear_and_tear,system inspection,residential solar pv,34.101697,-118.146646,18.0,180.0,origin_homeowner,field service,2018-12-20 06:49:02,2019-02-09,green tech solutions inc.,2014-04-08 00:00:00
4,27384,101113056,root_cause_normal_wear_and_tear,system inspection,residential solar pv,34.101697,-118.146646,18.0,90.0,origin_homeowner,field service,2018-12-20 06:49:02,2019-02-09,green tech solutions inc.,2014-04-08 00:00:00


In [5]:
data101.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 19 columns):
Ticket_Id                         861 non-null int64
Asset_Id                          861 non-null int64
Resolution_Code                   861 non-null object
Root_Cause                        861 non-null object
Ticket_Creation_Reason            861 non-null object
asset_type                        861 non-null object
latitude                          861 non-null float64
longitude                         861 non-null float64
tilt                              861 non-null float64
azimuth                           861 non-null float64
Ticket_Status                     861 non-null object
Ticket_Origin                     861 non-null object
Service_Type                      861 non-null object
Service_Partner                   861 non-null object
Ticket_Tags                       861 non-null object
Date_Ticket_Initially_Assigned    861 non-null object
Date_Ticket_Closed           

Assign target to Root_Cause and Train-Test-Split

In [31]:
X = data101.drop('Root_Cause', axis=1)
y = data101['Root_Cause']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [32]:
X_test

Unnamed: 0,Ticket_Id,Asset_Id,Resolution_Code,Ticket_Creation_Reason,asset_type,latitude,longitude,tilt,azimuth,Ticket_Status,Ticket_Origin,Service_Type,Service_Partner,Ticket_Tags,Date_Ticket_Initially_Assigned,Date_Ticket_Closed,installed_by,installation_date
714,13043,101112438,inspect system,zero production,Residential Solar PV,39.925983,-75.595193,30.0,178.0,Closed,origin_omnidian_monitoring_team,Field Service,SunSystem Technology,"comment_from_another_agent,communication_statu...",2018-04-19 20:53:44,2018-06-15,Heat Shed a GeoGenix Company,2012-08-24 00:00:00
605,8541,101112943,Omnidian cancelled ticket,zero production,Residential Solar PV,33.507058,-112.381158,23.0,200.0,Closed,origin_omnidian_monitoring_team,Field Service,SunSystem Technology,"communication_status_communicating_no_change,o...",2018-01-30 16:32:36,2018-05-05,Arizona Solar Solutions,2014-02-11 00:00:00
120,20886,101111969,replace monitoring hardware,communication offline,Residential Solar PV,32.596853,-117.032791,14.0,172.0,Closed,origin_homeowner,Field Service,SunUp STS Service Team,"comment_from_another_agent,communication_statu...",2018-09-05 01:54:10,2018-10-20,Solar Alliance of America Inc.,2012-10-05 00:00:00
208,2743,101112404,replace monitoring hardware,communication offline,Residential Solar PV,40.680359,-75.329853,30.0,180.0,Closed,origin_omnidian_monitoring_team,Field Service,Power Overhaul,"closed_by_omnidian,communication_status_commun...",2017-08-14 17:15:28,2018-01-24,Sonic Solar Energy,2013-01-08 00:00:00
380,14316,101113072,replace inverter,zero production,Residential Solar PV,34.084329,-118.000602,18.0,225.0,Closed,origin_omnidian_monitoring_team,Field Service,SunSystem Technology,"comment_from_another_agent,comment_from_end-us...",2018-05-09 19:05:02,2018-07-07,Green Tech Solutions Inc.,2014-02-25 00:00:00
816,7622,101113290,incorrect homeowner information,zero production,Residential Solar PV,33.513388,-117.337312,18.0,250.0,Closed,origin_omnidian_monitoring_team,Field Service,HelioPower,"comment_from_another_agent,origin_omnidian_mon...",2018-01-02 19:23:37,2018-03-28,Green Tech Solutions Inc.,2015-06-19 00:00:00
575,20710,101112597,reset monitoring hardware,communication offline,Residential Solar PV,37.041301,-121.606724,23.0,180.0,Closed,origin_omnidian_monitoring_team,Field Service,SunUp STS Service Team,"comment_from_another_agent,comment_from_end-us...",2018-08-29 21:07:51,2018-10-06,Andromeda RETIRING DO NOT USE,2013-03-25 00:00:00
266,28463,101112914,replace monitoring hardware,communication offline,Residential Solar PV,40.453195,-74.396525,18.0,170.0,Closed,origin_omnidian_customer,Field Service,Power Overhaul,"comment_from_another_agent,communication_statu...",2019-01-11 00:06:37,2019-04-02,Green Apple Energy,2014-05-09 00:00:00
557,5083,101113124,Omnidian cancelled ticket,communication offline,Residential Solar PV,33.780198,-116.546310,30.0,215.0,Closed,origin_omnidian_monitoring_team,Field Service,IndaSpec Solar Service Team,"omnidian_made_comment,origin_omnidian_monitori...",2017-10-26 18:14:22,2018-04-13,Potere Solar,2014-07-15 00:00:00
825,22237,101113232,activate monitoring hardware,communication offline,Residential Solar PV,33.679811,-112.615651,18.0,190.0,Closed,origin_omnidian_customer,Field Service,SunSystem Technology,"comment_from_another_agent,communication_statu...",2018-09-21 19:09:08,2018-12-08,Summerwindsolar LLC Phoenix,2015-02-25 00:00:00


In [33]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 173 entries, 714 to 7
Data columns (total 18 columns):
Ticket_Id                         173 non-null int64
Asset_Id                          173 non-null int64
Resolution_Code                   173 non-null object
Ticket_Creation_Reason            173 non-null object
asset_type                        173 non-null object
latitude                          173 non-null float64
longitude                         173 non-null float64
tilt                              173 non-null float64
azimuth                           173 non-null float64
Ticket_Status                     173 non-null object
Ticket_Origin                     173 non-null object
Service_Type                      173 non-null object
Service_Partner                   173 non-null object
Ticket_Tags                       173 non-null object
Date_Ticket_Initially_Assigned    173 non-null object
Date_Ticket_Closed                173 non-null object
installed_by                 

One hot encoding: since our data is largely categorical but our models require numeric inputs, we one hot encode it.


In [29]:
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
encoder.fit(X_train)

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [8]:
# X_train = pd.DataFrame(encoder.transform(X_train).toarray(),
#                          columns=encoder.get_feature_names())
# X_test = pd.DataFrame(encoder.transform(X_test).toarray(),
#                         columns=encoder.get_feature_names())

In [35]:
X_test = pd.DataFrame(encoder.transform(X_test).toarray(),
                        columns=encoder.get_feature_names())

In [36]:
X_test

Unnamed: 0,x0_1866,x0_2015,x0_2591,x0_2743,x0_2766,x0_2809,x0_2820,x0_2926,x0_3220,x0_3664,...,x17_2016-03-09 00:00:00,x17_2016-03-16 00:00:00,x17_2016-03-21 00:00:00,x17_2016-03-22 00:00:00,x17_2016-04-27 00:00:00,x17_2016-05-11 00:00:00,x17_2016-05-18 00:00:00,x17_2016-06-29 00:00:00,x17_2016-07-01 00:00:00,x17_2016-08-03 00:00:00
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Training our models.

In [9]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [10]:
filename = '101_knn.pkl'
pickle.dump(knn, open(filename, 'wb'))

In [11]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
filename = '101_lr.pkl'
pickle.dump(lr, open(filename, 'wb'))

In [13]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=30, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [14]:
filename = '101_gb.pkl'
pickle.dump(gb, open(filename, 'wb'))

In [15]:
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [16]:
filename = '101_dt.pkl'
pickle.dump(dt, open(filename, 'wb'))

In [17]:
bg = BaggingClassifier(random_state=25565)
bg.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=25565, verbose=0,
                  warm_start=False)

In [18]:
filename = '101_bg.pkl'
pickle.dump(bg, open(filename, 'wb'))

In [19]:
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train);

In [20]:
filename = '101_rf.pkl'
pickle.dump(rf, open(filename, 'wb'))

Let's show our results

In [21]:
knn.score(X_test, y_test)

0.6820809248554913

In [22]:
lr.score(X_test, y_test)

0.8959537572254336

In [23]:
gb.score(X_test, y_test)

0.7456647398843931

In [24]:
dt.score(X_test, y_test)

0.6936416184971098

In [25]:
bg.score(X_test, y_test)

0.9248554913294798

In [26]:
rf.score(X_test, y_test)

0.9132947976878613

What may be some other classifiers?

In [27]:
# from sklearn.utils.testing import all_estimators

In [56]:
rf.predict_proba(X_test)[0]

array([0.  , 0.  , 0.  , 0.01, 0.  , 0.11, 0.  , 0.88, 0.  , 0.  ])

In [60]:
rf.predict(X_test)[0:5]

array(['root_cause_normal_wear_and_tear',
       'root_cause_major_component_failure_warranty',
       'root_cause_normal_wear_and_tear',
       'root_cause_normal_wear_and_tear',
       'root_cause_major_component_failure_warranty'], dtype=object)

In [94]:
l_props = lr.predict_proba(X_test)[1]

In [100]:
zipp = list(zip(lr.classes_, l_props))

In [101]:
zipp

[('root_cause_design/sale_issue', 0.00043387759236169344),
 ('root_cause_environmental', 0.004206441640046431),
 ('root_cause_homeowner', 0.014217294259570568),
 ('root_cause_installer_workmanship', 0.004648355311296379),
 ('root_cause_major_component_failure_non-warranty', 0.0036398455245150647),
 ('root_cause_major_component_failure_warranty', 0.9000584314884308),
 ('root_cause_non-service_support', 0.027058029061024055),
 ('root_cause_normal_wear_and_tear', 0.0411124777776664),
 ('root_cause_roof_issue', 0.0027033523841786256),
 ('root_cause_service_workmanship', 0.0019218949609099132)]

In [110]:
#want a dictionary that gives probability for each class, 
#prediction, and ground truth
def display_preds_truth(model, obs, X_test, y_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
#     display['prediction'] = model.predict(X_test)[obs]
    display['ground truth'] = y_test[obs]
    return display
        
    
    

In [129]:
#want a dictionary that gives probability for each class,
def display_probas(model, obs, X_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
    display['prediction'] = model.predict(X_test)[obs]
    return display

In [130]:
display_probas(lr, 1, X_test)

{'root_cause_design/sale_issue': 0.00043387759236169344,
 'root_cause_environmental': 0.004206441640046431,
 'root_cause_homeowner': 0.014217294259570568,
 'root_cause_installer_workmanship': 0.004648355311296379,
 'root_cause_major_component_failure_non-warranty': 0.0036398455245150647,
 'root_cause_major_component_failure_warranty': 0.9000584314884308,
 'root_cause_non-service_support': 0.027058029061024055,
 'root_cause_normal_wear_and_tear': 0.0411124777776664,
 'root_cause_roof_issue': 0.0027033523841786256,
 'root_cause_service_workmanship': 0.0019218949609099132,
 'prediction': 'root_cause_major_component_failure_warranty'}

In [124]:
y_test

714                    root_cause_normal_wear_and_tear
605        root_cause_major_component_failure_warranty
120                    root_cause_normal_wear_and_tear
208                    root_cause_normal_wear_and_tear
380        root_cause_major_component_failure_warranty
816        root_cause_major_component_failure_warranty
575                    root_cause_normal_wear_and_tear
266        root_cause_major_component_failure_warranty
557                    root_cause_normal_wear_and_tear
825                    root_cause_normal_wear_and_tear
580                    root_cause_normal_wear_and_tear
327                    root_cause_normal_wear_and_tear
623    root_cause_major_component_failure_non-warranty
365        root_cause_major_component_failure_warranty
827                     root_cause_non-service_support
198                    root_cause_normal_wear_and_tear
456                    root_cause_normal_wear_and_tear
465                    root_cause_normal_wear_and_tear
311       