# *Inaugural classification models.

### Based on features 'ticket_id', 'asset_id', 'root_cause', 'ticket_creation_reason','asset_type', 'latitude', 'longitude', 'tilt', 'azimuth','ticket_origin', 'date_ticket_initially_assigned', 'date_ticket_closed', 'installed_by', and 'installation_date' from Omnidian database 101, we compare K-Nearest Neighbors, Gradient Boosting, Random Forest, Bagging, and Logistic Regression.

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, log_loss, f1_score, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')

In [2]:
import pickle

Get Data

In [5]:
data101 = pd.read_csv('../data/eda101_a.csv')

In [6]:
data101.head()

Unnamed: 0,ticket_id,asset_id,root_cause,ticket_creation_reason,latitude,longitude,tilt,azimuth,inverter_count,module_count,ticket_origin,service_partner,date_ticket_initially_assigned,installed_by,installation_date
0,23947,101112604,root_cause_normal_wear_and_tear,communication offline,38.944642,-121.248833,37.0,190.0,1,21,origin_omnidian_customer,sunup sts service team,2018/10/22,williams lifetime builders inc. dba lifetime s...,2013/04/04
1,27384,101113056,root_cause_normal_wear_and_tear,system inspection,34.101697,-118.146646,18.0,180.0,1,15,origin_homeowner,indaspec solar service team,2018/12/20,green tech solutions inc.,2014/04/08
2,27384,101113056,root_cause_normal_wear_and_tear,system inspection,34.101697,-118.146646,18.0,180.0,1,5,origin_homeowner,indaspec solar service team,2018/12/20,green tech solutions inc.,2014/04/08
3,27384,101113056,root_cause_normal_wear_and_tear,system inspection,34.101697,-118.146646,18.0,90.0,1,5,origin_homeowner,indaspec solar service team,2018/12/20,green tech solutions inc.,2014/04/08
4,22820,101112180,root_cause_normal_wear_and_tear,communication offline,34.075427,-117.16714,23.0,180.0,1,24,origin_omnidian_customer,sunsystem technology,2018/10/04,horizon solar power,2013/01/03


In [7]:
data101.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 15 columns):
ticket_id                         710 non-null int64
asset_id                          710 non-null int64
root_cause                        710 non-null object
ticket_creation_reason            710 non-null object
latitude                          710 non-null float64
longitude                         710 non-null float64
tilt                              710 non-null float64
azimuth                           710 non-null float64
inverter_count                    710 non-null int64
module_count                      710 non-null int64
ticket_origin                     710 non-null object
service_partner                   710 non-null object
date_ticket_initially_assigned    710 non-null object
installed_by                      710 non-null object
installation_date                 710 non-null object
dtypes: float64(4), int64(4), object(7)
memory usage: 83.3+ KB


In [8]:
data101.ticket_id.nunique()

476

In [9]:
data101.asset_id.nunique()

344

Let's start by converting everything to lowercase to minimze confusion and redundancy.

In [12]:
data101.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 15 columns):
ticket_id                         710 non-null int64
asset_id                          710 non-null int64
root_cause                        710 non-null object
ticket_creation_reason            710 non-null object
latitude                          710 non-null float64
longitude                         710 non-null float64
tilt                              710 non-null float64
azimuth                           710 non-null float64
inverter_count                    710 non-null int64
module_count                      710 non-null int64
ticket_origin                     710 non-null object
service_partner                   710 non-null object
date_ticket_initially_assigned    710 non-null object
installed_by                      710 non-null object
installation_date                 710 non-null object
dtypes: float64(4), int64(4), object(7)
memory usage: 83.3+ KB


In [13]:
data101.columns

Index(['ticket_id', 'asset_id', 'root_cause', 'ticket_creation_reason',
       'latitude', 'longitude', 'tilt', 'azimuth', 'inverter_count',
       'module_count', 'ticket_origin', 'service_partner',
       'date_ticket_initially_assigned', 'installed_by', 'installation_date'],
      dtype='object')

Assign target to Root_Cause and Train-Test-Split

In [14]:
X = data101.drop('root_cause', axis=1).copy()
y = data101['root_cause']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [15]:
# X_test.head()

In [16]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 294 to 56
Data columns (total 14 columns):
ticket_id                         142 non-null int64
asset_id                          142 non-null int64
ticket_creation_reason            142 non-null object
latitude                          142 non-null float64
longitude                         142 non-null float64
tilt                              142 non-null float64
azimuth                           142 non-null float64
inverter_count                    142 non-null int64
module_count                      142 non-null int64
ticket_origin                     142 non-null object
service_partner                   142 non-null object
date_ticket_initially_assigned    142 non-null object
installed_by                      142 non-null object
installation_date                 142 non-null object
dtypes: float64(4), int64(4), object(6)
memory usage: 16.6+ KB


One hot encoding: since our data is largely categorical but our models require numeric inputs, we one hot encode it.


In [17]:
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
encoder.fit(X_train)

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [18]:
X_train = pd.DataFrame(encoder.transform(X_train).toarray(),
                         columns=encoder.get_feature_names())
X_test = pd.DataFrame(encoder.transform(X_test).toarray(),
                        columns=encoder.get_feature_names())

## Training our models.

In [24]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [25]:
knn.score(X_test, y_test)

0.5563380281690141

In [26]:
knn_pred = knn.predict(X_test)

In [33]:
f1_score(knn_pred, y_test, average='weighted')

  'recall', 'true', average, warn_for)


0.5762513423508293

In [35]:
filename = '101_knn.pkl'
pickle.dump(knn, open(filename, 'wb'))

In [36]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [38]:
lr_pred = lr.predict(X_test)

In [39]:
lr.score(X_test, y_test)

0.7394366197183099

In [40]:
f1_score(lr_pred, y_test, average='weighted')

  'recall', 'true', average, warn_for)


0.7612973688621895

In [41]:
filename = '101_lr.pkl'
pickle.dump(lr, open(filename, 'wb'))

In [42]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=30, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [44]:
gb.score(X_test, y_test)

0.5704225352112676

In [45]:
gb_pred = gb.predict(X_test)

In [None]:
f1_score(lr_pred, y_test, average='weighted')

In [19]:
# filename = '101_gb.pkl'
# pickle.dump(gb, open(filename, 'wb'))

In [20]:
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [21]:
# filename = '101_dt.pkl'
# pickle.dump(dt, open(filename, 'wb'))

In [22]:
bg = BaggingClassifier(random_state=25565)
bg.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=25565, verbose=0,
                  warm_start=False)

In [23]:
# filename = '101_bg.pkl'
# pickle.dump(bg, open(filename, 'wb'))

In [24]:
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train);

In [25]:
# filename = '101_rf.pkl'
# pickle.dump(rf, open(filename, 'wb'))

Let's show our results

In [26]:
knn.score(X_test, y_test)

0.5818181818181818

In [27]:
lr.score(X_test, y_test)

0.8242424242424242

In [28]:
gb.score(X_test, y_test)

0.6121212121212121

In [29]:
dt.score(X_test, y_test)

0.593939393939394

In [30]:
bg.score(X_test, y_test)

0.8363636363636363

In [31]:
rf.score(X_test, y_test)

0.8303030303030303

What may be some other classifiers?

In [27]:
# from sklearn.utils.testing import all_estimators

In [56]:
rf.predict_proba(X_test)[0]

array([0.  , 0.  , 0.  , 0.01, 0.  , 0.11, 0.  , 0.88, 0.  , 0.  ])

In [60]:
rf.predict(X_test)[0:5]

array(['root_cause_normal_wear_and_tear',
       'root_cause_major_component_failure_warranty',
       'root_cause_normal_wear_and_tear',
       'root_cause_normal_wear_and_tear',
       'root_cause_major_component_failure_warranty'], dtype=object)

In [94]:
l_props = lr.predict_proba(X_test)[1]

In [100]:
zipp = list(zip(lr.classes_, l_props))

In [101]:
zipp

[('root_cause_design/sale_issue', 0.00043387759236169344),
 ('root_cause_environmental', 0.004206441640046431),
 ('root_cause_homeowner', 0.014217294259570568),
 ('root_cause_installer_workmanship', 0.004648355311296379),
 ('root_cause_major_component_failure_non-warranty', 0.0036398455245150647),
 ('root_cause_major_component_failure_warranty', 0.9000584314884308),
 ('root_cause_non-service_support', 0.027058029061024055),
 ('root_cause_normal_wear_and_tear', 0.0411124777776664),
 ('root_cause_roof_issue', 0.0027033523841786256),
 ('root_cause_service_workmanship', 0.0019218949609099132)]

In [110]:
#want a dictionary that gives probability for each class, 
#prediction, and ground truth
def display_preds_truth(model, obs, X_test, y_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
#     display['prediction'] = model.predict(X_test)[obs]
    display['ground truth'] = y_test[obs]
    return display
        
    
    

In [129]:
#want a dictionary that gives probability for each class,
def display_probas(model, obs, X_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
    display['prediction'] = model.predict(X_test)[obs]
    return display

In [130]:
display_probas(lr, 1, X_test)

{'root_cause_design/sale_issue': 0.00043387759236169344,
 'root_cause_environmental': 0.004206441640046431,
 'root_cause_homeowner': 0.014217294259570568,
 'root_cause_installer_workmanship': 0.004648355311296379,
 'root_cause_major_component_failure_non-warranty': 0.0036398455245150647,
 'root_cause_major_component_failure_warranty': 0.9000584314884308,
 'root_cause_non-service_support': 0.027058029061024055,
 'root_cause_normal_wear_and_tear': 0.0411124777776664,
 'root_cause_roof_issue': 0.0027033523841786256,
 'root_cause_service_workmanship': 0.0019218949609099132,
 'prediction': 'root_cause_major_component_failure_warranty'}