### Based on features from Omnidian database 101, we compare K-Nearest Neighbors, Gradient Boosting, Random Forest, Bagging, and Logistic Regression.

In [86]:
!pip install cloudpickle



In [87]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import precision_recall_curve, log_loss, roc_curve
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn import tree
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')

In [88]:
import pickle

### Get Data

In [3]:
df101_b = pd.read_csv('data/eda101.csv')

In [4]:
# df101_b.head()

In [5]:
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
Ticket_Id                         825 non-null int64
Asset_Id                          825 non-null int64
Root_Cause                        825 non-null object
Ticket_Creation_Reason            825 non-null object
asset_type                        825 non-null object
latitude                          825 non-null float64
longitude                         825 non-null float64
tilt                              825 non-null float64
azimuth                           825 non-null float64
Ticket_Origin                     825 non-null object
Date_Ticket_Initially_Assigned    825 non-null object
Date_Ticket_Closed                825 non-null object
installed_by                      825 non-null object
installation_date                 825 non-null object
dtypes: float64(4), int64(2), object(8)
memory usage: 90.3+ KB


### Let's change everything to lowercase to reduce confusions.

In [6]:
df101_b.rename(str.lower, axis='columns', inplace=True)

In [7]:
df101_b = df101_b.applymap(lambda s:s.lower() if type(s) == str else s)


In [8]:
# df101_b.head()

### Both ticket_id and asset_id need to be strings

In [9]:
df101_b[['ticket_id', 'asset_id']] = df101_b[['ticket_id', 'asset_id']].astype(str)
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
ticket_id                         825 non-null object
asset_id                          825 non-null object
root_cause                        825 non-null object
ticket_creation_reason            825 non-null object
asset_type                        825 non-null object
latitude                          825 non-null float64
longitude                         825 non-null float64
tilt                              825 non-null float64
azimuth                           825 non-null float64
ticket_origin                     825 non-null object
date_ticket_initially_assigned    825 non-null object
date_ticket_closed                825 non-null object
installed_by                      825 non-null object
installation_date                 825 non-null object
dtypes: float64(4), object(10)
memory usage: 90.3+ KB


### Now drop duplicates

In [10]:
df101_b.drop_duplicates(inplace=True)

In [11]:
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 0 to 823
Data columns (total 14 columns):
ticket_id                         638 non-null object
asset_id                          638 non-null object
root_cause                        638 non-null object
ticket_creation_reason            638 non-null object
asset_type                        638 non-null object
latitude                          638 non-null float64
longitude                         638 non-null float64
tilt                              638 non-null float64
azimuth                           638 non-null float64
ticket_origin                     638 non-null object
date_ticket_initially_assigned    638 non-null object
date_ticket_closed                638 non-null object
installed_by                      638 non-null object
installation_date                 638 non-null object
dtypes: float64(4), object(10)
memory usage: 74.8+ KB


In [12]:
df101_b.isnull().any()

ticket_id                         False
asset_id                          False
root_cause                        False
ticket_creation_reason            False
asset_type                        False
latitude                          False
longitude                         False
tilt                              False
azimuth                           False
ticket_origin                     False
date_ticket_initially_assigned    False
date_ticket_closed                False
installed_by                      False
installation_date                 False
dtype: bool

## Because we have an inbalanced class distribution, we'll need to vectorize all our features befor splitting. We'll use LabelEncoder for our target column, then we'll one-hot-encode the rest of the categorical features.

In [13]:
y = df101_b['root_cause'].copy()

In [72]:
le = LabelEncoder()
y_le = le.fit_transform(y)
# Let's make sure we can get the names back.
list(le.inverse_transform(y_le))


['root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_major_component_failure_warranty',
 'root_cause_normal_wear_and_tear',
 'root_cause_non-service_support',
 'root_cause_roof_issue',
 'root_cause_major_component_failure_warranty',
 'root_cause_major_component_failure_warranty',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_non-service_support',
 'root_cause_non-service_support',
 'root_cause_design/sale_issue',
 'root_cause_design/sale_issue',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_cause_normal_wear_and_tear',
 'root_caus

In [73]:
y.shape

(638,)

In [16]:
X = df101_b.drop(['root_cause'], axis=1).copy()

In [17]:
# # List our categorical features
categoricals = list(X.columns[(X.dtypes.values == np.dtype('object'))])
categoricals

['ticket_id',
 'asset_id',
 'ticket_creation_reason',
 'asset_type',
 'ticket_origin',
 'date_ticket_initially_assigned',
 'date_ticket_closed',
 'installed_by',
 'installation_date']

In [18]:
# List our numerical features
nums = list(df101_b.columns[(df101_b.dtypes.values != np.dtype('object'))])
nums

['latitude', 'longitude', 'tilt', 'azimuth']

In [19]:
df_nums = df101_b[nums].copy()

In [20]:
cats = df101_b[categoricals].copy()

In [21]:
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
encoder.fit(cats)

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [22]:
enc_cats = pd.DataFrame(encoder.transform(cats).toarray(),
                       columns=encoder.get_feature_names())

### Now we join the numerical features together.

In [23]:
X = df_nums.join(enc_cats)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 0 to 823
Columns: 1926 entries, latitude to x8_2016-08-03 00:00:00
dtypes: float64(1926)
memory usage: 9.4 MB


### Let's be sure our binary zeroes didn't become nans or nulls.

In [42]:
X.isnull().any()

latitude                  False
longitude                 False
tilt                      False
azimuth                   False
x0_10006                   True
x0_10009                   True
x0_10010                   True
x0_10046                   True
x0_10088                   True
x0_10336                   True
x0_10337                   True
x0_10361                   True
x0_10363                   True
x0_10417                   True
x0_10461                   True
x0_10551                   True
x0_10657                   True
x0_10663                   True
x0_10707                   True
x0_10747                   True
x0_10752                   True
x0_10755                   True
x0_10908                   True
x0_10909                   True
x0_10910                   True
x0_10912                   True
x0_10913                   True
x0_10992                   True
x0_11041                   True
x0_11253                   True
                          ...  
x8_2015-

In [45]:
X.isna().any()

latitude                  False
longitude                 False
tilt                      False
azimuth                   False
x0_10006                   True
x0_10009                   True
x0_10010                   True
x0_10046                   True
x0_10088                   True
x0_10336                   True
x0_10337                   True
x0_10361                   True
x0_10363                   True
x0_10417                   True
x0_10461                   True
x0_10551                   True
x0_10657                   True
x0_10663                   True
x0_10707                   True
x0_10747                   True
x0_10752                   True
x0_10755                   True
x0_10908                   True
x0_10909                   True
x0_10910                   True
x0_10912                   True
x0_10913                   True
x0_10992                   True
x0_11041                   True
x0_11253                   True
                          ...  
x8_2015-

### Yes they did. Let's put our zeroes back in place.

In [46]:
X.fillna(0, inplace=True)

In [47]:
X.isnull().any()

latitude                  False
longitude                 False
tilt                      False
azimuth                   False
x0_10006                  False
x0_10009                  False
x0_10010                  False
x0_10046                  False
x0_10088                  False
x0_10336                  False
x0_10337                  False
x0_10361                  False
x0_10363                  False
x0_10417                  False
x0_10461                  False
x0_10551                  False
x0_10657                  False
x0_10663                  False
x0_10707                  False
x0_10747                  False
x0_10752                  False
x0_10755                  False
x0_10908                  False
x0_10909                  False
x0_10910                  False
x0_10912                  False
x0_10913                  False
x0_10992                  False
x0_11041                  False
x0_11253                  False
                          ...  
x8_2015-

### Next we split for training and testing.

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y_le, random_state=42, test_size=0.2)

In [49]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 764 to 131
Columns: 1926 entries, latitude to x8_2016-08-03 00:00:00
dtypes: float64(1926)
memory usage: 7.5 MB


In [50]:
y_train.shape

(510,)

In [51]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128 entries, 324 to 325
Columns: 1926 entries, latitude to x8_2016-08-03 00:00:00
dtypes: float64(1926)
memory usage: 1.9 MB


In [52]:
y_test.shape

(128,)

### Let's make sure we can get our root_cause names back in place for human readability.

In [78]:
y_train_names = list(le.inverse_transform(y_train))
len(y_train_names)

510

In [79]:
y_test_names = list(le.inverse_transform(y_test))
len(y_test_names)

128

## Modeling.

In [64]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=30, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [68]:
gb_pred = gb.predict(X_test)

In [69]:
gb.score(X_test, y_test)

0.4765625

In [81]:
print(classification_report(y_test, gb_pred))


              precision    recall  f1-score   support

           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         2
           5       0.26      0.38      0.31        21
           6       0.50      0.15      0.24        13
           7       0.57      0.73      0.64        70
           8       0.00      0.00      0.00         7
           9       0.00      0.00      0.00         1

    accuracy                           0.48       128
   macro avg       0.15      0.14      0.13       128
weighted avg       0.40      0.48      0.42       128



  'precision', 'predicted', average, warn_for)


In [84]:
from sklearn.metrics import balanced_accuracy_score

In [85]:
balanced_accuracy_score(y_test, gb_pred)

0.14037444037444036

In [82]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [83]:
knn.score(X_test, y_test)

0.3828125

In [None]:
# filename = '101_knn.pkl'
# pickle.dump(knn, open(filename, 'wb'))

In [None]:
# kn = pickle.load

In [89]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [90]:
lr.score(X_test, y_test)

0.546875

In [None]:
# filename = '101_lr.pkl'
# pickle.dump(lr, open(filename, 'wb'))

In [None]:
# filename = '101_gb.pkl'
# pickle.dump(gb, open(filename, 'wb'))

In [91]:
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30)
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=30, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [92]:
dt.score(X_test, y_test)

0.484375

In [None]:
# filename = '101_dt.pkl'
# pickle.dump(dt, open(filename, 'wb'))

In [93]:
bg = BaggingClassifier(random_state=25565)
bg.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=1.0, max_samples=1.0, n_estimators=10,
                  n_jobs=None, oob_score=False, random_state=25565, verbose=0,
                  warm_start=False)

In [94]:
bg.score(X_test, y_test)

0.53125

In [None]:
# filename = '101_bg.pkl'
# pickle.dump(bg, open(filename, 'wb'))

In [95]:
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train);

In [96]:
rf.score(X_test, y_test)

0.53125

In [None]:
# filename = '101_rf.pkl'
# pickle.dump(rf, open(filename, 'wb'))

Let's show our results

In [None]:
knn.score(X_test, y_test)

In [None]:
lr.score(X_test, y_test)

In [None]:
gb.score(X_test, y_test)

In [None]:
dt.score(X_test, y_test)

In [None]:
bg.score(X_test, y_test)

In [None]:
rf.score(X_test, y_test)

In [None]:
from sklearn.utils.testing import all_estimators

In [97]:
rf.predict_proba(X_test)[0]

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.09, 0.01, 0.9 , 0.  , 0.  ])

In [None]:
rf.predict(X_test)[0:5]

In [99]:
l_props = lr.predict_proba(X_test)[1]

In [100]:
zipp = list(zip(lr.classes_, l_props))

In [101]:
#want a dictionary that gives probability for each class, 
#prediction, and ground truth
def display_preds_truth(model, obs, X_test, y_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
#     display['prediction'] = model.predict(X_test)[obs]
    display['ground truth'] = y_test[obs]
    return display
        
    
    

In [102]:
#want a dictionary that gives probability for each class,
def display_probas(model, obs, X_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
    display['prediction'] = model.predict(X_test)[obs]
    return display

In [103]:
display_probas(lr, 1, X_test)

{0: 0.0020976295183632136,
 1: 0.007567920462641144,
 2: 0.027824670706177567,
 3: 0.005245495449828084,
 4: 0.006648726156005691,
 5: 0.04240686532491433,
 6: 0.05344159118190062,
 7: 0.8435075132221252,
 8: 0.009345673007980008,
 9: 0.0019139149700641534,
 'prediction': 7}