### Based on features from Omnidian database 101, we compare K-Nearest Neighbors, Gradient Boosting, Random Forest, Bagging, and Logistic Regression.

In [126]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import precision_recall_curve, log_loss, roc_curve
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn import tree
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')

In [2]:
import pickle

Get Data

In [3]:
df101_b = pd.read_csv('data/eda101.csv', parse_dates=['Date_Ticket_Initially_Assigned', 'Date_Ticket_Closed', 'installation_date'])

In [4]:
# df101_b.head()

In [5]:
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
Ticket_Id                         825 non-null int64
Asset_Id                          825 non-null int64
Root_Cause                        825 non-null object
Ticket_Creation_Reason            825 non-null object
asset_type                        825 non-null object
latitude                          825 non-null float64
longitude                         825 non-null float64
tilt                              825 non-null float64
azimuth                           825 non-null float64
Ticket_Origin                     825 non-null object
Date_Ticket_Initially_Assigned    825 non-null datetime64[ns]
Date_Ticket_Closed                825 non-null datetime64[ns]
installed_by                      825 non-null object
installation_date                 825 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), int64(2), object(5)
memory usage: 90.3+ KB


Let's change everything to lowercase to reduce confusions.

In [58]:
df101_b.rename(str.lower, axis='columns', inplace=True)

In [60]:
df101_b = df101_b.applymap(lambda s:s.lower() if type(s) == str else s)


In [61]:
# df101_b.head()

Unnamed: 0,ticket_id,asset_id,root_cause,ticket_creation_reason,asset_type,latitude,longitude,tilt,azimuth,ticket_origin,date_ticket_initially_assigned,date_ticket_closed,installed_by,installation_date
0,23947,101112604,root_cause_normal_wear_and_tear,communication offline,residential solar pv,38.944642,-121.248833,37.0,190.0,origin_omnidian_customer,2018-10-22 17:25:27,2018-11-29,williams lifetime builders inc. dba lifetime s...,2013-04-04
2,27384,101113056,root_cause_normal_wear_and_tear,system inspection,residential solar pv,34.101697,-118.146646,18.0,180.0,origin_homeowner,2018-12-20 06:49:02,2019-02-09,green tech solutions inc.,2014-04-08
4,27384,101113056,root_cause_normal_wear_and_tear,system inspection,residential solar pv,34.101697,-118.146646,18.0,90.0,origin_homeowner,2018-12-20 06:49:02,2019-02-09,green tech solutions inc.,2014-04-08
5,22820,101112180,root_cause_normal_wear_and_tear,communication offline,residential solar pv,34.075427,-117.16714,23.0,180.0,origin_omnidian_customer,2018-10-04 19:24:52,2018-11-20,horizon solar power,2013-01-03
6,8568,101111679,root_cause_normal_wear_and_tear,communication offline,residential solar pv,33.691802,-112.28623,23.0,240.0,origin_omnidian_customer,2018-01-30 17:30:09,2018-03-13,arizona solar solutions,2012-12-27


### Both ticket_id and asset_id need to be strings

In [62]:
df101_b[['ticket_id', 'asset_id']] = df101_b[['ticket_id', 'asset_id']].astype(str)
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 0 to 823
Data columns (total 14 columns):
ticket_id                         638 non-null object
asset_id                          638 non-null object
root_cause                        638 non-null object
ticket_creation_reason            638 non-null object
asset_type                        638 non-null object
latitude                          638 non-null float64
longitude                         638 non-null float64
tilt                              638 non-null float64
azimuth                           638 non-null float64
ticket_origin                     638 non-null object
date_ticket_initially_assigned    638 non-null datetime64[ns]
date_ticket_closed                638 non-null datetime64[ns]
installed_by                      638 non-null object
installation_date                 638 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), object(7)
memory usage: 74.8+ KB


Now drop duplicates

In [63]:
df101_b.drop_duplicates(inplace=True)

In [116]:
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 0 to 823
Data columns (total 14 columns):
ticket_id                         638 non-null object
asset_id                          638 non-null object
root_cause                        638 non-null object
ticket_creation_reason            638 non-null object
asset_type                        638 non-null object
latitude                          638 non-null float64
longitude                         638 non-null float64
tilt                              638 non-null float64
azimuth                           638 non-null float64
ticket_origin                     638 non-null object
date_ticket_initially_assigned    638 non-null datetime64[ns]
date_ticket_closed                638 non-null datetime64[ns]
installed_by                      638 non-null object
installation_date                 638 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), object(7)
memory usage: 94.8+ KB


In [117]:
df101_b.isnull().any()

ticket_id                         False
asset_id                          False
root_cause                        False
ticket_creation_reason            False
asset_type                        False
latitude                          False
longitude                         False
tilt                              False
azimuth                           False
ticket_origin                     False
date_ticket_initially_assigned    False
date_ticket_closed                False
installed_by                      False
installation_date                 False
dtype: bool

## Because we have an inbalanced class distribution, we'll need to vectorize all our features befor splitting. We'll use LabelEncoder for our target column, then we'll one-hot-encode the rest of the categorical features.

In [125]:
y = df101_b['root_cause'].copy()

In [127]:
le = LabelEncoder()
y_le = le.fit_transform(y)

In [128]:
y.shape

(638,)

In [129]:
X = df101_b.drop(['root_cause'], axis=1).copy()

In [130]:
# # List our categorical features
categoricals = list(X.columns[(X.dtypes.values == np.dtype('object'))])
categoricals

['ticket_id',
 'asset_id',
 'ticket_creation_reason',
 'asset_type',
 'ticket_origin',
 'installed_by']

In [131]:
# List our numerical features
nums = list(df101_b.columns[(df101_b.dtypes.values != np.dtype('object'))])
nums

['latitude',
 'longitude',
 'tilt',
 'azimuth',
 'date_ticket_initially_assigned',
 'date_ticket_closed',
 'installation_date']

In [132]:
df_nums = df101_b[nums].copy()

In [133]:
cats = df101_b[categoricals].copy()

In [134]:
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
encoder.fit(cats)

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [135]:
enc_cats = pd.DataFrame(encoder.transform(cats).toarray(),
                       columns=encoder.get_feature_names())

### Now we join the numerical features together.

In [136]:
X = df_nums.join(enc_cats)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 0 to 823
Columns: 895 entries, latitude to x5_williams lifetime builders inc. dba lifetime solar
dtypes: datetime64[ns](3), float64(892)
memory usage: 4.4 MB


### Next we split for training and testing.

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X, y_le, random_state=42, test_size=0.2)

In [139]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 764 to 131
Columns: 895 entries, latitude to x5_williams lifetime builders inc. dba lifetime solar
dtypes: datetime64[ns](3), float64(892)
memory usage: 3.5 MB


In [142]:
y_train.shape

(510,)

In [140]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128 entries, 324 to 325
Columns: 895 entries, latitude to x5_williams lifetime builders inc. dba lifetime solar
dtypes: datetime64[ns](3), float64(892)
memory usage: 896.0 KB


In [143]:
y_test.shape

(128,)

### Assign target to Root_Cause, then Train-Test-Split. After we split and encode we'll take the ticket_id off so we can use it later to look rows up.

In [66]:
# X_t = df101_b.drop(['root_cause'], axis=1).copy()
# y_t = df101_b['root_cause']
# X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, random_state=42, test_size=0.2)

In [67]:
# train_ticket = X_train.ticket_id

In [68]:
# X_train.drop(['ticket_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [69]:
# X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 764 to 131
Data columns (total 12 columns):
asset_id                          510 non-null object
ticket_creation_reason            510 non-null object
asset_type                        510 non-null object
latitude                          510 non-null float64
longitude                         510 non-null float64
tilt                              510 non-null float64
azimuth                           510 non-null float64
ticket_origin                     510 non-null object
date_ticket_initially_assigned    510 non-null datetime64[ns]
date_ticket_closed                510 non-null datetime64[ns]
installed_by                      510 non-null object
installation_date                 510 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), object(5)
memory usage: 51.8+ KB


In [70]:
# test_ticket = X_test.ticket_id

In [71]:
# X_test.drop(['ticket_id'], axis=1, inplace=True)

In [72]:
# # List our categorical features
# categoricals = list(X_test.columns[(X_test.dtypes.values == np.dtype('object'))])
# categoricals

['asset_id',
 'ticket_creation_reason',
 'asset_type',
 'ticket_origin',
 'installed_by']

In [73]:
# X_train_cat = X_train[categoricals].copy()

In [74]:
# X_test_cat = X_test[categoricals].copy()

In [75]:
# X_nums = list(X_test.columns[(X_test.dtypes.values != np.dtype('object'))])
# X_nums

['latitude',
 'longitude',
 'tilt',
 'azimuth',
 'date_ticket_initially_assigned',
 'date_ticket_closed',
 'installation_date']

In [76]:
# X_train_num = X_train[X_nums].copy()
# X_train_num.head()

In [77]:
# X_test_num = X_test[X_nums].copy()
# X_test_num.head()

In [78]:
# encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
# encoder.fit(X_train_cat)

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [79]:
# X_train_enc = pd.DataFrame(encoder.transform(X_train_cat).toarray(),
#                          columns=encoder.get_feature_names())
# X_test_enc = pd.DataFrame(encoder.transform(X_test_cat).toarray(),
#                         columns=encoder.get_feature_names())

In [80]:
# X_train_enc.head()

Unnamed: 0,x0_101111473,x0_101111478,x0_101111489,x0_101111497,x0_101111521,x0_101111528,x0_101111533,x0_101111565,x0_101111573,x0_101111576,...,x4_sol-tek industries,x4_solar alliance of america inc.,x4_solar energy world nj,x4_solar plus llc,x4_sonic solar energy,x4_summerwindsolar llc phoenix,x4_sunstarter solar installations inc,x4_syntrol plumbing heating and air,x4_talbott solar home,x4_williams lifetime builders inc. dba lifetime solar
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
# X_train_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Columns: 368 entries, x0_101111473 to x4_williams lifetime builders inc. dba lifetime solar
dtypes: float64(368)
memory usage: 1.4 MB


Now we'll put the ticket_id, numerical columns, and encoded columns all together.

In [82]:
# X_train_num.insert(loc=0, column='ticket_id', value=train_ticket)


In [83]:
# X_train_mega = X_train_num.join(X_train_enc)
# X_train_mega.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 764 to 131
Columns: 376 entries, ticket_id to x4_williams lifetime builders inc. dba lifetime solar
dtypes: datetime64[ns](3), float64(372), object(1)
memory usage: 1.5+ MB


In [84]:
# X_test_num.insert(loc=0, column='ticket_id', value=test_ticket)
# X_test_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128 entries, 324 to 325
Data columns (total 8 columns):
ticket_id                         128 non-null object
latitude                          128 non-null float64
longitude                         128 non-null float64
tilt                              128 non-null float64
azimuth                           128 non-null float64
date_ticket_initially_assigned    128 non-null datetime64[ns]
date_ticket_closed                128 non-null datetime64[ns]
installation_date                 128 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), object(1)
memory usage: 9.0+ KB


In [85]:
# X_test_mega = X_test_num.join(X_test_enc)
# X_test_mega.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128 entries, 324 to 325
Columns: 376 entries, ticket_id to x4_williams lifetime builders inc. dba lifetime solar
dtypes: datetime64[ns](3), float64(372), object(1)
memory usage: 382.0+ KB


### Modeling

In [144]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train_enc, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=30, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [145]:
gb.pred = gb.predict(y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[1. 3. 7. 7. 7. 5. 5. 5. 7. 7. 7. 7. 7. 5. 7. 6. 7. 6. 5. 7. 5. 7. 7. 7.
 6. 5. 5. 7. 7. 7. 7. 7. 6. 7. 7. 2. 5. 7. 8. 7. 7. 8. 7. 7. 6. 6. 4. 3.
 8. 7. 7. 7. 7. 7. 5. 7. 2. 7. 7. 5. 7. 7. 1. 1. 9. 3. 5. 5. 5. 7. 8. 3.
 6. 6. 5. 7. 5. 7. 2. 6. 2. 7. 7. 7. 3. 8. 6. 7. 7. 7. 7. 5. 7. 7. 1. 7.
 7. 6. 7. 5. 5. 7. 7. 4. 7. 7. 7. 7. 6. 6. 7. 7. 5. 7. 7. 8. 8. 7. 5. 7.
 2. 7. 7. 7. 7. 7. 7. 7.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [146]:
gb.score(X_test, y_test)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [90]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

ValueError: could not convert string to float: 'zero production'

In [None]:
# filename = '101_knn.pkl'
# pickle.dump(knn, open(filename, 'wb'))

In [None]:
# kn = pickle.load

In [None]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# filename = '101_lr.pkl'
# pickle.dump(lr, open(filename, 'wb'))

In [None]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train, y_train)

In [None]:
# filename = '101_gb.pkl'
# pickle.dump(gb, open(filename, 'wb'))

In [None]:
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30)
dt.fit(X_train, y_train)

In [None]:
# filename = '101_dt.pkl'
# pickle.dump(dt, open(filename, 'wb'))

In [None]:
bg = BaggingClassifier(random_state=25565)
bg.fit(X_train, y_train)

In [None]:
# filename = '101_bg.pkl'
# pickle.dump(bg, open(filename, 'wb'))

In [None]:
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train);

In [None]:
# filename = '101_rf.pkl'
# pickle.dump(rf, open(filename, 'wb'))

Let's show our results

In [None]:
knn.score(X_test, y_test)

In [None]:
lr.score(X_test, y_test)

In [None]:
gb.score(X_test, y_test)

In [None]:
dt.score(X_test, y_test)

In [None]:
bg.score(X_test, y_test)

In [None]:
rf.score(X_test, y_test)

In [None]:
from sklearn.utils.testing import all_estimators

In [None]:
rf.predict_proba(X_test)[0]

In [None]:
rf.predict(X_test)[0:5]

In [None]:
l_props = lr.predict_proba(X_test)[1]

In [None]:
zipp = list(zip(lr.classes_, l_props))

In [None]:
#want a dictionary that gives probability for each class, 
#prediction, and ground truth
def display_preds_truth(model, obs, X_test, y_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
#     display['prediction'] = model.predict(X_test)[obs]
    display['ground truth'] = y_test[obs]
    return display
        
    
    

In [None]:
#want a dictionary that gives probability for each class,
def display_probas(model, obs, X_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
    display['prediction'] = model.predict(X_test)[obs]
    return display

In [None]:
display_probas(lr, 1, X_test)