# Use ColumnTransformer instead of OneHotEncoder on data cleaned in eda101_a

### Based on features from Omnidian database 101, we compare K-Nearest Neighbors, Gradient Boosting, Random Forest, Bagging, and Logistic Regression.

In [29]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.linear_model import RidgeClassifier
from scipy import stats
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import tree
import numpy as np
import pandas as pd
from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')
import pickle

Get Data

In [2]:
df101_c = pd.read_csv('data/eda101_c.csv', parse_dates=['date_ticket_initially_assigned', 'date_ticket_closed', 'installation_date'])
df101_c.head()

Unnamed: 0,ticket_id,asset_id,root_cause,ticket_creation_reason,latitude,longitude,tilt,azimuth,ticket_origin,service_partner,date_ticket_initially_assigned,date_ticket_closed,installed_by,installation_date
0,23947,101112604,root_cause_normal_wear_and_tear,communication offline,38.944642,-121.248833,37.0,190.0,origin_omnidian_customer,sunup sts service team,2018-10-22,2018-11-29,williams lifetime builders inc. dba lifetime s...,2013-04-04
1,23947,101112604,root_cause_normal_wear_and_tear,communication offline,38.944642,-121.248833,37.0,190.0,origin_omnidian_customer,sunup sts service team,2018-10-22,2018-11-29,williams lifetime builders inc. dba lifetime s...,2013-04-04
2,27384,101113056,root_cause_normal_wear_and_tear,system inspection,34.101697,-118.146646,18.0,180.0,origin_homeowner,indaspec solar service team,2018-12-20,2019-02-09,green tech solutions inc.,2014-04-08
3,27384,101113056,root_cause_normal_wear_and_tear,system inspection,34.101697,-118.146646,18.0,180.0,origin_homeowner,indaspec solar service team,2018-12-20,2019-02-09,green tech solutions inc.,2014-04-08
4,27384,101113056,root_cause_normal_wear_and_tear,system inspection,34.101697,-118.146646,18.0,90.0,origin_homeowner,indaspec solar service team,2018-12-20,2019-02-09,green tech solutions inc.,2014-04-08


In [3]:
df101_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
ticket_id                         825 non-null int64
asset_id                          825 non-null int64
root_cause                        825 non-null object
ticket_creation_reason            825 non-null object
latitude                          825 non-null float64
longitude                         825 non-null float64
tilt                              825 non-null float64
azimuth                           825 non-null float64
ticket_origin                     825 non-null object
service_partner                   825 non-null object
date_ticket_initially_assigned    825 non-null datetime64[ns]
date_ticket_closed                825 non-null datetime64[ns]
installed_by                      825 non-null object
installation_date                 825 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), int64(2), object(5)
memory usage: 90.3+ KB


Both ticket_id and asset_id need to be strings

In [4]:
df101_c[['ticket_id', 'asset_id']] = df101_c[['ticket_id', 'asset_id']].astype(object)
df101_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
ticket_id                         825 non-null object
asset_id                          825 non-null object
root_cause                        825 non-null object
ticket_creation_reason            825 non-null object
latitude                          825 non-null float64
longitude                         825 non-null float64
tilt                              825 non-null float64
azimuth                           825 non-null float64
ticket_origin                     825 non-null object
service_partner                   825 non-null object
date_ticket_initially_assigned    825 non-null datetime64[ns]
date_ticket_closed                825 non-null datetime64[ns]
installed_by                      825 non-null object
installation_date                 825 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), object(7)
memory usage: 90.3+ KB


Assign target to Root_Cause and Train-Test-Split. We'll also take the ticket_id off now so we can use it later to look rows up.

In [5]:
X = df101_c.drop(['root_cause'], axis=1).copy()
y = df101_c['root_cause']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

Let's try ColumnTransformer. We'll pull ticket_id from the dataframes to keep them from being encoded, then we'll put them back together for our function later.

In [6]:
train_ticket = X_train.ticket_id

In [7]:
X_train.drop(['ticket_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [8]:
test_ticket = X_test.ticket_id

In [9]:
X_test.drop(['ticket_id'], axis=1, inplace=True)

In [10]:
# List our categorical features
categoricals = list(X_test.columns[(X_test.dtypes.values == np.dtype('object'))])
categoricals

['asset_id',
 'ticket_creation_reason',
 'ticket_origin',
 'service_partner',
 'installed_by']

In [11]:
X_nums = list(X_test.columns[(X_test.dtypes.values != np.dtype('object'))])
X_nums

['latitude',
 'longitude',
 'tilt',
 'azimuth',
 'date_ticket_initially_assigned',
 'date_ticket_closed',
 'installation_date']

In [12]:
X_train_num = X_train[X_nums].copy()
X_train_num.head()

Unnamed: 0,latitude,longitude,tilt,azimuth,date_ticket_initially_assigned,date_ticket_closed,installation_date
239,34.038597,-118.493806,40.0,225.0,2018-08-31,2018-11-20,2014-03-26
689,39.923479,-74.75141,30.0,160.0,2018-08-10,2018-08-10,2012-12-03
644,38.25163,-122.149367,23.0,152.0,2018-01-22,2018-03-08,2014-10-07
342,34.448895,-119.260475,9.0,180.0,2018-01-24,2018-03-09,2014-02-18
299,34.138321,-117.55659,23.0,180.0,2018-04-24,2018-06-23,2015-02-11


In [13]:
X_test_num = X_test[X_nums].copy()
X_test_num.head()

Unnamed: 0,latitude,longitude,tilt,azimuth,date_ticket_initially_assigned,date_ticket_closed,installation_date
611,37.818595,-121.93428,23.0,170.0,2018-06-12,2018-08-17,2012-02-10
174,33.840872,-111.771766,34.0,232.0,2017-08-04,2018-08-16,2015-04-30
67,40.768241,-74.510319,18.0,224.0,2018-03-15,2018-06-07,2013-09-23
168,33.866658,-118.387667,23.0,55.0,2018-06-25,2018-07-26,2015-11-10
275,34.314705,-118.432859,23.0,132.0,2018-04-02,2018-06-15,2015-08-17


In [14]:
preprocessor = make_column_transformer( (OneHotEncoder(handle_unknown='ignore'), categoricals))
encoder = preprocessor.fit(X_train)

In [15]:
X_train_enc = pd.DataFrame(encoder.transform(X_train).toarray(),
                         columns=encoder.get_feature_names())
X_test_enc = pd.DataFrame(encoder.transform(X_test).toarray(),
                        columns=encoder.get_feature_names())

In [16]:
X_train_enc.head()

Unnamed: 0,onehotencoder__x0_101111473,onehotencoder__x0_101111478,onehotencoder__x0_101111489,onehotencoder__x0_101111511,onehotencoder__x0_101111521,onehotencoder__x0_101111528,onehotencoder__x0_101111533,onehotencoder__x0_101111565,onehotencoder__x0_101111567,onehotencoder__x0_101111573,...,onehotencoder__x4_solar alliance of america inc.,onehotencoder__x4_solar energy world nj,onehotencoder__x4_solar plus llc,onehotencoder__x4_sonic solar energy,onehotencoder__x4_summerwindsolar llc phoenix,onehotencoder__x4_summit technology group,onehotencoder__x4_sunstarter solar installations inc,onehotencoder__x4_syntrol plumbing heating and air,onehotencoder__x4_talbott solar home,onehotencoder__x4_williams lifetime builders inc. dba lifetime solar
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
X_train_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Columns: 389 entries, onehotencoder__x0_101111473 to onehotencoder__x4_williams lifetime builders inc. dba lifetime solar
dtypes: float64(389)
memory usage: 2.0 MB


Now we'll put the ticket_id, numerical columns, and encoded columns all together.

In [18]:
X_train_num.insert(loc=0, column='ticket_id', value=train_ticket)
X_train_num.head()

Unnamed: 0,ticket_id,latitude,longitude,tilt,azimuth,date_ticket_initially_assigned,date_ticket_closed,installation_date
239,20902,34.038597,-118.493806,40.0,225.0,2018-08-31,2018-11-20,2014-03-26
689,19046,39.923479,-74.75141,30.0,160.0,2018-08-10,2018-08-10,2012-12-03
644,8381,38.25163,-122.149367,23.0,152.0,2018-01-22,2018-03-08,2014-10-07
342,8324,34.448895,-119.260475,9.0,180.0,2018-01-24,2018-03-09,2014-02-18
299,13319,34.138321,-117.55659,23.0,180.0,2018-04-24,2018-06-23,2015-02-11


In [19]:
X_train_mega = pd.concat([X_train_num, X_train_enc], axis=1, join='inner')
X_train_mega.head()

Unnamed: 0,ticket_id,latitude,longitude,tilt,azimuth,date_ticket_initially_assigned,date_ticket_closed,installation_date,onehotencoder__x0_101111473,onehotencoder__x0_101111478,...,onehotencoder__x4_solar alliance of america inc.,onehotencoder__x4_solar energy world nj,onehotencoder__x4_solar plus llc,onehotencoder__x4_sonic solar energy,onehotencoder__x4_summerwindsolar llc phoenix,onehotencoder__x4_summit technology group,onehotencoder__x4_sunstarter solar installations inc,onehotencoder__x4_syntrol plumbing heating and air,onehotencoder__x4_talbott solar home,onehotencoder__x4_williams lifetime builders inc. dba lifetime solar
239,20902,34.038597,-118.493806,40.0,225.0,2018-08-31,2018-11-20,2014-03-26,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
644,8381,38.25163,-122.149367,23.0,152.0,2018-01-22,2018-03-08,2014-10-07,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342,8324,34.448895,-119.260475,9.0,180.0,2018-01-24,2018-03-09,2014-02-18,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299,13319,34.138321,-117.55659,23.0,180.0,2018-04-24,2018-06-23,2015-02-11,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,11743,33.87713,-117.357747,18.0,180.0,2018-03-30,2018-06-01,2014-05-27,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
X_train_mega.isna().any().sum()

0

In [21]:
X_test_num.insert(loc=0, column='ticket_id', value=test_ticket)
X_test_num.head()

Unnamed: 0,ticket_id,latitude,longitude,tilt,azimuth,date_ticket_initially_assigned,date_ticket_closed,installation_date
611,15895,37.818595,-121.93428,23.0,170.0,2018-06-12,2018-08-17,2012-02-10
174,2591,33.840872,-111.771766,34.0,232.0,2017-08-04,2018-08-16,2015-04-30
67,11278,40.768241,-74.510319,18.0,224.0,2018-03-15,2018-06-07,2013-09-23
168,16584,33.866658,-118.387667,23.0,55.0,2018-06-25,2018-07-26,2015-11-10
275,11859,34.314705,-118.432859,23.0,132.0,2018-04-02,2018-06-15,2015-08-17


In [22]:
X_test_mega = pd.concat([X_test_num, X_test_enc], axis=1, join='inner')
X_test_mega.head()

Unnamed: 0,ticket_id,latitude,longitude,tilt,azimuth,date_ticket_initially_assigned,date_ticket_closed,installation_date,onehotencoder__x0_101111473,onehotencoder__x0_101111478,...,onehotencoder__x4_solar alliance of america inc.,onehotencoder__x4_solar energy world nj,onehotencoder__x4_solar plus llc,onehotencoder__x4_sonic solar energy,onehotencoder__x4_summerwindsolar llc phoenix,onehotencoder__x4_summit technology group,onehotencoder__x4_sunstarter solar installations inc,onehotencoder__x4_syntrol plumbing heating and air,onehotencoder__x4_talbott solar home,onehotencoder__x4_williams lifetime builders inc. dba lifetime solar
67,11278,40.768241,-74.510319,18.0,224.0,2018-03-15,2018-06-07,2013-09-23,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,29226,32.837775,-116.770146,30.0,180.0,2019-01-24,2019-03-13,2015-03-17,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,16499,34.242344,-118.47169,18.0,270.0,2018-06-25,2018-11-06,2015-06-26,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39,10747,33.774322,-118.127746,18.0,270.0,2018-03-09,2018-04-21,2015-02-17,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,29251,41.699119,-70.073714,23.0,185.0,2019-01-24,2019-02-23,2012-05-02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
X_test_mega.isna().any().sum()

0

### Modeling

In [26]:
random_state=42

In [27]:
def run_eval_model(Classifier, X_train, y_train, X_test, y_test):
    Classifier.fit(X_train, y_train)
    return Classifier.score(X_test, y_test)

In [28]:
run_eval_model(BernoulliNB(alpha=.01), X_train_mega, y_train, X_test_mega, y_test)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [31]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train_mega, y_train)

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [None]:
gb.score(X_test_enc, y_test)

Training our models.

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
# filename = '101_knn.pkl'
# pickle.dump(knn, open(filename, 'wb'))

In [None]:
# kn = pickle.load

In [None]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# filename = '101_lr.pkl'
# pickle.dump(lr, open(filename, 'wb'))

In [None]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train, y_train)

In [None]:
# filename = '101_gb.pkl'
# pickle.dump(gb, open(filename, 'wb'))

In [None]:
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30)
dt.fit(X_train, y_train)

In [None]:
# filename = '101_dt.pkl'
# pickle.dump(dt, open(filename, 'wb'))

In [None]:
bg = BaggingClassifier(random_state=25565)
bg.fit(X_train, y_train)

In [None]:
# filename = '101_bg.pkl'
# pickle.dump(bg, open(filename, 'wb'))

In [None]:
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train);

In [None]:
# filename = '101_rf.pkl'
# pickle.dump(rf, open(filename, 'wb'))

Let's show our results

In [None]:
knn.score(X_test, y_test)

In [None]:
lr.score(X_test, y_test)

In [None]:
gb.score(X_test, y_test)

In [None]:
dt.score(X_test, y_test)

In [None]:
bg.score(X_test, y_test)

In [None]:
rf.score(X_test, y_test)

What may be some other classifiers?

In [None]:
from sklearn.utils.testing import all_estimators

In [None]:
rf.predict_proba(X_test)[0]

In [None]:
rf.predict(X_test)[0:5]

In [None]:
l_props = lr.predict_proba(X_test)[1]

In [None]:
zipp = list(zip(lr.classes_, l_props))

In [None]:
#want a dictionary that gives probability for each class, 
#prediction, and ground truth
def display_preds_truth(model, obs, X_test, y_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
#     display['prediction'] = model.predict(X_test)[obs]
    display['ground truth'] = y_test[obs]
    return display
        
    
    

In [None]:
#want a dictionary that gives probability for each class,
def display_probas(model, obs, X_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
    display['prediction'] = model.predict(X_test)[obs]
    return display

In [None]:
display_probas(lr, 1, X_test)

In [None]:
X_test

In [None]:
y_test