# Use ColumnTransformer instead of OneHotEncoder

### Based on features from Omnidian database 101, we compare K-Nearest Neighbors, Gradient Boosting, Decision Tree, Random Forest, Bagging, and Logistic Regression.

In [1]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import tree
import numpy as np
import pandas as pd
from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
# Newer matplotlib releases ship the seaborn styles under a 'seaborn-v0_8-' prefix;
# use the original name when it is still listed and the renamed one otherwise.
plt.style.use('seaborn-paper' if 'seaborn-paper' in plt.style.available else 'seaborn-v0_8-paper')



In [2]:
import pickle

Get Data

In [4]:
# Load the ticket table, parsing the three date columns as datetimes on read.
df101_b = pd.read_csv(
    'data/eda101.csv',
    parse_dates=[
        'Date_Ticket_Initially_Assigned',
        'Date_Ticket_Closed',
        'installation_date',
    ],
)

In [5]:
# df101_b.head()

In [6]:
# Summarise column names, non-null counts and dtypes of the freshly loaded frame.
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
Ticket_Id                         825 non-null int64
Asset_Id                          825 non-null int64
Root_Cause                        825 non-null object
Ticket_Creation_Reason            825 non-null object
asset_type                        825 non-null object
latitude                          825 non-null float64
longitude                         825 non-null float64
tilt                              825 non-null float64
azimuth                           825 non-null float64
Ticket_Origin                     825 non-null object
Date_Ticket_Initially_Assigned    825 non-null datetime64[ns]
Date_Ticket_Closed                825 non-null datetime64[ns]
installed_by                      825 non-null object
installation_date                 825 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), int64(2), object(5)
memory usage: 90.3+ KB


Let's change everything to lowercase to reduce confusion.

In [7]:
# Lowercase every column label; rebinding the name avoids an in-place mutation.
df101_b = df101_b.rename(columns=str.lower)

In [8]:
# Lowercase every string cell and leave all other values untouched.
# The check has to be isinstance(s, str): the type of a string value is `str`,
# never `object`, so comparing type(s) against `object` lowercased nothing.
df101_b = df101_b.applymap(lambda s: s.lower() if isinstance(s, str) else s)


In [9]:
# df101_b.head()

Both ticket_id and asset_id need to be strings

In [10]:
# The two id columns are identifiers rather than quantities, so cast them to strings,
# then re-check the dtypes.
df101_b = df101_b.astype({'ticket_id': str, 'asset_id': str})
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
ticket_id                         825 non-null object
asset_id                          825 non-null object
root_cause                        825 non-null object
ticket_creation_reason            825 non-null object
asset_type                        825 non-null object
latitude                          825 non-null float64
longitude                         825 non-null float64
tilt                              825 non-null float64
azimuth                           825 non-null float64
ticket_origin                     825 non-null object
date_ticket_initially_assigned    825 non-null datetime64[ns]
date_ticket_closed                825 non-null datetime64[ns]
installed_by                      825 non-null object
installation_date                 825 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), object(7)
memory usage: 90.3+ KB


Now drop duplicates

In [11]:
# Remove fully duplicated rows; the surviving rows keep their original index labels.
df101_b = df101_b.drop_duplicates()

In [12]:
# Re-check the row count and dtypes after dropping duplicates.
df101_b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638 entries, 0 to 823
Data columns (total 14 columns):
ticket_id                         638 non-null object
asset_id                          638 non-null object
root_cause                        638 non-null object
ticket_creation_reason            638 non-null object
asset_type                        638 non-null object
latitude                          638 non-null float64
longitude                         638 non-null float64
tilt                              638 non-null float64
azimuth                           638 non-null float64
ticket_origin                     638 non-null object
date_ticket_initially_assigned    638 non-null datetime64[ns]
date_ticket_closed                638 non-null datetime64[ns]
installed_by                      638 non-null object
installation_date                 638 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(4), object(7)
memory usage: 74.8+ KB


In [13]:
# Per column: True if the column contains at least one missing value.
df101_b.isnull().any()

ticket_id                         False
asset_id                          False
root_cause                        False
ticket_creation_reason            False
asset_type                        False
latitude                          False
longitude                         False
tilt                              False
azimuth                           False
ticket_origin                     False
date_ticket_initially_assigned    False
date_ticket_closed                False
installed_by                      False
installation_date                 False
dtype: bool

Assign target to Root_Cause and Train-Test-Split. The ticket_id stays in the features for now; we pull it off right after the split so we can use it later to look rows up.

In [14]:
# Target is root_cause; every other column is a candidate feature.
y = df101_b['root_cause']
X = df101_b.drop(columns=['root_cause']).copy()

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Let's try ColumnTransformer. We'll pull ticket_id out of the dataframes to keep it from being encoded, then we'll put everything back together for our function later.

In [15]:
# Keep the training ticket ids aside (same index as X_train) before dropping the column.
train_ticket = X_train['ticket_id']

In [16]:
# Rebind instead of dropping in place: X_train comes out of train_test_split, and
# mutating it in place is what raises pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns=['ticket_id'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [None]:
# Keep the test ticket ids aside (same index as X_test) before dropping the column.
test_ticket = X_test['ticket_id']

In [None]:
# Rebind instead of dropping in place: X_test comes out of train_test_split, and
# mutating it in place raises pandas' SettingWithCopyWarning.
X_test = X_test.drop(columns=['ticket_id'])

In [None]:
# List our categorical features
categoricals = list(X_test.columns[(X_test.dtypes.values == np.dtype('object'))])
categoricals

In [None]:
# Every column that is not object-typed; note this also picks up the datetime columns.
X_nums = [
    name
    for name, dtype in zip(X_test.columns, X_test.dtypes)
    if dtype != np.dtype('object')
]
X_nums

In [None]:
# Independent copy of the non-categorical training columns.
X_train_num = X_train.loc[:, X_nums].copy()
# X_train_num.head()

In [None]:
# Independent copy of the non-categorical test columns.
X_test_num = X_test.loc[:, X_nums].copy()
# X_test_num.head()

In [None]:
# One-hot encode the categorical columns only. Categories that were not seen
# during fitting are ignored at transform time (handle_unknown='ignore').
preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), categoricals),
)
# Learn the category vocabulary from the training rows only.
encoder = preprocessor.fit(X_train)

In [None]:
# Build the encoded frames on the same index as the frames they were encoded from.
# Without index=..., DataFrame() assigns a fresh 0..n-1 index, and the index-based
# join with X_train_num / X_test_num further down would pair up unrelated rows
# and leave NaNs where labels do not match.
X_train_enc = pd.DataFrame(encoder.transform(X_train).toarray(),
                           columns=encoder.get_feature_names(),
                           index=X_train.index)
X_test_enc = pd.DataFrame(encoder.transform(X_test).toarray(),
                          columns=encoder.get_feature_names(),
                          index=X_test.index)

In [None]:
# X_train_enc.head()

In [None]:
# Inspect the size and dtypes of the one-hot encoded training frame.
X_train_enc.info()

Now we'll put the ticket_id, numerical columns, and encoded columns all together.

In [None]:
# Put ticket_id back as the first column; insert() modifies X_train_num in place,
# so running this cell a second time fails because the column already exists.
X_train_num.insert(loc=0, column='ticket_id', value=train_ticket)
# X_train_num.head()

In [None]:
# join() matches rows on the index, so this is only correct when X_train_enc
# carries the same index labels as X_train_num.
# NOTE(review): confirm the indexes agree before relying on X_train_mega.
X_train_mega = X_train_num.join(X_train_enc)
# X_train_mega.head()

In [None]:
# Put ticket_id back as the first column; insert() modifies X_test_num in place,
# so running this cell a second time fails because the column already exists.
X_test_num.insert(loc=0, column='ticket_id', value=test_ticket)
# X_test_num.head()

In [None]:
# join() matches rows on the index, so this is only correct when X_test_enc
# carries the same index labels as X_test_num.
# NOTE(review): confirm the indexes agree before relying on X_test_mega.
X_test_mega = X_test_num.join(X_test_enc)
# X_test_mega.head()

In [None]:
# Number of columns in X_test_mega that contain at least one NaN (0 means the join left no gaps).
X_test_mega.isna().any().sum()

### Modeling

In [None]:
# Gradient boosting on the one-hot encoded features; leaves must hold at least 30 samples.
gb = GradientBoostingClassifier(
    min_samples_leaf=30,
    random_state=42,
)
gb.fit(X_train_enc, y_train)

In [None]:
# Accuracy of the gradient-boosting model on the encoded test features.
gb.score(X_test_enc, y_test)

In [None]:
# Fit on the one-hot encoded features, as the first gradient-boosting cell does:
# the raw X_train still holds string and datetime columns the estimator cannot consume.
knn = KNeighborsClassifier()
knn.fit(X_train_enc, y_train)

In [None]:
# filename = '101_knn.pkl'
# pickle.dump(knn, open(filename, 'wb'))

In [None]:
# kn = pickle.load

In [None]:
# Multinomial logistic regression; max_iter is raised to 1000 for the lbfgs solver.
# Fit on the one-hot encoded features, as the first gradient-boosting cell does:
# the raw X_train still holds string and datetime columns the estimator cannot consume.
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train_enc, y_train)

In [None]:
# filename = '101_lr.pkl'
# pickle.dump(lr, open(filename, 'wb'))

In [None]:
# Fit on the one-hot encoded features, as the earlier gradient-boosting cell does:
# the raw X_train still holds string and datetime columns the estimator cannot consume.
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train_enc, y_train)

In [None]:
# filename = '101_gb.pkl'
# pickle.dump(gb, open(filename, 'wb'))

In [None]:
# Fit on the one-hot encoded features, as the first gradient-boosting cell does:
# the raw X_train still holds string and datetime columns the estimator cannot consume.
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30)
dt.fit(X_train_enc, y_train)

In [None]:
# filename = '101_dt.pkl'
# pickle.dump(dt, open(filename, 'wb'))

In [None]:
# Fit on the one-hot encoded features, as the first gradient-boosting cell does:
# the raw X_train still holds string and datetime columns the estimator cannot consume.
bg = BaggingClassifier(random_state=25565)
bg.fit(X_train_enc, y_train)

In [None]:
# filename = '101_bg.pkl'
# pickle.dump(bg, open(filename, 'wb'))

In [None]:
# Fit on the one-hot encoded features, as the first gradient-boosting cell does:
# the raw X_train still holds string and datetime columns the estimator cannot consume.
# The trailing semicolon suppresses the estimator repr in the cell output.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train_enc, y_train);

In [None]:
# filename = '101_rf.pkl'
# pickle.dump(rf, open(filename, 'wb'))

Let's show our results

In [None]:
# Score on the encoded test features; the raw X_test still contains string and datetime columns.
knn.score(X_test_enc, y_test)

In [None]:
# Score on the encoded test features; the raw X_test still contains string and datetime columns.
lr.score(X_test_enc, y_test)

In [None]:
# Score on the encoded test features; the raw X_test still contains string and datetime columns.
gb.score(X_test_enc, y_test)

In [None]:
# Score on the encoded test features; the raw X_test still contains string and datetime columns.
dt.score(X_test_enc, y_test)

In [None]:
# Score on the encoded test features; the raw X_test still contains string and datetime columns.
bg.score(X_test_enc, y_test)

In [None]:
# Score on the encoded test features; the raw X_test still contains string and datetime columns.
rf.score(X_test_enc, y_test)

In [None]:
# all_estimators is exposed from sklearn.utils in current scikit-learn releases;
# the older sklearn.utils.testing location is kept as a fallback for old installs.
try:
    from sklearn.utils import all_estimators
except ImportError:
    from sklearn.utils.testing import all_estimators

In [None]:
# Class probabilities for the first test row, computed from the encoded features
# (the raw X_test still contains string and datetime columns).
rf.predict_proba(X_test_enc)[0]

In [None]:
# Predicted labels for the first five test rows, computed from the encoded features
# (the raw X_test still contains string and datetime columns).
rf.predict(X_test_enc)[0:5]

In [None]:
# Logistic-regression class probabilities for the second test row (position 1),
# computed from the encoded features (the raw X_test still contains string and datetime columns).
l_props = lr.predict_proba(X_test_enc)[1]

In [None]:
# Pair each class label known to the logistic regression with its probability from l_props.
zipp = list(zip(lr.classes_, l_props))

In [None]:
def display_preds_truth(model, obs, X_test, y_test):
    """Return the class probabilities and the true label for one test observation.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba`` and ``classes_``.
    obs : int
        Positional (0-based) row number of the observation within ``X_test``.
    X_test : feature matrix
        Passed unchanged to ``model.predict_proba``.
    y_test : sequence of labels
        True labels in the same row order as ``X_test``.

    Returns
    -------
    dict
        One ``{class: probability}`` entry per class in ``model.classes_`` plus a
        ``'ground truth'`` entry holding the true label of row ``obs``. The
        predicted label is not included; ``display_probas`` reports it.
    """
    probs = model.predict_proba(X_test)[obs]
    summary = dict(zip(model.classes_, probs))
    # predict_proba rows are positional, so the label has to be fetched by position
    # as well. Plain y_test[obs] on a pandas Series treats obs as an index *label*,
    # which returns the wrong row (or raises KeyError) once the index has been
    # shuffled by a train/test split.
    if hasattr(y_test, 'iloc'):
        summary['ground truth'] = y_test.iloc[obs]
    else:
        summary['ground truth'] = y_test[obs]
    return summary
        
    
    

In [None]:
def display_probas(model, obs, X_test):
    """Return the class probabilities and the predicted label for one observation.

    ``obs`` indexes into the outputs of ``model.predict_proba(X_test)`` and
    ``model.predict(X_test)``. The result maps every entry of ``model.classes_``
    to its probability and adds a ``'prediction'`` key with the predicted label.
    """
    row_probs = model.predict_proba(X_test)[obs]
    summary = {label: prob for label, prob in zip(model.classes_, row_probs)}
    summary['prediction'] = model.predict(X_test)[obs]
    return summary

In [None]:
# Show the logistic-regression probabilities for the second test row (position 1),
# using the encoded features (the raw X_test still contains string and datetime columns).
display_probas(lr, 1, X_test_enc)