# In which we create vectors of all categorical features before splitting for testing, and used 'days ago' instead of datetimes so we can run more models. The results were not great because we do not have a one-to-one relationship between ticket_id and asset_id.

#### Factors from SQL query: 'ticket_id', 'asset_id', 'root_cause', 'ticket_creation_reason',
####      'latitude', 'longitude', 'tilt', 'azimuth', 'ticket_origin',
#### 'service_partner', 'ticket_assigned_days_ago', 'ticket_closed_days_ago',
####       'installed_by', 'installed_days_ago'
#### Models compared: 'Logistic Regression', 'Nearest Neighbors', 'RBF SVM',
####         'Decision Tree', 'Random Forest', 'Neural Net', 'Bagging', 'AdaBoost',
####       'Gradient Boost'

In [1]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier
from scipy import stats
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import tree
import numpy as np
import pandas as pd
from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')
import pickle



Get Data

In [2]:
df101_e = pd.read_csv('data/eda101_d.csv')

In [3]:
df101_e.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
ticket_id                   825 non-null int64
asset_id                    825 non-null int64
root_cause                  825 non-null object
ticket_creation_reason      825 non-null object
latitude                    825 non-null float64
longitude                   825 non-null float64
tilt                        825 non-null float64
azimuth                     825 non-null float64
ticket_origin               825 non-null object
service_partner             825 non-null object
ticket_assigned_days_ago    825 non-null int64
ticket_closed_days_ago      825 non-null int64
installed_by                825 non-null object
installed_days_ago          825 non-null int64
dtypes: float64(4), int64(5), object(5)
memory usage: 90.3+ KB


In [4]:
df101_e.columns

Index(['ticket_id', 'asset_id', 'root_cause', 'ticket_creation_reason',
       'latitude', 'longitude', 'tilt', 'azimuth', 'ticket_origin',
       'service_partner', 'ticket_assigned_days_ago', 'ticket_closed_days_ago',
       'installed_by', 'installed_days_ago'],
      dtype='object')

Both ticket_id and asset_id need to be strings

In [5]:
df101_e[['ticket_id', 'asset_id']] = df101_e[['ticket_id', 'asset_id']].astype(object)
df101_e.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
ticket_id                   825 non-null object
asset_id                    825 non-null object
root_cause                  825 non-null object
ticket_creation_reason      825 non-null object
latitude                    825 non-null float64
longitude                   825 non-null float64
tilt                        825 non-null float64
azimuth                     825 non-null float64
ticket_origin               825 non-null object
service_partner             825 non-null object
ticket_assigned_days_ago    825 non-null int64
ticket_closed_days_ago      825 non-null int64
installed_by                825 non-null object
installed_days_ago          825 non-null int64
dtypes: float64(4), int64(3), object(7)
memory usage: 90.3+ KB


In [6]:
df101_e.ticket_id.nunique()

476

# Convert everthing to numbers for our machine to read.

## We want to use ticket_id to look things up later and so will not encode it.

In [7]:
df_sans_ticket = df101_e.drop(['ticket_id', 'root_cause'], axis=1).copy()
df_sans_ticket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 12 columns):
asset_id                    825 non-null object
ticket_creation_reason      825 non-null object
latitude                    825 non-null float64
longitude                   825 non-null float64
tilt                        825 non-null float64
azimuth                     825 non-null float64
ticket_origin               825 non-null object
service_partner             825 non-null object
ticket_assigned_days_ago    825 non-null int64
ticket_closed_days_ago      825 non-null int64
installed_by                825 non-null object
installed_days_ago          825 non-null int64
dtypes: float64(4), int64(3), object(5)
memory usage: 77.4+ KB


In [8]:
# List our categorical features
categoricals = list(df_sans_ticket.columns[(df_sans_ticket.dtypes.values == np.dtype('object'))])
categoricals

['asset_id',
 'ticket_creation_reason',
 'ticket_origin',
 'service_partner',
 'installed_by']

In [9]:
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
encoder.fit(df_sans_ticket[categoricals])

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [10]:
enc_cat = pd.DataFrame(encoder.transform(df_sans_ticket[categoricals]).toarray(),
                         columns=encoder.get_feature_names())
enc_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Columns: 422 entries, x0_101111473 to x4_williams lifetime builders inc. dba lifetime solar
dtypes: float64(422)
memory usage: 2.7 MB


In [11]:
# enc_cat.insert(loc=0, column='ticket_id', value=df101_e.ticket_id)
# enc_cat.head()

In [12]:
# grab the other features
df_other = df101_e.drop(categoricals, axis=1).copy()
df_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 9 columns):
ticket_id                   825 non-null object
root_cause                  825 non-null object
latitude                    825 non-null float64
longitude                   825 non-null float64
tilt                        825 non-null float64
azimuth                     825 non-null float64
ticket_assigned_days_ago    825 non-null int64
ticket_closed_days_ago      825 non-null int64
installed_days_ago          825 non-null int64
dtypes: float64(4), int64(3), object(2)
memory usage: 58.1+ KB


In [13]:
# put it back together
df_enc = df_other.join(enc_cat)
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Columns: 431 entries, ticket_id to x4_williams lifetime builders inc. dba lifetime solar
dtypes: float64(426), int64(3), object(2)
memory usage: 2.7+ MB


In [14]:
df_enc.isna().any().sum()

0

## Train-test-split.

In [16]:
X_enc = df_enc.drop(['root_cause'], axis=1).copy()
y_enc = df_enc['root_cause']
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_enc, y_enc, random_state=42,
                                                                    test_size=0.2)

In [17]:
# X_train_enc.head()

In [18]:
X_train_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660 entries, 239 to 102
Columns: 430 entries, ticket_id to x4_williams lifetime builders inc. dba lifetime solar
dtypes: float64(426), int64(3), object(1)
memory usage: 2.2+ MB


In [19]:
y_train_enc.shape

(660,)

In [20]:
# X_test_enc.head()

In [21]:
X_test_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165 entries, 611 to 527
Columns: 430 entries, ticket_id to x4_williams lifetime builders inc. dba lifetime solar
dtypes: float64(426), int64(3), object(1)
memory usage: 555.6+ KB


In [22]:
y_test_enc.shape

(165,)

# **********************************

### Modeling

In [62]:
random_state=42

In [76]:
names = ['Logistic Regression', 'Nearest Neighbors', 'RBF SVM',
         'Decision Tree', 'Random Forest', 'Neural Net', 'Bagging', 'AdaBoost',
         'Gradient Boost']

In [77]:
classifiers = [
    LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000),
    KNeighborsClassifier(),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    BaggingClassifier(random_state=42),
    AdaBoostClassifier(),
    GradientBoostingClassifier(random_state=42, min_samples_leaf=30),
    ]

In [104]:
lr = LogisticRegression(random_state=42, solver='lbfgs',  multi_class='multinomial', max_iter=10000)
lr.fit(X_train_enc, y_train_enc)
lr.score(X_test_enc, y_test_enc)



0.6

In [105]:
lr = LogisticRegression(random_state=42, solver='sag',
                        multi_class='multinomial', max_iter=10000)
lr.fit(X_train_enc, y_train_enc)
lr.score(X_test_enc, y_test_enc)

0.5878787878787879

In [106]:
lr = LogisticRegression(random_state=42, solver='saga',
                        multi_class='multinomial', max_iter=10000)
lr.fit(X_train_enc, y_train_enc)
lr.score(X_test_enc, y_test_enc)

0.5878787878787879

In [79]:
kn = KNeighborsClassifier()
kn.fit(X_train_enc, y_train_enc)
kn.score(X_test_enc, y_test_enc)

0.5818181818181818

In [67]:
svc_g = SVC(gamma=2, C=1)
svc_g.fit(X_train_enc, y_train_enc)
svc_g.score(X_test_enc, y_test_enc)

0.5818181818181818

In [80]:
dt = DecisionTreeClassifier(min_samples_leaf=30)
dt.fit(X_train_enc, y_train_enc)
dt.score(X_test_enc, y_test_enc)

0.6060606060606061

In [81]:
dt = DecisionTreeClassifier(min_samples_leaf=100)
dt.fit(X_train_enc, y_train_enc)
dt.score(X_test_enc, y_test_enc)

0.5818181818181818

In [82]:
rf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
rf.fit(X_train_enc, y_train_enc)
rf.score(X_test_enc, y_test_enc)

0.5818181818181818

In [84]:
rf = RandomForestClassifier(n_estimators=100,criterion='entropy')
rf.fit(X_train_enc, y_train_enc)
rf.score(X_test_enc, y_test_enc)

0.8484848484848485

In [85]:
rf = RandomForestClassifier(n_estimators=100,criterion='gini')
rf.fit(X_train_enc, y_train_enc)
rf.score(X_test_enc, y_test_enc)

0.8363636363636363

In [86]:
rf = RandomForestClassifier(n_estimators=200,criterion='entropy')
rf.fit(X_train_enc, y_train_enc)
rf.score(X_test_enc, y_test_enc)

0.8424242424242424

In [89]:
mlp = MLPClassifier(alpha=1, max_iter=1000)
mlp.fit(X_train_enc, y_train_enc)
mlp.score(X_test_enc, y_test_enc)

0.10909090909090909

In [102]:
mlp = MLPClassifier(max_iter=100)
mlp.fit(X_train_enc, y_train_enc)
mlp.score(X_test_enc, y_test_enc)

0.5636363636363636

In [91]:
mlp = MLPClassifier(max_iter=100)
mlp.fit(X_train_enc, y_train_enc)
mlp.score(X_test_enc, y_test_enc)

0.5818181818181818

In [92]:
bag = BaggingClassifier(random_state=42)
bag.fit(X_train_enc, y_train_enc)
bag.score(X_test_enc, y_test_enc)

0.8242424242424242

In [93]:
bag = BaggingClassifier(random_state=42, bootstrap_features=True)
bag.fit(X_train_enc, y_train_enc)
bag.score(X_test_enc, y_test_enc)

0.8303030303030303

In [94]:
bag = BaggingClassifier(random_state=42, bootstrap_features=True, warm_start=True)
bag.fit(X_train_enc, y_train_enc)
bag.score(X_test_enc, y_test_enc)

0.8303030303030303

In [97]:
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train_enc, y_train_enc)
ada.score(X_test_enc, y_test_enc)

0.5818181818181818

In [96]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train_enc, y_train_enc)
gb.score(X_test_enc, y_test_enc)

0.7696969696969697