# In which we create vectors of all categorical features before splitting for testing, and used 'days ago' instead of datetimes so we can run more models. The results were not great because we do not have a one-to-one relationship between ticket_id and asset_id.

#### Factors from SQL query: 'ticket_id', 'asset_id', 'root_cause', 'ticket_creation_reason',
####      'latitude', 'azimuth', 'ticket_origin',
#### 'service_partner', 'ticket_assigned_days_ago', 'ticket_closed_days_ago',
####       'installed_by', 'installed_days_ago'
#### Models compared: 'Logistic Regression', 'Nearest Neighbors', 'RBF SVM',
####         'Decision Tree', 'Random Forest', 'Neural Net', 'Bagging', 'AdaBoost',
####       'Gradient Boost'

In [69]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from catboost import Pool, CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, log_loss, f1_score, auc
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import tree
import numpy as np
import pandas as pd
from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')
import pickle

Get Data

In [2]:
df_all_shards_b = pd.read_csv('../data/all_shards_b.csv')

In [3]:
df_all_shards_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5247 entries, 0 to 5246
Data columns (total 11 columns):
ticket_id                         3633 non-null float64
asset_id                          5247 non-null object
root_cause                        3633 non-null object
ticket_creation_reason            3616 non-null object
ticket_origin                     3632 non-null object
service_partner                   3633 non-null object
date_ticket_initially_assigned    3628 non-null object
latitude                          3633 non-null float64
longitude                         3633 non-null float64
installed_by                      3615 non-null object
installation_date                 1871 non-null object
dtypes: float64(3), object(8)
memory usage: 451.0+ KB


In [4]:
# df_all_shards_b.head()

Both ticket_id and asset_id need to be strings

In [5]:
df_all_shards_b[['ticket_id', 'asset_id']] = df_all_shards_b[['ticket_id', 'asset_id']].astype(object)
df_all_shards_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5247 entries, 0 to 5246
Data columns (total 11 columns):
ticket_id                         3633 non-null object
asset_id                          5247 non-null object
root_cause                        3633 non-null object
ticket_creation_reason            3616 non-null object
ticket_origin                     3632 non-null object
service_partner                   3633 non-null object
date_ticket_initially_assigned    3628 non-null object
latitude                          3633 non-null float64
longitude                         3633 non-null float64
installed_by                      3615 non-null object
installation_date                 1871 non-null object
dtypes: float64(2), object(9)
memory usage: 451.0+ KB


In [6]:
df_all_shards_b.ticket_id.nunique()

3591

## Deal with duplicates and nulls. A lot of this data came from the time before Omnidian. It's missing information we consider relevant, so we'll drop those columns.

In [7]:
df_all_shards_b.drop_duplicates(inplace=True)
df_all_shards_b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4193 entries, 0 to 5246
Data columns (total 11 columns):
ticket_id                         3591 non-null object
asset_id                          4193 non-null object
root_cause                        3591 non-null object
ticket_creation_reason            3574 non-null object
ticket_origin                     3590 non-null object
service_partner                   3591 non-null object
date_ticket_initially_assigned    3586 non-null object
latitude                          3591 non-null float64
longitude                         3591 non-null float64
installed_by                      3573 non-null object
installation_date                 1829 non-null object
dtypes: float64(2), object(9)
memory usage: 393.1+ KB


In [8]:
df_all_shards_b.isnull().any()

ticket_id                          True
asset_id                          False
root_cause                         True
ticket_creation_reason             True
ticket_origin                      True
service_partner                    True
date_ticket_initially_assigned     True
latitude                           True
longitude                          True
installed_by                       True
installation_date                  True
dtype: bool

In [9]:
df_all_shards_b.dropna(axis=0, how='any', inplace=True)
df_all_shards_b.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1792 entries, 0 to 5085
Data columns (total 11 columns):
ticket_id                         1792 non-null object
asset_id                          1792 non-null object
root_cause                        1792 non-null object
ticket_creation_reason            1792 non-null object
ticket_origin                     1792 non-null object
service_partner                   1792 non-null object
date_ticket_initially_assigned    1792 non-null object
latitude                          1792 non-null float64
longitude                         1792 non-null float64
installed_by                      1792 non-null object
installation_date                 1792 non-null object
dtypes: float64(2), object(9)
memory usage: 168.0+ KB


# Convert everthing to numbers for our machine to read.

### We want to use ticket_id to look things up later and will not encode it.

In [10]:
df_sans_ticket = df_all_shards_b.drop(['ticket_id', 'root_cause'], axis=1).copy()
df_sans_ticket.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1792 entries, 0 to 5085
Data columns (total 9 columns):
asset_id                          1792 non-null object
ticket_creation_reason            1792 non-null object
ticket_origin                     1792 non-null object
service_partner                   1792 non-null object
date_ticket_initially_assigned    1792 non-null object
latitude                          1792 non-null float64
longitude                         1792 non-null float64
installed_by                      1792 non-null object
installation_date                 1792 non-null object
dtypes: float64(2), object(7)
memory usage: 140.0+ KB


In [11]:
# List our categorical features
categoricals = list(df_sans_ticket.columns[(df_sans_ticket.dtypes.values == np.dtype('object'))])
categoricals

['asset_id',
 'ticket_creation_reason',
 'ticket_origin',
 'service_partner',
 'date_ticket_initially_assigned',
 'installed_by',
 'installation_date']

In [12]:
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
encoder.fit(df_sans_ticket[categoricals])

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [13]:
enc_cat = pd.DataFrame(encoder.transform(df_sans_ticket[categoricals]).toarray(),
                         columns=encoder.get_feature_names())
enc_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1792 entries, 0 to 1791
Columns: 3962 entries, x0_101111473 to x6_2016-09-06 00:00:00
dtypes: float64(3962)
memory usage: 54.2 MB


In [14]:
# enc_cat.insert(loc=0, column='ticket_id', value=df101_e.ticket_id)
enc_cat.head()

Unnamed: 0,x0_101111473,x0_101111478,x0_101111485,x0_101111489,x0_101111497,x0_101111499,x0_101111511,x0_101111513,x0_101111521,x0_101111528,...,x6_2016-04-19 00:00:00,x6_2016-04-27 00:00:00,x6_2016-05-11 00:00:00,x6_2016-05-18 00:00:00,x6_2016-06-29 00:00:00,x6_2016-07-01 00:00:00,x6_2016-07-09 00:00:00,x6_2016-08-03 00:00:00,x6_2016-08-09 00:00:00,x6_2016-09-06 00:00:00
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# grab the other features
df_other = df_all_shards_b.drop(categoricals, axis=1).copy()
df_other.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1792 entries, 0 to 5085
Data columns (total 4 columns):
ticket_id     1792 non-null object
root_cause    1792 non-null object
latitude      1792 non-null float64
longitude     1792 non-null float64
dtypes: float64(2), object(2)
memory usage: 70.0+ KB


In [16]:
# put it back together
df_enc = df_other.join(enc_cat)
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1792 entries, 0 to 5085
Columns: 3966 entries, ticket_id to x6_2016-09-06 00:00:00
dtypes: float64(3964), object(2)
memory usage: 54.3+ MB


In [17]:
df_enc.isna().any().sum()

3962

In [18]:
df_enc.isnull().any()

ticket_id                 False
root_cause                False
latitude                  False
longitude                 False
x0_101111473               True
x0_101111478               True
x0_101111485               True
x0_101111489               True
x0_101111497               True
x0_101111499               True
x0_101111511               True
x0_101111513               True
x0_101111521               True
x0_101111528               True
x0_101111533               True
x0_101111565               True
x0_101111567               True
x0_101111573               True
x0_101111576               True
x0_101111589               True
x0_101111606               True
x0_101111614               True
x0_101111621               True
x0_101111625               True
x0_101111629               True
x0_101111640               True
x0_101111652               True
x0_101111662               True
x0_101111673               True
x0_101111679               True
                          ...  
x6_2015-

### It looks like the 0.0 from our binary encoding turned into nulls. We'll change them back.

In [19]:
df_enc.fillna(0.0, inplace=True)

In [20]:
df_enc.isna().any().sum()

0

## Train-test-split.

In [21]:
X_enc = df_enc.drop(['root_cause'], axis=1).copy()
y_enc = df_enc['root_cause']
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_enc, y_enc, random_state=42,
                                                                    test_size=0.2)

In [22]:
# X_train_enc.head()

In [23]:
X_train_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1433 entries, 1172 to 1187
Columns: 3965 entries, ticket_id to x6_2016-09-06 00:00:00
dtypes: float64(3965)
memory usage: 43.4 MB


In [24]:
y_train_enc.shape

(1433,)

In [25]:
# X_test_enc.head()

In [26]:
X_test_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 359 entries, 685 to 1380
Columns: 3965 entries, ticket_id to x6_2016-09-06 00:00:00
dtypes: float64(3965)
memory usage: 10.9 MB


In [27]:
y_test_enc.shape

(359,)

### Modeling

In [28]:
random_state=42

In [29]:
names = ['Logistic Regression', 'Nearest Neighbors', 'RBF SVM',
         'Decision Tree', 'Random Forest', 'Neural Net', 'Bagging', 'AdaBoost',
         'Gradient Boost']

In [65]:
lr = LogisticRegression(random_state=42, solver='lbfgs',  multi_class='multinomial', max_iter=1000)
lr.fit(X_train_enc, y_train_enc)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:

# Let's pickle this model
filename = '../pickled_models/shards_lr.pkl'
pickle.dump(lr, open(filename, 'wb'))

In [32]:
lr.score(X_test_enc, y_test_enc)

0.4735376044568245

In [33]:
f1_score()

TypeError: f1_score() missing 2 required positional arguments: 'y_true' and 'y_pred'

In [34]:
y_pred_proba1 = lr.predict_proba(X_test_enc)

In [35]:
y_pred_proba1.shape

(359, 10)

In [66]:
#try weighing less common classes. This is not great right now because the test data may not have all classes.
lr_w = LogisticRegression(random_state=42, class_weight='balanced', solver='lbfgs',
                        multi_class='multinomial', max_iter=200)
lr_w.fit(X_train_enc, y_train_enc)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=200, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=42, solver='lbfgs', tol=0.0001,
                   verbose=0, warm_start=False)

In [68]:
lr_w.score(X_test_enc, y_test_enc)

0.05013927576601671

In [37]:
# Let's pickle this model
filename = '../pickled_models/shards_lr_w.pkl'
pickle.dump(lr_w, open(filename, 'wb'))

In [None]:
# lr.score(X_test_enc, y_test_enc)

In [42]:
knn = KNeighborsClassifier()
knn.fit(X_train_enc, y_train_enc)
knn.score(X_test_enc, y_test_enc)

0.4206128133704735

In [43]:
# Let's pickle this model
filename = '../pickled_models/shards_knn.pkl'
pickle.dump(knn, open(filename, 'wb'))

In [46]:
svc_g = SVC(gamma=2, C=1)
svc_g.fit(X_train_enc, y_train_enc)
svc_g.score(X_test_enc, y_test_enc)

0.4735376044568245

In [47]:
# Let's pickle this model
filename = '../pickled_models/shards_svc_g.pkl'
pickle.dump(svc_g, open(filename, 'wb'))

In [48]:
dt = DecisionTreeClassifier(min_samples_leaf=30)
dt.fit(X_train_enc, y_train_enc)
dt.score(X_test_enc, y_test_enc)

0.4456824512534819

In [49]:
# Let's pickle this model
filename = '../pickled_models/shards_dt.pkl'
pickle.dump(dt, open(filename, 'wb'))

In [50]:
rf_entropy = RandomForestClassifier(n_estimators=100,criterion='entropy')
rf_entropy.fit(X_train_enc, y_train_enc)
rf_entropy.score(X_test_enc, y_test_enc)

0.4623955431754875

In [51]:
# Let's pickle this model
filename = '../pickled_models/shards_rf_entropy.pkl'
pickle.dump(rf_entropy, open(filename, 'wb'))

In [52]:
rf_gini = RandomForestClassifier(n_estimators=100,criterion='gini')
rf_gini.fit(X_train_enc, y_train_enc)
rf_gini.score(X_test_enc, y_test_enc)

0.46518105849582175

In [53]:
# Let's pickle this model
filename = '../pickled_models/shards_rf_gini.pkl'
pickle.dump(rf_gini, open(filename, 'wb'))

In [54]:
mlp = MLPClassifier(max_iter=100)
mlp.fit(X_train_enc, y_train_enc)
mlp.score(X_test_enc, y_test_enc)

0.14484679665738162

In [55]:
# Let's pickle this model
filename = '../pickled_models/mlp.pkl'
pickle.dump(mlp, open(filename, 'wb'))

In [56]:
bag = BaggingClassifier(random_state=42, bootstrap_features=True)
bag.fit(X_train_enc, y_train_enc)
bag.score(X_test_enc, y_test_enc)

0.4735376044568245

In [57]:
# Let's pickle this model
filename = '../pickled_models/shards_bag.pkl'
pickle.dump(bag, open(filename, 'wb'))

In [58]:
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train_enc, y_train_enc)
ada.score(X_test_enc, y_test_enc)

0.47075208913649025

In [59]:
# Let's pickle this model
filename = '../pickled_models/shards_ada.pkl'
pickle.dump(ada, open(filename, 'wb'))

In [60]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train_enc, y_train_enc)
gb.score(X_test_enc, y_test_enc)

0.4568245125348189

In [61]:
# Let's pickle this model
filename = '../pickled_models/shards_gb.pkl'
pickle.dump(gb, open(filename, 'wb'))

In [62]:
cat = CatBoostClassifier(iterations=10,
                           learning_rate=1,
                           depth=2,
                           loss_function='MultiClass')
cat.fit(X_train_enc, y_train_enc)

0:	learn: -2.1063086	total: 65.1ms	remaining: 585ms
1:	learn: -1.9087747	total: 70.2ms	remaining: 281ms
2:	learn: -1.7167418	total: 74.5ms	remaining: 174ms
3:	learn: -1.6997964	total: 78.4ms	remaining: 118ms
4:	learn: -1.6646526	total: 82.5ms	remaining: 82.5ms
5:	learn: -1.6623159	total: 86.6ms	remaining: 57.8ms
6:	learn: -1.6543571	total: 90.8ms	remaining: 38.9ms
7:	learn: -1.6198531	total: 95.1ms	remaining: 23.8ms
8:	learn: -1.6173163	total: 99.3ms	remaining: 11ms
9:	learn: -1.6024950	total: 104ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1343d50b8>

In [64]:
cat.best_score_

{'learn': {'MultiClass': -1.602495019767117}}

In [63]:
filename = '../pickled_models/shards_cat.pkl'
pickle.dump(cat, open(filename, 'wb'))