# In which we drop the dimensionally mismatched features to improve the linear algebra for our models.

#### Factors from SQL query:'root_cause'(target variable), 'ticket_id', 'ticket_creation_reason',
####      'latitude', 'longitude', 'ticket_origin','service_partner','installed_by'

#### Models compared: 'Logistic Regression', 'Nearest Neighbors', 'RBF SVM',
####         'Decision Tree', 'Random Forest', 'Neural Net', 'Bagging', 'AdaBoost',
####       'Gradient Boost'

In [1]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from catboost import Pool, CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, log_loss, f1_score, auc
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import tree
import numpy as np
import pandas as pd
from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')
import pickle



Get Data

In [2]:
df_all_shards_c = pd.read_csv('../data/all_shards_c.csv')

In [3]:
df_all_shards_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3633 entries, 0 to 3632
Data columns (total 8 columns):
ticket_id                 3633 non-null int64
root_cause                3633 non-null object
ticket_creation_reason    3616 non-null object
ticket_origin             3632 non-null object
service_partner           3633 non-null object
latitude                  3633 non-null float64
longitude                 3633 non-null float64
installed_by              3615 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 227.1+ KB


In [5]:
df_all_shards_c.head()

Unnamed: 0,ticket_id,root_cause,ticket_creation_reason,ticket_origin,service_partner,latitude,longitude,installed_by
0,23947,root_cause_normal_wear_and_tear,communication offline,origin_omnidian_customer,sunup sts service team,38.944642,-121.248833,williams lifetime builders inc. dba lifetime s...
1,27384,root_cause_normal_wear_and_tear,system inspection,origin_homeowner,indaspec solar service team,34.101697,-118.146646,green tech solutions inc.
2,22820,root_cause_normal_wear_and_tear,communication offline,origin_omnidian_customer,sunsystem technology,34.075427,-117.16714,horizon solar power
3,8568,root_cause_normal_wear_and_tear,communication offline,origin_omnidian_customer,sunsystem technology,33.691802,-112.28623,arizona solar solutions
4,7826,root_cause_normal_wear_and_tear,communication offline,origin_omnidian_customer,sunsystem technology,33.691802,-112.28623,arizona solar solutions


Both ticket_id needs to be a string.

In [6]:
df_all_shards_c[['ticket_id']] = df_all_shards_c[['ticket_id']].astype(object)
df_all_shards_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3633 entries, 0 to 3632
Data columns (total 8 columns):
ticket_id                 3633 non-null object
root_cause                3633 non-null object
ticket_creation_reason    3616 non-null object
ticket_origin             3632 non-null object
service_partner           3633 non-null object
latitude                  3633 non-null float64
longitude                 3633 non-null float64
installed_by              3615 non-null object
dtypes: float64(2), object(6)
memory usage: 227.1+ KB


In [7]:
df_all_shards_c.ticket_id.nunique()

3591

## Deal with duplicates and nulls. A lot of this data came from the time before Omnidian. It's missing information we consider relevant, so we'll drop those columns.

In [8]:
df_all_shards_c.drop_duplicates(subset=['ticket_id'], inplace=True)
df_all_shards_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3591 entries, 0 to 3632
Data columns (total 8 columns):
ticket_id                 3591 non-null object
root_cause                3591 non-null object
ticket_creation_reason    3574 non-null object
ticket_origin             3590 non-null object
service_partner           3591 non-null object
latitude                  3591 non-null float64
longitude                 3591 non-null float64
installed_by              3573 non-null object
dtypes: float64(2), object(6)
memory usage: 252.5+ KB


In [9]:
df_all_shards_c.isnull().any()

ticket_id                 False
root_cause                False
ticket_creation_reason     True
ticket_origin              True
service_partner           False
latitude                  False
longitude                 False
installed_by               True
dtype: bool

In [10]:
df_all_shards_c.dropna(axis=0, how='any', inplace=True)
df_all_shards_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3555 entries, 0 to 3617
Data columns (total 8 columns):
ticket_id                 3555 non-null object
root_cause                3555 non-null object
ticket_creation_reason    3555 non-null object
ticket_origin             3555 non-null object
service_partner           3555 non-null object
latitude                  3555 non-null float64
longitude                 3555 non-null float64
installed_by              3555 non-null object
dtypes: float64(2), object(6)
memory usage: 250.0+ KB


In [11]:
df_all_shards_c.isnull().any()

ticket_id                 False
root_cause                False
ticket_creation_reason    False
ticket_origin             False
service_partner           False
latitude                  False
longitude                 False
installed_by              False
dtype: bool

# Convert everthing to numbers for our machine to read.

### We want to use ticket_id to look things up later and will not encode it.

In [12]:
df_sans_ticket = df_all_shards_c.drop(['ticket_id', 'root_cause'], axis=1).copy()
df_sans_ticket.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3555 entries, 0 to 3617
Data columns (total 6 columns):
ticket_creation_reason    3555 non-null object
ticket_origin             3555 non-null object
service_partner           3555 non-null object
latitude                  3555 non-null float64
longitude                 3555 non-null float64
installed_by              3555 non-null object
dtypes: float64(2), object(4)
memory usage: 194.4+ KB


In [13]:
# List our categorical features
categoricals = list(df_sans_ticket.columns[(df_sans_ticket.dtypes.values == np.dtype('object'))])
categoricals

['ticket_creation_reason', 'ticket_origin', 'service_partner', 'installed_by']

In [14]:
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
encoder.fit(df_sans_ticket[categoricals])

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=True)

In [15]:
enc_cat = pd.DataFrame(encoder.transform(df_sans_ticket[categoricals]).toarray(),
                         columns=encoder.get_feature_names())
enc_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3555 entries, 0 to 3554
Columns: 310 entries, x0_communication offline to x3_williams lifetime builders inc. dba lifetime solar
dtypes: float64(310)
memory usage: 8.4 MB


In [16]:
# enc_cat.insert(loc=0, column='ticket_id', value=df101_e.ticket_id)
enc_cat.head()

Unnamed: 0,x0_communication offline,x0_duplicate ticket,x0_homeowner cannot access portal,x0_inquiry about service ticket,x0_install punchlist,x0_non-service inquiry,x0_performance guarantee review,x0_permanent system removal,x0_reinstallation,x0_roof leak,...,x3_titan solar power,x3_trinity,x3_united solar associates woburn,x3_unleash solar,x3_usa solar littleton,x3_valley unique electric inc fresno,x3_verde solar phoenix,x3_verengo inc.,x3_vivint solar,x3_williams lifetime builders inc. dba lifetime solar
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# grab the other features
df_other = df_all_shards_c.drop(categoricals, axis=1).copy()
df_other.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3555 entries, 0 to 3617
Data columns (total 4 columns):
ticket_id     3555 non-null object
root_cause    3555 non-null object
latitude      3555 non-null float64
longitude     3555 non-null float64
dtypes: float64(2), object(2)
memory usage: 138.9+ KB


In [18]:
# put it back together
df_enc = df_other.join(enc_cat)
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3555 entries, 0 to 3617
Columns: 314 entries, ticket_id to x3_williams lifetime builders inc. dba lifetime solar
dtypes: float64(312), object(2)
memory usage: 8.7+ MB


In [19]:
df_enc.isna().any().sum()

310

In [20]:
df_enc.isnull().any()

ticket_id                                                False
root_cause                                               False
latitude                                                 False
longitude                                                False
x0_communication offline                                  True
x0_duplicate ticket                                       True
x0_homeowner cannot access portal                         True
x0_inquiry about service ticket                           True
x0_install punchlist                                      True
x0_non-service inquiry                                    True
x0_performance guarantee review                           True
x0_permanent system removal                               True
x0_reinstallation                                         True
x0_roof leak                                              True
x0_soiling                                                True
x0_system activation                                   

### It looks like the 0.0 from our binary encoding turned into nulls. We'll change them back.

In [21]:
df_enc.fillna(0.0, inplace=True)

In [22]:
df_enc.isna().any().sum()

0

## Train-test-split.

In [23]:
X_enc = df_enc.drop(['root_cause'], axis=1).copy()
y_enc = df_enc['root_cause']
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(X_enc, y_enc, random_state=42,
                                                                    test_size=0.2)

In [24]:
X_train_enc.head()

Unnamed: 0,ticket_id,latitude,longitude,x0_communication offline,x0_duplicate ticket,x0_homeowner cannot access portal,x0_inquiry about service ticket,x0_install punchlist,x0_non-service inquiry,x0_performance guarantee review,...,x3_titan solar power,x3_trinity,x3_united solar associates woburn,x3_unleash solar,x3_usa solar littleton,x3_valley unique electric inc fresno,x3_verde solar phoenix,x3_verengo inc.,x3_vivint solar,x3_williams lifetime builders inc. dba lifetime solar
2694,5499,39.831618,-105.175348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1931,13893,40.987406,-74.038181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1638,18106,40.397362,-104.769056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3288,29005,34.138824,-117.559302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
211,29048,32.804052,-116.915147,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
X_train_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2844 entries, 2694 to 3234
Columns: 313 entries, ticket_id to x3_williams lifetime builders inc. dba lifetime solar
dtypes: float64(312), int64(1)
memory usage: 6.8 MB


In [26]:
y_train_enc.shape

(2844,)

In [27]:
# X_test_enc.head()

In [28]:
X_test_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 711 entries, 322 to 2782
Columns: 313 entries, ticket_id to x3_williams lifetime builders inc. dba lifetime solar
dtypes: float64(312), int64(1)
memory usage: 1.7 MB


In [29]:
y_test_enc.shape

(711,)

### Modeling

In [None]:
random_state=42

In [None]:
names = ['Logistic Regression', 'Nearest Neighbors', 'RBF SVM',
         'Decision Tree', 'Random Forest', 'Neural Net', 'Bagging', 'AdaBoost',
         'Gradient Boost']

In [46]:
lr = LogisticRegression(random_state=42, solver='lbfgs',  multi_class='multinomial', max_iter=1000)
lr.fit(X_train_enc, y_train_enc)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
y_pred_lr_c = lr.predict(X_test_enc)

In [48]:
f1_score(y_pred_lr_c, y_test_enc, average='weighted')

  'recall', 'true', average, warn_for)


0.5471858215868292

In [49]:

# Let's pickle this model
filename = '../pickled_models/shards_lr_c.pkl'
pickle.dump(lr, open(filename, 'wb'))

In [50]:
lr.score(X_test_enc, y_test_enc)

0.40365682137834036

In [51]:
y_pred_lr_c = lr.predict_proba(X_test_enc)

In [52]:
y_pred_lr_c

array([[0.0060488 , 0.03998497, 0.06501293, ..., 0.41665351, 0.04736695,
        0.00234572],
       [0.00199947, 0.02239689, 0.10697537, ..., 0.43398293, 0.04334746,
        0.00196283],
       [0.00319949, 0.02847601, 0.08814591, ..., 0.42658186, 0.04635789,
        0.00227143],
       ...,
       [0.00191975, 0.02121884, 0.10770061, ..., 0.42442968, 0.04620042,
        0.00235381],
       [0.00646577, 0.04182714, 0.1118047 , ..., 0.36812394, 0.05039748,
        0.00347164],
       [0.01739114, 0.06586581, 0.18586086, ..., 0.24782242, 0.0601469 ,
        0.00965686]])

In [53]:
#try weighing less common classes. This is not great right now because the test data may not have all classes.
lr_w = LogisticRegression(random_state=42, class_weight='balanced', solver='sag',
                        multi_class='multinomial', max_iter=200)
lr_w.fit(X_train_enc, y_train_enc)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=200, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=42, solver='sag', tol=0.0001,
                   verbose=0, warm_start=False)

In [54]:
# Let's pickle this model
filename = '../pickled_models/shards_lr_w.pkl'
pickle.dump(lr_w, open(filename, 'wb'))

In [None]:
# lr.score(X_test_enc, y_test_enc)

In [55]:
knn = KNeighborsClassifier()
knn.fit(X_train_enc, y_train_enc)
knn.score(X_test_enc, y_test_enc)

0.33473980309423346

In [56]:
# Let's pickle this model
filename = '../pickled_models/shards_knn.pkl'
pickle.dump(knn, open(filename, 'wb'))

In [57]:
svc_g = SVC(gamma=2, C=1)
svc_g.fit(X_train_enc, y_train_enc)
svc_g.score(X_test_enc, y_test_enc)

0.38396624472573837

In [58]:
# Let's pickle this model
filename = '../pickled_models/shards_svc_g.pkl'
pickle.dump(svc_g, open(filename, 'wb'))

In [59]:
dt = DecisionTreeClassifier(min_samples_leaf=30)
dt.fit(X_train_enc, y_train_enc)
dt.score(X_test_enc, y_test_enc)

0.3909985935302391

In [60]:
# Let's pickle this model
filename = '../pickled_models/shards_dt.pkl'
pickle.dump(dt, open(filename, 'wb'))

In [61]:
rf_entropy = RandomForestClassifier(n_estimators=100,criterion='entropy')
rf_entropy.fit(X_train_enc, y_train_enc)
rf_entropy.score(X_test_enc, y_test_enc)

0.38255977496483823

In [62]:
# Let's pickle this model
filename = '../pickled_models/shards_rf_entropy.pkl'
pickle.dump(rf_entropy, open(filename, 'wb'))

In [63]:
rf_gini = RandomForestClassifier(n_estimators=100,criterion='gini')
rf_gini.fit(X_train_enc, y_train_enc)
rf_gini.score(X_test_enc, y_test_enc)

0.379746835443038

In [64]:
# Let's pickle this model
filename = '../pickled_models/shards_rf_gini.pkl'
pickle.dump(rf_gini, open(filename, 'wb'))

In [65]:
mlp = MLPClassifier(max_iter=100)
mlp.fit(X_train_enc, y_train_enc)
mlp.score(X_test_enc, y_test_enc)

0.12658227848101267

In [66]:
# Let's pickle this model
filename = '../pickled_models/mlp.pkl'
pickle.dump(mlp, open(filename, 'wb'))

In [67]:
bag = BaggingClassifier(random_state=42, bootstrap_features=True)
bag.fit(X_train_enc, y_train_enc)
bag.score(X_test_enc, y_test_enc)

0.37834036568213786

In [68]:
# Let's pickle this model
filename = '../pickled_models/shards_bag.pkl'
pickle.dump(bag, open(filename, 'wb'))

In [69]:
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train_enc, y_train_enc)
ada.score(X_test_enc, y_test_enc)

0.37834036568213786

In [70]:
# Let's pickle this model
filename = '../pickled_models/shards_ada.pkl'
pickle.dump(ada, open(filename, 'wb'))

In [71]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train_enc, y_train_enc)
gb.score(X_test_enc, y_test_enc)

0.41068917018284107

In [72]:
# Let's pickle this model
filename = '../pickled_models/shards_gb.pkl'
pickle.dump(gb, open(filename, 'wb'))

In [73]:
cat = CatBoostClassifier(iterations=10,
                           learning_rate=1,
                           depth=2,
                           loss_function='MultiClass')
cat.fit(X_train_enc, y_train_enc)

0:	learn: -2.0115588	total: 58ms	remaining: 522ms
1:	learn: -1.7897191	total: 63.3ms	remaining: 253ms
2:	learn: -1.7264644	total: 67.9ms	remaining: 158ms
3:	learn: -1.7105212	total: 72.3ms	remaining: 109ms
4:	learn: -1.6908659	total: 76.8ms	remaining: 76.8ms
5:	learn: -1.6817686	total: 80.6ms	remaining: 53.7ms
6:	learn: -1.6777376	total: 85.6ms	remaining: 36.7ms
7:	learn: -1.6671538	total: 90.2ms	remaining: 22.6ms
8:	learn: -1.6615399	total: 95.2ms	remaining: 10.6ms
9:	learn: -1.6394750	total: 99.5ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x13021d208>

In [74]:
cat.best_score_

{'learn': {'MultiClass': -1.6394749540763465}}

In [75]:
filename = '../pickled_models/shards_cat.pkl'
pickle.dump(cat, open(filename, 'wb'))