# In which we used 'days ago' instead of datetimes so we can run more models.

### Based on features from Omnidian database 101, we compare K-Nearest Neighbors, Gradient Boosting, Random Forest, Bagging, and Logistic Regression.

In [66]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.linear_model import RidgeClassifier
from scipy import stats
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import tree
import numpy as np
import pandas as pd
from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')
import pickle

Get Data

In [67]:
df101_d = pd.read_csv('data/eda101_d.csv')

In [68]:
df101_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
ticket_id                   825 non-null int64
asset_id                    825 non-null int64
root_cause                  825 non-null object
ticket_creation_reason      825 non-null object
latitude                    825 non-null float64
longitude                   825 non-null float64
tilt                        825 non-null float64
azimuth                     825 non-null float64
ticket_origin               825 non-null object
service_partner             825 non-null object
ticket_assigned_days_ago    825 non-null int64
ticket_closed_days_ago      825 non-null int64
installed_by                825 non-null object
installed_days_ago          825 non-null int64
dtypes: float64(4), int64(5), object(5)
memory usage: 90.3+ KB


Both ticket_id and asset_id need to be strings

In [69]:
df101_d[['ticket_id', 'asset_id']] = df101_d[['ticket_id', 'asset_id']].astype(object)
df101_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 825 entries, 0 to 824
Data columns (total 14 columns):
ticket_id                   825 non-null object
asset_id                    825 non-null object
root_cause                  825 non-null object
ticket_creation_reason      825 non-null object
latitude                    825 non-null float64
longitude                   825 non-null float64
tilt                        825 non-null float64
azimuth                     825 non-null float64
ticket_origin               825 non-null object
service_partner             825 non-null object
ticket_assigned_days_ago    825 non-null int64
ticket_closed_days_ago      825 non-null int64
installed_by                825 non-null object
installed_days_ago          825 non-null int64
dtypes: float64(4), int64(3), object(7)
memory usage: 90.3+ KB


Assign target to Root_Cause and Train-Test-Split. We'll also take the ticket_id off now so we can use it later to look rows up.

In [70]:
X = df101_d.drop(['root_cause'], axis=1).copy()
y = df101_d['root_cause'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## Let's try some featureunionstuff

In [71]:
class PandasFeatureUnion(FeatureUnion):
    def fit_transform(self, X, y=None, **fit_params):
        self._validate_transformers()
        result = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_transform_one)(trans, weight, X, y,
                                        **fit_params)
            for name, trans, weight in self._iter())

        if not result:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        Xs, transformers = zip(*result)
        self._update_transformer_list(transformers)
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

    def merge_dataframes_by_column(self, Xs):
        return pd.concat(Xs, axis="columns", copy=False)

    def transform(self, X):
        Xs = Parallel(n_jobs=self.n_jobs)(
            delayed(_transform_one)(trans, weight, X)
            for name, trans, weight in self._iter())
        if not Xs:
            # All transformers are None
            return np.zeros((X.shape[0], 0))
        if any(sparse.issparse(f) for f in Xs):
            Xs = sparse.hstack(Xs).tocsr()
        else:
            Xs = self.merge_dataframes_by_column(Xs)
        return Xs

In [74]:
pipeline = PandasFeatureUnion([
    ("C", make_pipeline(
        PandasTransform(OneHotEncoder(handle_unknown='ignore'), categoricals)
    ))]
    

pipeline.fit_transform(X_test)

SyntaxError: invalid syntax (<ipython-input-74-dc24f6e36e1b>, line 7)

Let's try ColumnTransformer. We'll pull ticket_id from the dataframes to keep them from being encoded, then we'll put them back together for our function later.

In [40]:
train_ticket = X_train.ticket_id.copy()

In [47]:
X_train_sans_ticket = X_train.drop(['ticket_id'], axis=1).copy()

In [42]:
test_ticket = X_test.ticket_id.copy()

In [75]:
X_test_sans_ticket = X_test.drop(['ticket_id'], axis=1)

In [51]:
# List our categorical features
categoricals = list(X_train.columns[(X_train.dtypes.values == np.dtype('object'))])
categoricals.remove('ticket_id')
categoricals

['asset_id',
 'ticket_creation_reason',
 'ticket_origin',
 'service_partner',
 'installed_by']

In [52]:
X_nums = list(df101_d.columns[(df101_d.dtypes.values != np.dtype('object'))])
X_nums

['latitude',
 'longitude',
 'tilt',
 'azimuth',
 'ticket_assigned_days_ago',
 'ticket_closed_days_ago',
 'installed_days_ago']

In [53]:
X_train_num = X_train[X_nums].copy()
X_train_num.head()

Unnamed: 0,latitude,longitude,tilt,azimuth,ticket_assigned_days_ago,ticket_closed_days_ago,installed_days_ago
239,34.038597,-118.493806,40.0,225.0,310,229,1929
689,39.923479,-74.75141,30.0,160.0,331,331,2407
644,38.25163,-122.149367,23.0,152.0,531,486,1734
342,34.448895,-119.260475,9.0,180.0,529,485,1965
299,34.138321,-117.55659,23.0,180.0,439,379,1607


In [54]:
X_train_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660 entries, 239 to 102
Data columns (total 7 columns):
latitude                    660 non-null float64
longitude                   660 non-null float64
tilt                        660 non-null float64
azimuth                     660 non-null float64
ticket_assigned_days_ago    660 non-null int64
ticket_closed_days_ago      660 non-null int64
installed_days_ago          660 non-null int64
dtypes: float64(4), int64(3)
memory usage: 41.2 KB


In [55]:
X_test_num = X_test[X_nums].copy()
X_test_num.head()

Unnamed: 0,latitude,longitude,tilt,azimuth,ticket_assigned_days_ago,ticket_closed_days_ago,installed_days_ago
611,37.818595,-121.93428,23.0,170.0,390,324,2704
174,33.840872,-111.771766,34.0,232.0,702,325,1529
67,40.768241,-74.510319,18.0,224.0,479,395,2113
168,33.866658,-118.387667,23.0,55.0,377,346,1335
275,34.314705,-118.432859,23.0,132.0,461,387,1420


In [56]:
X_test_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165 entries, 611 to 527
Data columns (total 7 columns):
latitude                    165 non-null float64
longitude                   165 non-null float64
tilt                        165 non-null float64
azimuth                     165 non-null float64
ticket_assigned_days_ago    165 non-null int64
ticket_closed_days_ago      165 non-null int64
installed_days_ago          165 non-null int64
dtypes: float64(4), int64(3)
memory usage: 10.3 KB


In [57]:
y_test.shape

(165,)

In [58]:
preprocessor = make_column_transformer( (OneHotEncoder(handle_unknown='ignore'), categoricals))
encoder = preprocessor.fit(X_train_sans_ticket)

In [59]:
X_train_enc = pd.DataFrame(encoder.transform(X_train).toarray(),
                         columns=encoder.get_feature_names())
X_test_enc = pd.DataFrame(encoder.transform(X_test).toarray(),
                        columns=encoder.get_feature_names())

In [60]:
X_train_enc.head()

Unnamed: 0,onehotencoder__x0_101111473,onehotencoder__x0_101111478,onehotencoder__x0_101111489,onehotencoder__x0_101111511,onehotencoder__x0_101111521,onehotencoder__x0_101111528,onehotencoder__x0_101111533,onehotencoder__x0_101111565,onehotencoder__x0_101111567,onehotencoder__x0_101111573,...,onehotencoder__x4_solar alliance of america inc.,onehotencoder__x4_solar energy world nj,onehotencoder__x4_solar plus llc,onehotencoder__x4_sonic solar energy,onehotencoder__x4_summerwindsolar llc phoenix,onehotencoder__x4_summit technology group,onehotencoder__x4_sunstarter solar installations inc,onehotencoder__x4_syntrol plumbing heating and air,onehotencoder__x4_talbott solar home,onehotencoder__x4_williams lifetime builders inc. dba lifetime solar
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
X_train_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Columns: 389 entries, onehotencoder__x0_101111473 to onehotencoder__x4_williams lifetime builders inc. dba lifetime solar
dtypes: float64(389)
memory usage: 2.0 MB


In [62]:
train_ticket.shape

(660,)

Now we'll put the ticket_id, numerical columns, and encoded columns all together.

In [63]:
X_train_num.insert(loc=0, column='ticket_id', value=train_ticket)
X_train_num.head()

Unnamed: 0,ticket_id,latitude,longitude,tilt,azimuth,ticket_assigned_days_ago,ticket_closed_days_ago,installed_days_ago
239,20902,34.038597,-118.493806,40.0,225.0,310,229,1929
689,19046,39.923479,-74.75141,30.0,160.0,331,331,2407
644,8381,38.25163,-122.149367,23.0,152.0,531,486,1734
342,8324,34.448895,-119.260475,9.0,180.0,529,485,1965
299,13319,34.138321,-117.55659,23.0,180.0,439,379,1607


In [64]:
X_train_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660 entries, 239 to 102
Data columns (total 8 columns):
ticket_id                   660 non-null object
latitude                    660 non-null float64
longitude                   660 non-null float64
tilt                        660 non-null float64
azimuth                     660 non-null float64
ticket_assigned_days_ago    660 non-null int64
ticket_closed_days_ago      660 non-null int64
installed_days_ago          660 non-null int64
dtypes: float64(4), int64(3), object(1)
memory usage: 46.4+ KB


In [65]:
X_train_mega = pd.merge(X_train_num, X_train_enc, left_index=True, right_index=False)

MergeError: Must pass right_on or right_index=True

In [None]:
X_train_mega.info()

In [None]:
X_train_mega = pd.concat([X_train_num, X_train_enc], axis=1, join_axes=[X_train_enc.index])
X_train_mega.info()

In [None]:
X_train_mega.isna().any().sum()

In [None]:
X_train_mega.info()

In [None]:
y_train.shape

In [None]:
X_test_num.insert(loc=0, column='ticket_id', value=test_ticket)
X_test_num.head()

In [None]:
X_test_mega = pd.concat([X_test_num, X_test_enc], axis=1, join_axes=[X_test_num.index])
X_test_mega.info()

In [None]:
X_test_mega.isna().any().sum()

### Modeling

In [None]:
random_state=42

In [None]:
svclassifier = SVC(kernel='linear')  
svclassifier.fit(X_train_mega, y_train)

In [None]:
def run_eval_model(Classifier, X_train, y_train, X_test, y_test):
    Classifier.fit(X_train, y_train)
    return Classifier.score(X_test, y_test)

In [None]:
run_eval_model(BernoulliNB(alpha=.01), X_train_mega, y_train, X_test_mega, y_test)

In [None]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train_mega, y_train)

In [None]:
gb.score(X_test_enc, y_test)

Training our models.

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
# filename = '101_knn.pkl'
# pickle.dump(knn, open(filename, 'wb'))

In [None]:
# kn = pickle.load

In [None]:
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# filename = '101_lr.pkl'
# pickle.dump(lr, open(filename, 'wb'))

In [None]:
gb = GradientBoostingClassifier(random_state=42, min_samples_leaf=30)
gb.fit(X_train, y_train)

In [None]:
# filename = '101_gb.pkl'
# pickle.dump(gb, open(filename, 'wb'))

In [None]:
dt = DecisionTreeClassifier(random_state=42, min_samples_leaf=30)
dt.fit(X_train, y_train)

In [None]:
# filename = '101_dt.pkl'
# pickle.dump(dt, open(filename, 'wb'))

In [None]:
bg = BaggingClassifier(random_state=25565)
bg.fit(X_train, y_train)

In [None]:
# filename = '101_bg.pkl'
# pickle.dump(bg, open(filename, 'wb'))

In [None]:
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train);

In [None]:
# filename = '101_rf.pkl'
# pickle.dump(rf, open(filename, 'wb'))

Let's show our results

In [None]:
knn.score(X_test, y_test)

In [None]:
lr.score(X_test, y_test)

In [None]:
gb.score(X_test, y_test)

In [None]:
dt.score(X_test, y_test)

In [None]:
bg.score(X_test, y_test)

In [None]:
rf.score(X_test, y_test)

What may be some other classifiers?

In [None]:
from sklearn.utils.testing import all_estimators

In [None]:
rf.predict_proba(X_test)[0]

In [None]:
rf.predict(X_test)[0:5]

In [None]:
l_props = lr.predict_proba(X_test)[1]

In [None]:
zipp = list(zip(lr.classes_, l_props))

In [None]:
#want a dictionary that gives probability for each class, 
#prediction, and ground truth
def display_preds_truth(model, obs, X_test, y_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
#     display['prediction'] = model.predict(X_test)[obs]
    display['ground truth'] = y_test[obs]
    return display
        
    
    

In [None]:
#want a dictionary that gives probability for each class,
def display_probas(model, obs, X_test):
    probs = model.predict_proba(X_test)[obs]
    classes = model.classes_
    display = dict(zip(classes, probs))
    display['prediction'] = model.predict(X_test)[obs]
    return display

In [None]:
display_probas(lr, 1, X_test)

In [None]:
X_test

In [None]:
y_test