# Classification models featuring tilt, azimuth, lat, long, module count, inverter count, and other features in lowercase.

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn import tree
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-paper')
import pickle
from datetime import datetime



In [2]:
df101_a = pd.read_csv('data/eda101_a.csv')

In [3]:
# df101_a.head()

In [4]:
df101_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 15 columns):
ticket_id                         710 non-null int64
asset_id                          710 non-null int64
root_cause                        710 non-null object
ticket_creation_reason            710 non-null object
latitude                          710 non-null float64
longitude                         710 non-null float64
tilt                              710 non-null float64
azimuth                           710 non-null float64
inverter_count                    710 non-null int64
module_count                      710 non-null int64
ticket_origin                     710 non-null object
service_partner                   710 non-null object
date_ticket_initially_assigned    710 non-null object
installed_by                      710 non-null object
installation_date                 710 non-null object
dtypes: float64(4), int64(4), object(7)
memory usage: 83.3+ KB


That didn't work, lets grab all the column names containing 'date' and convert them to datetime.

In [5]:
dates = [col for col in df101_a.columns if 'date' in col]
print(dates)

['date_ticket_initially_assigned', 'installation_date']


In [6]:
from pandas import to_datetime

In [7]:
df101_a[dates] = df101_a[dates].apply(pd.to_datetime, format='%Y-%m-%d')

In [8]:
df101_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 15 columns):
ticket_id                         710 non-null int64
asset_id                          710 non-null int64
root_cause                        710 non-null object
ticket_creation_reason            710 non-null object
latitude                          710 non-null float64
longitude                         710 non-null float64
tilt                              710 non-null float64
azimuth                           710 non-null float64
inverter_count                    710 non-null int64
module_count                      710 non-null int64
ticket_origin                     710 non-null object
service_partner                   710 non-null object
date_ticket_initially_assigned    710 non-null datetime64[ns]
installed_by                      710 non-null object
installation_date                 710 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(4), object(5)
memory usa

Let's convert be certain our ticket_id and asset_id are objects for encoding.

In [9]:
df101_a[['ticket_id', 'asset_id']] = df101_a[['ticket_id', 'asset_id']].astype('object')
df101_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 710 entries, 0 to 709
Data columns (total 15 columns):
ticket_id                         710 non-null object
asset_id                          710 non-null object
root_cause                        710 non-null object
ticket_creation_reason            710 non-null object
latitude                          710 non-null float64
longitude                         710 non-null float64
tilt                              710 non-null float64
azimuth                           710 non-null float64
inverter_count                    710 non-null int64
module_count                      710 non-null int64
ticket_origin                     710 non-null object
service_partner                   710 non-null object
date_ticket_initially_assigned    710 non-null datetime64[ns]
installed_by                      710 non-null object
installation_date                 710 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(2), object(7)
memory u

In [10]:
df_origin_omnidian_customer = df101_a.loc[df101_a.ticket_origin =='origin_omnidian_customer']

In [70]:
df_origin_omnidian_customer.root_cause.value_counts()

root_cause_normal_wear_and_tear                    170
root_cause_major_component_failure_warranty         54
root_cause_non-service_support                      19
root_cause_homeowner                                10
root_cause_installer_workmanship                    10
root_cause_environmental                             5
root_cause_roof_issue                                4
root_cause_major_component_failure_non-warranty      4
root_cause_design/sale_issue                         2
Name: root_cause, dtype: int64

In [72]:
df_origin_homeowner = df101_a.loc[df101_a.ticket_origin =='origin_homeowner']

In [73]:
df_origin_homeowner.root_cause.value_counts()

root_cause_normal_wear_and_tear                    72
root_cause_major_component_failure_warranty        19
root_cause_non-service_support                     13
root_cause_roof_issue                              11
root_cause_homeowner                                9
root_cause_installer_workmanship                    6
root_cause_service_workmanship                      3
root_cause_major_component_failure_non-warranty     2
root_cause_environmental                            2
Name: root_cause, dtype: int64

## Train-test-split for cross-validation

In [58]:
X = df101_a.drop('root_cause', axis=1).copy()
y = df101_a['root_cause']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42, test_size=0.2)

## Let's try ColumnTransformer of categoricals for vectorization

In [62]:
ColumnTransformer?

[0;31mInit signature:[0m
[0mColumnTransformer[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtransformers[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mremainder[0m[0;34m=[0m[0;34m'drop'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msparse_threshold[0m[0;34m=[0m[0;36m0.3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtransformer_weights[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Applies transformers to columns of an array or pandas DataFrame.

This estimator allows different columns or column subsets of the input
to be transformed separately and the features generated by each transformer
will be concatenated to form a single feature space.
This is useful for heterogeneous or columnar data, to combine several
feature extraction mech

In [59]:
categoricals = X.select_dtypes(include=['object']).columns

In [44]:
categoricals

Index(['ticket_id', 'asset_id', 'ticket_creation_reason', 'ticket_origin',
       'service_partner', 'installed_by'],
      dtype='object')

In [65]:
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [66]:
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categoricals)])

In [49]:
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_enc = encoder.fit(X_train[categoricals])
X_train_enc = pd.DataFrame(encoder.transform(X_train).toarray(),
                         columns=encoder.get_feature_names())
X_test_enc = pd.DataFrame(encoder.transform(X_test).toarray(),
                        columns=encoder.get_feature_names())

  mask &= (ar1 != a)
  mask |= (ar1 == a)


ValueError: could not convert string to float: 'origin_homeowner'