## Supervised Classification
modeling
---
- Logistic Regression
- KNeighborsClassifier
- Decision Tree
- Random Forest
- SVC
---
- Preprocessing: Standard scaler
- Pipeline: Standard scaler - modeling/ Pipeline - GridSearch
- Database: Postgre SQL
- Metrics: Confusion matrix

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import psycopg2 as pg2

# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# from sklearn.naive_bayes import MultinomialNB, GaussianNB

from sklearn.metrics import mean_squared_error, confusion_matrix

### Fetch data from database

In [2]:
def get_cur():
    db_pwd = 'password'
    conn = pg2.connect(database='AQI_2019', user='postgres', password=db_pwd)
    return conn.cursor()

def close():
    cur.close()
    conn.close()

cur = get_cur()

In [3]:
# fetch data
cur.execute("""SELECT * FROM aqi
            """)
df = pd.DataFrame(cur.fetchall(), columns=['date', 'type', 'state', 'aqi', 'pt_id'])

cur.execute("""SELECT measuring_pt.lat, measuring_pt.long, aqi.state FROM measuring_pt
            LEFT JOIN aqi ON measuring_pt.pt_id = aqi.pt_id
            GROUP BY measuring_pt.lat, measuring_pt.long, aqi.state
            """)
df_dist = pd.DataFrame(cur.fetchall(), columns=['lat', 'long', 'state'])

In [4]:
df.head(3)

Unnamed: 0,date,type,state,aqi,pt_id
0,2019-01-01,PM25,Hawaii,24.0,0
1,2019-01-02,PM25,Hawaii,20.0,0
2,2019-01-03,PM25,Hawaii,21.0,0


In [5]:
df_dist.shape

(1482, 3)

In [6]:
df_dist.head(2)

Unnamed: 0,lat,long,state
0,19.117561,-155.778136,Hawaii
1,19.2039,-155.480183,Hawaii


### Data preparation for modeling

In [7]:
df['date'] = pd.to_datetime(df['date'])
df['date'] = pd.DatetimeIndex(df['date']).month.map(str) + pd.DatetimeIndex(df['date']).day.map(str)
df['date'] = pd.to_datetime(df['date'], format='%m%d').dt.strftime('%m%d')

In [8]:
df['date'] = df['date'].map(lambda x: int(x))

In [9]:
df.head(3)

Unnamed: 0,date,type,state,aqi,pt_id
0,101,PM25,Hawaii,24.0,0
1,102,PM25,Hawaii,20.0,0
2,103,PM25,Hawaii,21.0,0


##### Standard scaler

In [10]:
ss = StandardScaler()

[ X: date, AQI, States / y: Air types - Ozone or PM 2.5 ]

In [11]:
X = df.drop(columns=['type', 'pt_id'])
y = df['type']

# We use get_dummies because it makes exogenous(X) readable
X = pd.get_dummies(X, columns=['state'], drop_first=True)
X.head(2)

Unnamed: 0,date,aqi,state_Alaska,state_Arizona,state_Arkansas,state_California,state_Colorado,state_Connecticut,state_Delaware,state_District Of Columbia,...,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
0,101,24.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,102,20.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [13]:
ss.fit(X_train) # Always fit to X_train because of data leaking
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

[ X_sm: date, AQI, Air type / y_sm: 3 states(Texas, Washington, Illinois) ]

In [14]:
# Prepare data for Linear Regression
X_sm = df[(df['state'] == 'Texas') | (df['state'] == 'Washington') | (df['state'] == 'Illinois')].drop(columns=['state', 'pt_id'])
y_sm = df[(df['state'] == 'Texas') | (df['state'] == 'Washington') | (df['state'] == 'Illinois')]['state']
X_sm = pd.get_dummies(X_sm, columns=['type'], drop_first=True)
X_sm.shape, y_sm.shape

((56677, 3), (56677,))

In [15]:
X_sm_train, X_sm_test, y_sm_train, y_sm_test = train_test_split(X_sm, y_sm, test_size=0.33, random_state=42)

In [16]:
ss.fit(X_sm_train) # Always fit to X_train because of data leaking
X_sm_train_ss = ss.transform(X_sm_train)
X_sm_test_ss = ss.transform(X_sm_test)

[ Xd: latitude, longitude / yd: state ]

In [17]:
Xd = df_dist.drop(columns='state')
yd = df_dist['state']
Xd_train, Xd_test, yd_train, yd_test = train_test_split(Xd, yd, test_size=0.33, random_state=42)

In [18]:
ss.fit(Xd_train) # Always fit to X_train because of data leaking
ss.mean_, ss.scale_

(array([ 37.66786208, -95.29746607]), array([ 5.46612298, 17.53832282]))

In [19]:
Xd_train_ss = ss.transform(Xd_train)
Xd_test_ss = ss.transform(Xd_test)

### Logistic Regression
Predict state based on date, air type and AQI

In [20]:
lgr = LogisticRegression(solver='saga', random_state=42, C=1.5)
lgr.fit(X_sm_train, y_sm_train)



LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
lgr.intercept_, lgr.coef_

(array([-0.12584565,  0.01478667, -0.01764719]),
 array([[-9.39236697e-04, -3.86153436e-03,  1.99236475e-02],
        [ 1.75534263e-04,  8.11241694e-03, -2.58666549e-01],
        [-1.97807540e-04, -5.07969878e-02,  1.80770496e-01]]))

In [22]:
lgr.score(X_sm_train, y_sm_train), lgr.score(X_sm_test, y_sm_test)

(0.5860479814605114, 0.5892857142857143)

In [23]:
# fit with scaled data
lgr.fit(X_sm_train_ss, y_sm_train)



LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
lgr.score(X_sm_train_ss, y_sm_train), lgr.score(X_sm_test_ss, y_sm_test)

(0.5998209253943592, 0.6014221556886228)

Predict Air type (OZ or PM 2.5) based on date, AQI, state

In [25]:
lgr = LogisticRegression(solver='liblinear', random_state=42, C=10.0)
lgr.fit(X_train, y_train)

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
lgr.score(X_train, y_train), lgr.score(X_test, y_test)

(0.7082589355860733, 0.7086569415040167)

In [27]:
lgr.predict(X_test)

array(['OZ', 'OZ', 'OZ', ..., 'OZ', 'OZ', 'OZ'], dtype=object)

In [28]:
# fit with scaled data
lgr.fit(X_train_ss, y_train)

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
lgr.score(X_train_ss, y_train), lgr.score(X_test_ss, y_test)

(0.7097073704479292, 0.7101766190075693)

#### Confusion matrix

In [30]:
y_test.value_counts(normalize=True)

OZ      0.590731
PM25    0.409269
Name: type, dtype: float64

In [31]:
confusion_matrix(y_test, lgr.predict(X_test_ss))

array([[87286, 14559],
       [35408, 35152]], dtype=int64)

In [32]:
pred = lgr.predict(X_test_ss)
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

acc = (tp + tn) / (tn + fp + fn + tp) # Correctly predicted
print(f'Accuracy: {round(acc,4)}')

miss = 1 - acc
print(f'Missclassification: {round(miss,4)}')

sens = tp / (tp + fn) # Recall (Correctly predicted among all positives)
print(f'Sensitivity: {round(sens,4)}')

spec = tn / (tn + fp)  # Correctly predicted among all negatives
print(f'Specificity: {round(spec,4)}')

prec = tp / (tp + fp) # Correctly predicted among all positively predicted
print(f'Precision: {round(prec,4)}')

f1 = 2*(prec*sens)/(prec+sens) # metric needed to check for unbalanced data
print(f'F1 score: {round(f1,4)}')

Accuracy: 0.7102
Missclassification: 0.2898
Sensitivity: 0.4982
Specificity: 0.857
Precision: 0.7071
F1 score: 0.5845


### KNeighbors Classifier
Predict state based on latitude and longitude

In [33]:
knn = KNeighborsClassifier(n_neighbors=3)

In [34]:
knn.fit(Xd_train_ss, yd_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [35]:
knn.score(Xd_train_ss, yd_train), knn.score(Xd_test_ss, yd_test)

(0.9455645161290323, 0.8693877551020408)

In [36]:
confusion_matrix(yd_test, knn.predict(Xd_test_ss))

array([[ 5,  0,  0, ...,  0,  0,  0],
       [ 0,  4,  0, ...,  0,  0,  0],
       [ 0,  0, 23, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  1,  0,  0],
       [ 0,  0,  0, ...,  0,  8,  0],
       [ 0,  0,  0, ...,  0,  0,  4]], dtype=int64)

Modeling using Pipeline

In [37]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=3))
])
pipe.fit(Xd_train, yd_train)

Pipeline(memory=None,
         steps=[('ss',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=3, p=2,
                                      weights='uniform'))],
         verbose=False)

In [38]:
pipe.score(Xd_train, yd_train), pipe.score(Xd_test, yd_test)

(0.9455645161290323, 0.8693877551020408)

Pipeline and Gridsearch

In [39]:
pipe_params = {
    'ss__with_mean':[True, False],
    'knn__weights':['uniform', 'distance'],
    'knn__n_neighbors':[3, 5, 10]
}

In [40]:
pipe_gs = GridSearchCV(pipe, pipe_params, cv=5)

In [41]:
pipe_gs.fit(Xd_train, yd_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('ss',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('knn',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=3, p=2,
                                                             weights='uniform'))],
                                verbose=False),

In [42]:
pipe_gs.best_score_

0.8850806451612904

In [43]:
pipe_gs.best_estimator_

Pipeline(memory=None,
         steps=[('ss',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='distance'))],
         verbose=False)

In [44]:
pipe_gs.score(Xd_test, yd_test)

0.8795918367346939

### Decision tree
Predict state based on date, air type and AQI

In [45]:
gs_dt = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                     param_grid={
                         'max_depth':[30, 40, 50],
                         'min_samples_split':[2, 10, 20]
                     },
                     cv=5
                    )
gs_dt.fit(X_sm_train, y_sm_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=42,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [30, 40, 50],
                         'min_samples_split': [2, 10, 20]},
             pre_dispatch=

In [46]:
gs_dt.best_score_

0.662707713375293

In [47]:
gs_dt.best_params_

{'max_depth': 40, 'min_samples_split': 20}

### Random Forest
Predict state based on date, air type and AQI

In [48]:
gs_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                     param_grid={
                         'n_estimators': [150, 200],
                         'max_depth':[None, 30, 50]
                     },
                     cv=5
                    )
gs_rf.fit(X_sm_train, y_sm_train)
gs_rf.best_params_

{'max_depth': 30, 'n_estimators': 200}

In [49]:
gs_rf.best_score_

0.6397966976535959

### Support Vector Machine - SVC
Predict state based on date, air type and AQI

In [50]:
svc = SVC(random_state=42, C=5.0)
svc.fit(X_sm_train_ss, y_sm_train)
svc.score(X_sm_train_ss, y_sm_train), svc.score(X_sm_test_ss, y_sm_test)

(0.6202038290364206, 0.6181030795551754)