<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Final-Project-Check-in" data-toc-modified-id="Final-Project-Check-in-1">Final Project Check-in</a></span></li><li><span><a href="#Group-Name" data-toc-modified-id="Group-Name-2">Group Name</a></span></li><li><span><a href="#Student-Names" data-toc-modified-id="Student-Names-3">Student Names</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-4">Load Data</a></span></li><li><span><a href="#Fit-scikit-learn-model" data-toc-modified-id="Fit-scikit-learn-model-5">Fit scikit-learn model</a></span></li><li><span><a href="#Evaluation-Metric" data-toc-modified-id="Evaluation-Metric-6">Evaluation Metric</a></span></li></ul></div>

Final Project Check-in
------

Group Name
-----

Kakkle 

Student Names
----

1. Annette (Zijun) Lin
2. Ming-Chuan Tsai
3. Kathy Yi

Load Data
-----

In [295]:
import pandas as pd
import numpy as np

df = pd.read_csv("forestfires.csv")

df['month'] = df['month'].map({'mar':3, 'oct':10, 'aug':8, 'sep':9, 'apr':4, 'jun':6, 'jul':7, 'feb':2, 'jan':1,
       'dec':12, 'may':5, 'nov':11})
df['day'] = df['day'].map({'fri': 5, 'tue': 2, 'sat': 6, 'sun': 7, 'mon': 1, 'wed': 3, 'thu':4})

## Fit models

In [273]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

def make_pipelines():
    """Create a pipeline for each of the following algorithms:
    1. Logistic Regression
    2. k-nearest neighbors (KNN) 
    3. Naive Bayes (Guassian)
    4. Support Vector Machines (SVM)
    5. Random Forest™ 
    
    If appropriate, apply StandardScaler before the algorithm.   
    Use default hyperparameters.
    If an algorithm takes random_state then random_state=42 
    
    Return a list of all the pipelines.
    """ 
    # YOUR CODE HERE
    pipe_lr = Pipeline([('scl', StandardScaler()), ('clf', LogisticRegression(random_state=42))])
    pipe_knn = Pipeline([('scl', StandardScaler()), ('clf', KNeighborsClassifier())])
    pipe_bayes = Pipeline([('clf', GaussianNB())])
    pipe_svm = Pipeline([('scl', StandardScaler()),('clf', SVC(random_state=42))])
    pipe_rf = Pipeline([('clf', RandomForestClassifier(random_state=42))])
    
    pipelines = [pipe_lr, pipe_knn, pipe_bayes, pipe_svm, pipe_rf]
    
    
    return pipelines

def sort_models(pipelines, X_data, y_data, metric=precision_score, average='weighted', matrix=False):

    scores_sorted = {}
    for pipe in pipelines:
        y_pred = pipe.predict(X_data)
        algname = str(type(pipe.named_steps['clf'])).split('.')[-1].replace("'>","")
        if matrix:
            if algname == 'KNeighborsClassifier' or algname == 'RandomForestClassifier':
                print(algname)
                print(confusion_matrix(y_data, y_pred))
        scores_sorted[algname] = metric(y_data, y_pred, average=average)
    scores_sorted = dict(sorted(scores_sorted.items(), key=lambda x: x[1], reverse=True))
    
    return scores_sorted

## Test confusion matrix

In [296]:
df.columns

Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area'],
      dtype='object')

In [325]:
# tweaking happens here
df.loc[df.area >= 100, 'label'] = "big"
df.loc[(df.area >= 20) & (df.area < 100), 'label'] = "medium"
df.loc[(df.area >= 0) & (df.area < 20), 'label'] = "small"

# dropping columns
dfc = df.drop(columns=['FFMC', 'DMC','DC','ISI','RH','temp'])
y = df.iloc[:,-1].values
X = dfc.iloc[:,:-2].values

# split and smote
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sample_size = np.sum(y_train=='small')
X_smote, y_smote = SMOTE(sampling_strategy={"big":sample_size, "medium":sample_size, "small":sample_size})\
                    .fit_sample(X_train, y_train)

# fit models
pipelines = make_pipelines()
for pipe in pipelines:
    pipe.fit(X_smote, y_smote)

sort_models(pipelines, X_test, y_test, metric=f1_score, matrix=True)

KNeighborsClassifier
[[ 0  0  1]
 [ 0  3  4]
 [18 24 54]]
RandomForestClassifier
[[ 0  0  1]
 [ 0  1  6]
 [ 0  3 93]]


  'precision', 'predicted', average, warn_for)


{'RandomForestClassifier': 0.8882189239332094,
 'SVC': 0.7105875581802714,
 'KNeighborsClassifier': 0.6550540067143481,
 'LogisticRegression': 0.5153746770025839,
 'GaussianNB': 0.026358223301827452}

### Training set metrics

In [176]:
sort_models(pipelines, X_smote, y_smote)

{'RandomForestClassifier': 0.9716913006026417,
 'KNeighborsClassifier': 0.8608042671386082,
 'SVC': 0.7222533726998279,
 'GaussianNB': 0.6504599405182888,
 'LogisticRegression': 0.5077027947769824}