<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Final-Project-Check-in" data-toc-modified-id="Final-Project-Check-in-1">Final Project Check-in</a></span></li><li><span><a href="#Group-Name" data-toc-modified-id="Group-Name-2">Group Name</a></span></li><li><span><a href="#Student-Names" data-toc-modified-id="Student-Names-3">Student Names</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-4">Load Data</a></span></li><li><span><a href="#Fit-scikit-learn-model" data-toc-modified-id="Fit-scikit-learn-model-5">Fit scikit-learn model</a></span></li><li><span><a href="#Evaluation-Metric" data-toc-modified-id="Evaluation-Metric-6">Evaluation Metric</a></span></li></ul></div>

Final Project Check-in
------

Group Name
-----

Kakkle 

Student Names
----

1. Annette (Zijun) Lin
2. Ming-Chuan Tsai
3. Kathy Yi

Load Data
-----

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [13]:
import pandas as pd
import numpy as np

df = pd.read_csv("forestfires.csv")

df['month'] = df['month'].map({'mar':3, 'oct':10, 'aug':8, 'sep':9, 'apr':4, 'jun':6, 'jul':7, 'feb':2, 'jan':1,
       'dec':12, 'may':5, 'nov':11})
df['day'] = df['day'].map({'fri': 5, 'tue': 2, 'sat': 6, 'sun': 7, 'mon': 1, 'wed': 3, 'thu':4})
df.loc[df.area >= 100, 'label'] = "large"
df.loc[(df.area >= 30) & (df.area < 100), 'label'] = "medium"
df.loc[(df.area < 30), 'label'] = "small"

# df = pd.get_dummies(df,prefix=['fire'], drop_first=True)

y = df.label.values
X = df.drop(["area", "label"], axis = 1).values

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
def make_pipelines():
    """Create a pipeline for each of the following algorithms:
    1. Logistic Regression
    2. k-nearest neighbors (KNN) 
    3. Naive Bayes (Guassian)
    4. Support Vector Machines (SVM)
    5. Random Forest™ 
    
    If appropriate, apply StandardScaler before the algorithm.   
    Use default hyperparameters.
    If an algorithm takes random_state then random_state=42 
    
    Return a list of all the pipelines.
    """ 
#     ('scl', StandardScaler()),          # Transformer: Standardize
#                     ('pca', PCA(n_components=2)),       # Transformer: Dimension Reduction
    
    # YOUR CODE HERE
#     solver='lbfgs', # Estimator: ML algorithm
#                                                multi_class='multinomial',
#                                                random_state=42
    pipe_lr = Pipeline([('scl', StandardScaler()),          # Transformer: Standardize
                    ('clf', LogisticRegression())]) 
    pipe_knn = Pipeline([('scl', StandardScaler()),
                        ('knn', KNeighborsClassifier())])
    pipe_nb = Pipeline([('scl', StandardScaler()),
                       ('gaussiannb', GaussianNB())])
    
    pipe_svm = Pipeline([('scl', StandardScaler()),
                        ('svm', SVC(random_state=42))])
    
    pipe_rf = Pipeline([('scl', StandardScaler()),
                        ('classifier', RandomForestClassifier(min_samples_split=5, random_state=42))])
    
    pipelines = [pipe_lr, pipe_knn, pipe_nb, pipe_svm, pipe_rf]
    
    return pipelines

In [28]:
pipelines = make_pipelines()
# Train all the models
for pipe in pipelines:
    pipe.fit(X_train, y_train)



In [29]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


def sort_models(pipelines, X_data, y_data, metric=f1_score, average='weighted'):
    scoresDict = dict()
    nameList = ['LogisticRegression','KNeighborsClassifier', 'GaussianNB',  'SVC', 'RandomForestClassifier']
    
    # YOUR CODE HERE
    for model in range(len(pipelines)):
        pipe = pipelines[model]
        print(pipe.steps[-1][1].__class__.__name__.split('.')[-1])
        pipelines[model].fit(X_data, y_data)
        y = pipelines[model].predict(X_data)
        score = f1_score(y_true=y_data, y_pred=y, average=average)
        scoresDict[nameList[model]]=score
        print(confusion_matrix(y_data, y))
    scores_sorted = dict(sorted(scoresDict.items(), key=lambda x: x[1], reverse=True))
    print(scores_sorted)
# sorted_x = sorted(x.items(), key=lambda kv: kv[1])
    return scores_sorted

In [30]:
scores_sorted_train = sort_models(pipelines, X_train, y_train, metric=f1_score, average='weighted')


LogisticRegression
[[  0   0   9]
 [  0   0  23]
 [  0   0 381]]
KNeighborsClassifier
[[  0   0   9]
 [  0   0  23]
 [  0   0 381]]
GaussianNB
[[  8   1   0]
 [  8  15   0]
 [139 221  21]]
SVC
[[  0   0   9]
 [  0   0  23]
 [  0   0 381]]
RandomForestClassifier
[[  4   0   5]
 [  0   7  16]
 [  0   0 381]]
{'RandomForestClassifier': 0.9371753125113734, 'LogisticRegression': 0.8853385866151096, 'KNeighborsClassifier': 0.8853385866151096, 'SVC': 0.8853385866151096, 'GaussianNB': 0.10493429802321477}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [31]:
scores_sorted_test = sort_models(pipelines, X_test, y_test, metric=f1_score, average='weighted')


LogisticRegression
[[ 0  0  2]
 [ 0  0  9]
 [ 0  0 93]]
KNeighborsClassifier
[[ 0  0  2]
 [ 0  0  9]
 [ 0  0 93]]
GaussianNB
[[ 2  0  0]
 [ 0  9  0]
 [ 4 86  3]]
SVC
[[ 0  0  2]
 [ 0  0  9]
 [ 0  0 93]]
RandomForestClassifier
[[ 0  0  2]
 [ 0  2  7]
 [ 0  0 93]]
{'RandomForestClassifier': 0.8844271113501883, 'LogisticRegression': 0.8442991019133151, 'KNeighborsClassifier': 0.8442991019133151, 'SVC': 0.8442991019133151, 'GaussianNB': 0.08048261834319526}


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Fit scikit-learn model
----

In [113]:
y_train2 = np.array([int(i) for i in y_train])
y_test2 = np.array([int(i) for i in y_test])

In [145]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression() # TODO: Replace with your choice of hyperparameters 
clf.fit(X=X_train, y=y_train) # Train model



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [146]:
clf.score(X_train, y_train)

0.48184019370460046

In [147]:
clf.score(X_test, y_test)

0.5384615384615384

In [148]:
clf.predict(X_train)

array(['E', 'E', 'D', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
       'E', 'E', 'E', 'D', 'D', 'E', 'E', 'E', 'E', 'E', 'D', 'E', 'C',
       'E', 'E', 'E', 'E', 'D', 'D', 'E', 'D', 'E', 'E', 'E', 'D', 'D',
       'D', 'E', 'E', 'E', 'E', 'E', 'E', 'D', 'E', 'E', 'E', 'E', 'D',
       'E', 'E', 'E', 'D', 'E', 'E', 'E', 'E', 'D', 'E', 'E', 'D', 'D',
       'E', 'E', 'E', 'E', 'E', 'E', 'D', 'E', 'C', 'E', 'E', 'E', 'D',
       'E', 'E', 'E', 'E', 'D', 'E', 'E', 'D', 'E', 'D', 'E', 'D', 'D',
       'E', 'E', 'E', 'D', 'D', 'E', 'D', 'E', 'E', 'E', 'E', 'E', 'E',
       'D', 'E', 'D', 'E', 'E', 'E', 'E', 'E', 'D', 'E', 'E', 'E', 'E',
       'D', 'E', 'E', 'D', 'E', 'D', 'E', 'E', 'E', 'E', 'D', 'E', 'E',
       'E', 'E', 'E', 'E', 'E', 'D', 'E', 'E', 'E', 'D', 'E', 'E', 'E',
       'E', 'E', 'E', 'E', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
       'D', 'D', 'E', 'E', 'D', 'E', 'E', 'E', 'D', 'E', 'E', 'E', 'E',
       'E', 'E', 'E', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E

In [89]:
df.describe()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,fire_B,fire_C,fire_D,fire_E
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,7.475822,4.259188,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292,0.025145,0.137331,0.338491,0.477756
std,2.313778,1.2299,2.27599,2.072929,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818,0.156717,0.34453,0.473655,0.499989
min,1.0,2.0,1.0,1.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,4.0,7.0,2.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,4.0,8.0,5.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52,0.0,0.0,0.0,0.0
75%,7.0,5.0,9.0,6.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57,0.0,0.0,1.0,1.0
max,9.0,9.0,12.0,7.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84,1.0,1.0,1.0,1.0


In [61]:
X_gt10 = X[np.where(y > 10)[0]]
y_gt10 = y[np.where(y > 10)[0]]

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
import sklearn
from sklearn.model_selection import train_test_split

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [96]:
X_train

array([[4., 3., 9., ..., 0., 1., 0.],
       [4., 4., 9., ..., 0., 1., 0.],
       [2., 5., 8., ..., 0., 1., 0.],
       ...,
       [2., 2., 8., ..., 0., 1., 0.],
       [2., 5., 7., ..., 0., 0., 1.],
       [2., 4., 8., ..., 0., 0., 1.]])

In [97]:
reg = linear_model.Ridge(alpha=.5)
reg.fit(X_train, y_train)
reg.score(X_train, y_train)

0.6170143487948007

In [48]:
def score(self, X_test, y_test):
    pre_y = [self.root.predict(i) for i in X_test]
    ybar = np.mean(pre_y)        
    ssreg = np.sum((pre_y-ybar)**2)   
    sstot = np.sum((y_test - ybar)**2)   
    return ssreg / sstot

In [53]:
y_pre_trans = []
for i in y_pre:
    if i > 10:
        y_pre_trans.append(i)
    else:
        y_pre_trans.append(0)
y_pre_trans

[0,
 16.950140694735225,
 32.92787953271066,
 0,
 10.366681143327371,
 0,
 0,
 0,
 0,
 0,
 11.329673583380671,
 10.209277246235853,
 19.389787759818816,
 0,
 11.599660737406023,
 0,
 0,
 12.948052182169745,
 21.756773170526806,
 10.262760114835213,
 0,
 16.188708921245613,
 20.547686374812578,
 16.362888179109536,
 0,
 26.701901925479874,
 11.218158147942436,
 31.17997570310765,
 13.762541483906995,
 0,
 0,
 17.214188498902153,
 0,
 13.810824504650279,
 0,
 21.94963708569373,
 0,
 14.467457035540548,
 16.25209493511882,
 0,
 28.947873013459386,
 0,
 0,
 17.508399030992088,
 21.640959050118468,
 25.2961409344517,
 0,
 11.53648131101491,
 0,
 11.715167213435837,
 0,
 19.452696543245736,
 17.596622272002435,
 0,
 14.819776918097824,
 0,
 0,
 15.729582908897358,
 12.210598135532292,
 0,
 0,
 0,
 20.459794942499755,
 0,
 17.15699233303964,
 14.043250145171692,
 0,
 28.780728347593282,
 18.288116810650116,
 0,
 0,
 0,
 0,
 0,
 12.074971768750979,
 0,
 13.843183855108446,
 17.74069338181735,


In [None]:
y_

In [18]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)

In [19]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [25]:
lr.score(X_test, y_test)

0.0025606026443458774

In [21]:
from sklearn import linear_model

In [29]:
lm = linear_model.LassoLars(alpha=0.11) # TODO: Replace with your choice of algorithm and hyperparameters 
lm.fit(X_train, y_train) # Train model

LassoLars(alpha=0.11, copy_X=True, eps=2.220446049250313e-16,
          fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
          positive=False, precompute='auto', verbose=False)

In [34]:
lm.score(X_test, y_test)

-0.002500798595874043

In [36]:
from sklearn import metrics

medae_value = metrics.median_absolute_error(y_test, lm.predict(X_test))
print(f"{medae_value:.4f} medae on training set")

9.8134 medae on training set


Evaluation Metric
----