<a href="https://colab.research.google.com/github/NurayVakitbilir/Vakitbilir_CNG562_HW1/blob/master/Vakitbilir_forest_fires.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Forestfires Regression

In [0]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import preprocessing 


In [0]:
import warnings 
warnings.filterwarnings('ignore')

## **Importing the data**

In [0]:
data_url='http://www.dsi.uminho.pt/~pcortez/forestfires/forestfires.csv'
fires=pd.read_csv(data_url)

## **Data Preparation**

In [0]:
fires.month.replace(('jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec'),
                        (1,2,3,4,5,6,7,8,9,10,11,12), inplace=True)
fires.day.replace(('mon','tue','wed','thu','fri','sat','sun'),
                       (1,2,3,4,5,6,7), inplace=True)

In [0]:
X = fires.drop('area', axis=1)
y = fires.area

In [0]:
X_scaled = preprocessing.scale(X)
y_scaled = preprocessing.scale(y)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size = 0.3, random_state = 1)

## **Training**

### *Linear Regression*

In [8]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression(normalize=True).fit(X_train, y_train)
print('MSE for Linear Regression training:', mse(y_train, linreg.predict(X_train)))
print('MSE for Linear Regression testing:', mse(y_test, linreg.predict(X_test)))
print('Score for Linear Regression training:', linreg.score(X_train, y_train))
print('Score for Linear Regression testing:', linreg.score(X_test, y_test))

MSE for Linear Regression training: 0.9899957510358671
MSE for Linear Regression testing: 0.975300611915431
Score for Linear Regression training: 0.01859543492513238
Score for Linear Regression testing: 0.0045291311337865015


### *Decision Tree Regression*

In [9]:
from sklearn.tree import DecisionTreeRegressor
treereg = DecisionTreeRegressor
tree = treereg(criterion='friedman_mse', splitter='best', max_depth=5, 
                min_samples_leaf=2, max_features=4,random_state=2, max_leaf_nodes=2, 
                min_impurity_split=0.0003, presort=True).fit(X_train, y_train)
print('MSE for Decision tree training:', mse(y_train, tree.predict(X_train)))
print('MSE for Decision tree testing:', mse(y_test, tree.predict(X_test)))
print('Score for Decision tree training:', tree.score(X_train, y_train))
print('Score for Decision tree testing:', tree.score(X_test, y_test))

MSE for Decision tree training: 0.9974510521621663
MSE for Decision tree testing: 0.9641532133683953
Score for Decision tree training: 0.011204830923345344
Score for Decision tree testing: 0.015907069773055893


### *K-Neighbors Regression*

In [10]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=15, leaf_size=25, p=6).fit(X_train, y_train)
print('MSE for KNN training:', mse(y_train, knn.predict(X_train)))
print('MSE for KNN testing:', mse(y_test, knn.predict(X_test)))
print('Score for KNN training:', knn.score(X_train, y_train))
print('Score for KNN testing:', knn.score(X_test, y_test))

MSE for KNN training: 0.9351178547593144
MSE for KNN testing: 0.9639404763991155
Score for KNN training: 0.07299710066072884
Score for KNN testing: 0.01612420636977452


***None of the training algorithms are converging well.***

## **Grid Search and Cross Validation**

In [0]:
from sklearn.model_selection import GridSearchCV

### *Decision Tree Regression*

In [12]:
tree = DecisionTreeRegressor()
parameters = {'criterion':('friedman_mse','mse'), 'splitter':('best','random'), 'min_samples_leaf':(2,3,4),
              'max_depth':(2,3,5,6),'max_features':(3,4,5),'random_state':(10,50,100)}
gscvtree = GridSearchCV(estimator = tree, param_grid = parameters, n_jobs=-1, cv=5).fit(X_train, y_train)
print(gscvtree.best_params_)
print('Score for Decision tree training:', gscvtree.score(X_train, y_train))
print('Score for Decision tree testing:', gscvtree.score(X_test, y_test))

{'criterion': 'friedman_mse', 'max_depth': 2, 'max_features': 3, 'min_samples_leaf': 2, 'random_state': 10, 'splitter': 'random'}
Score for Decision tree training: 0.0018016984254046742
Score for Decision tree testing: -0.00017264166213082355


### *K-Neighbors Regression*

In [13]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsRegressor()
parameters = {'n_neighbors': (1,2,3,5,10,15,20), 'leaf_size': (10, 20,30,40), 'p' : (1,2,4,7)}
gscvknn = GridSearchCV(estimator = knn, param_grid = parameters, n_jobs=-1, cv=5).fit(X_train, y_train)
print(gscvknn.best_params_)
print('Score for KNN training:', gscvknn.score(X_train, y_train))
print('Score for KNN testing:', gscvknn.score(X_test, y_test))

{'leaf_size': 10, 'n_neighbors': 20, 'p': 2}
Score for KNN training: 0.055835901127419874
Score for KNN testing: -0.0005900958848872317


***Grid search and cross validation results show that for Forest fires dataset, KNN and Decision Tree are not converging well.***

# Binary Classification

In [0]:
from sklearn.metrics import accuracy_score

In [0]:
y_bin = 1.0 * (fires['area']>35) #threshold is set to 35.

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_bin, test_size = 0.3, random_state = 3)

### *Logistic Regression*

In [17]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=5,solver='lbfgs',multi_class='ovr').fit(X_train, y_train)
print('Accuracy for Logistic Regression traning:', accuracy_score(y_train, log_reg.predict(X_train)))
print('Accuracy for Logistic Regression test:', accuracy_score(y_test, log_reg.predict(X_test)))

Accuracy for Logistic Regression traning: 0.925207756232687
Accuracy for Logistic Regression test: 0.9358974358974359


### *Decision Tree Classifier*

In [18]:
from sklearn.tree import DecisionTreeClassifier 
tree = DecisionTreeClassifier(max_depth=2,max_leaf_nodes=5, random_state=45).fit(X_train, y_train)
print('Accuracy for Decision tree training:', accuracy_score(y_train, tree.predict(X_train)))
print('Accuracy for Decision tree testing:', accuracy_score(y_test, tree.predict(X_test)))

Accuracy for Decision tree training: 0.9335180055401662
Accuracy for Decision tree testing: 0.9294871794871795


### *Support Vector Classifier*

In [19]:
from sklearn.svm import SVC
svc = SVC(C=2, kernel='rbf',max_iter=5, tol=0.001, random_state=5).fit(X_train, y_train)
print('Accuracy for Support Vector Machine traning:', accuracy_score(y_train, svc.predict(X_train)))
print('Accuracy for Support Vector Machine testing:', accuracy_score(y_test, svc.predict(X_test)))

Accuracy for Support Vector Machine traning: 0.703601108033241
Accuracy for Support Vector Machine testing: 0.7564102564102564


### *K-Neighbors Classifier*

In [20]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5,p=2).fit(X_train, y_train)
print('Accuracy for KNN training:', accuracy_score(y_train, knn.predict(X_train)))
print('Accuracy for KNN testing:', accuracy_score(y_test, knn.predict(X_test)))

Accuracy for KNN training: 0.925207756232687
Accuracy for KNN testing: 0.9358974358974359


**Applying binary classification improved the scores of KNN, Decision Tree and Logistic Regression greatly, whereas Support Vector Machines is not improved to the same performance.**