In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder,PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
import pickle

In [2]:
df = pd.read_csv('forest_outliers_removed1')
df.drop(columns=['Unnamed: 0'], inplace=True)
df['classes']=df['classes'].apply(lambda x :1 if x == 'fire' else 0)

In [3]:
df['classes'].value_counts()

1    138
0    106
Name: classes, dtype: int64

In [4]:
X = df.drop(columns=['Temperature','year'],axis=1)
y = df['Temperature']

In [5]:
scale = ColumnTransformer(transformers=[
    ('scale', PowerTransformer(),slice(0,11))
],remainder='passthrough')
tnf = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(sparse=False, drop= 'first'),[11])
],remainder='passthrough')

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=256)
pipe = Pipeline([
        ('scale',scale),
        ('model',RandomForestRegressor(random_state=96))
    ])
pipe.fit(X_train.values, y_train.values)
y_pred = pipe.predict(X_test.values)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6891959173624491
MAE 1.6501169590643272


In [7]:
pipe = Pipeline([
    ('scale',scale),
    ('model',RandomForestRegressor(random_state=96,n_jobs=-1))
])
# Number of trees in random forest
n_estimators = [100, 200, 300, 1000]
# Maximum number of levels in tree
max_depth = [80, 90, 100, 110]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [8, 10, 12],
# Minimum number of samples required at each leaf node
min_samples_leaf =[3, 4, 5]
# Method of selecting samples for training each tree
criterion =['mse', 'mae']
max_features=[2,3]
# Create the random grid
random_grid = {'model__n_estimators': n_estimators,
               'model__max_depth': max_depth,
               'model__min_samples_split': min_samples_split,
               'model__min_samples_leaf': min_samples_leaf,
               'model__max_features': max_features 
            }

In [8]:
gs = GridSearchCV(estimator=pipe, param_grid=random_grid, n_jobs=-1)

In [9]:
### Linear Regression

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5742557035193454
MAE 1.8057242421814617


In [11]:
### Ridge Regression

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5742557035193454
MAE 1.8057242421814617


In [13]:
lambdas=np.linspace(1,100,100)
params={'alpha':lambdas}
grid_search=GridSearchCV(pipe,param_grid=params,cv=10,)

In [14]:
### Lasso Regression

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=728)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',Lasso(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5106456091075442
MAE 1.9218736285558602


In [16]:
### SVR

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=480)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',SVR(C=1,kernel='linear',gamma='auto',max_iter=5e4))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6009394261103427
MAE 1.785115786342106


In [18]:
rf_params = {
    'C': [1,10, 100],
    "kernel":['poly','rbf','sigmoid'],
    "epsilon":[0.01,0.1,1]
}
grid_search=GridSearchCV(pipe,param_grid=rf_params,cv=10,)

In [19]:
### KNeighborsRegressor

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=458)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',KNeighborsRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5573551804205851
MAE 1.8865497076023394


In [21]:
rf_params = {
    'n_neighbors': [2, 3, 5, 7, 10]
}
grid_search=GridSearchCV(pipe,param_grid=rf_params,cv=10,)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=657)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',DecisionTreeRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5799840757952087
MAE 1.8538011695906433


In [23]:
results = pd.DataFrame({
    'Model': ['Linear Regression','Lasso Regression', 'Ridge Regression','SVR' ,'Decision Tree','Random Forest'],
    'Score': [0.63,0.51,0.63,0.61,0.56,0.70]})

result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.7,Random Forest
0.63,Linear Regression
0.63,Ridge Regression
0.61,SVR
0.56,Decision Tree
0.51,Lasso Regression


In [24]:
### Creating pickle file
pickle.dump(pipe,open('pipe_reg.pkl','wb'))

In [25]:
### Preparing data to create batch prediction
import json
result = X_test.to_json(orient="records")
parsed = json.loads(result)

In [26]:
import pymongo
client=pymongo.MongoClient("mongodb+srv://Mongo:mongodb@cluster0.a6e83wm.mongodb.net/?retryWrites=true&w=majority")


In [27]:
db = client.batch_data
print(db)

Database(MongoClient(host=['ac-6dft1sn-shard-00-02.a6e83wm.mongodb.net:27017', 'ac-6dft1sn-shard-00-00.a6e83wm.mongodb.net:27017', 'ac-6dft1sn-shard-00-01.a6e83wm.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-z8wklm-shard-0', tls=True), 'batch_data')


In [28]:
coll = db['regression_batch_data']

In [29]:
db.list_collection_names()

['regression_batch_data', 'regression_batch', 'classification_batch']

In [30]:
coll.insert_many(parsed)

<pymongo.results.InsertManyResult at 0x20906362190>

In [32]:
# Testing created Pipe
pickle_model = pickle.load(open('pipe_reg.pkl','rb'))

In [33]:
test_input = np.array([1,6,57,18.0,0.00,65.7000,3.4,7.6,1.3,3.4,0.5,0],dtype=object).reshape(1,12)

In [34]:
pipe.predict(test_input)



array([30.])