In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder,PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
import pickle

In [51]:
df = pd.read_csv('forest_outliers_removed1')
df.drop(columns=['Unnamed: 0'], inplace=True)
df['classes']=df['classes'].apply(lambda x :1 if x == 'fire' else 0)

In [17]:
X = df.drop(columns=['Temperature','year'],axis=1)
y = df['Temperature']

In [14]:
scale = ColumnTransformer(transformers=[
    ('scale', PowerTransformer(),slice(0,11))
],remainder='passthrough')
tnf = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(sparse=False, drop= 'first'),[11])
],remainder='passthrough')

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=256)
pipe = Pipeline([
        ('scale',scale),
        ('model',RandomForestRegressor(random_state=96))
    ])
pipe.fit(X_train.values, y_train.values)
y_pred = pipe.predict(X_test.values)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.7034450546485764
MAE 1.6114035087719296


In [10]:
pipe = Pipeline([
    ('scale',scale),
    ('model',RandomForestRegressor(random_state=96,n_jobs=-1))
])
# Number of trees in random forest
n_estimators = [100, 200, 300, 1000]
# Maximum number of levels in tree
max_depth = [80, 90, 100, 110]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [8, 10, 12],
# Minimum number of samples required at each leaf node
min_samples_leaf =[3, 4, 5]
# Method of selecting samples for training each tree
criterion =['mse', 'mae']
max_features=[2,3]
# Create the random grid
random_grid = {'model__n_estimators': n_estimators,
               'model__max_depth': max_depth,
               'model__min_samples_split': min_samples_split,
               'model__min_samples_leaf': min_samples_leaf,
               'model__max_features': max_features 
            }

In [16]:
gs = GridSearchCV(estimator=pipe, param_grid=random_grid, n_jobs=-1)

In [25]:
### Linear Regression

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.639065653971792
MAE 1.6812671280266036


In [26]:
### Ridge Regression

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.639065653971792
MAE 1.6812671280266036


In [33]:
lambdas=np.linspace(1,100,100)
params={'alpha':lambdas}
grid_search=GridSearchCV(pipe,param_grid=params,cv=10,)

In [29]:
### Lasso Regression

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=728)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',Lasso(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5144289994685378
MAE 1.921390645183546


In [31]:
### SVR

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=480)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',SVR(C=1,kernel='linear',gamma='auto',max_iter=5e4))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6119294525821513
MAE 1.764549048094898


In [34]:
rf_params = {
    'C': [1,10, 100],
    "kernel":['poly','rbf','sigmoid'],
    "epsilon":[0.01,0.1,1]
}
grid_search=GridSearchCV(pipe,param_grid=rf_params,cv=10,)

In [None]:
### KNeighborsRegressor

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=458)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',KNeighborsRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5776245967325289
MAE 1.8631578947368423


In [37]:
rf_params = {
    'n_neighbors': [2, 3, 5, 7, 10]
}
grid_search=GridSearchCV(pipe,param_grid=rf_params,cv=10,)

In [38]:
## DecisionTreeRegressor

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=657)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',DecisionTreeRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5657678908083932
MAE 1.8654970760233918


In [40]:
results = pd.DataFrame({
    'Model': ['Linear Regression','Lasso Regression', 'Ridge Regression','SVR' ,'Decision Tree','Random Forest'],
    'Score': [0.63,0.51,0.63,0.61,0.56,0.70]})

result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.7,Random Forest
0.63,Linear Regression
0.63,Ridge Regression
0.61,SVR
0.56,Decision Tree
0.51,Lasso Regression


In [41]:
### Creating pickle file
pickle.dump(pipe,open('models/pipe_reg1.pkl','wb'))

In [44]:
### Preparing data to create batch prediction
import json
result = X_test.to_json(orient="records")
parsed = json.loads(result)

In [45]:
# cREATING DATABASE CONNCETION
import pymongo
client = pymongo.MongoClient("mongodb+srv://mongodb:mongodb@cluster0.oxgpt.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")

In [47]:
db = client.batch_data
print(db)

Database(MongoClient(host=['cluster0-shard-00-01.oxgpt.mongodb.net:27017', 'cluster0-shard-00-02.oxgpt.mongodb.net:27017', 'cluster0-shard-00-00.oxgpt.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-th8ou3-shard-0', tls=True), 'batch_data')


In [48]:
coll = db['regression_batch_data']

In [49]:
db.list_collection_names()

['classification_batch', 'regression_batch_data', 'regression_batch']

In [None]:
coll.insert_many(parsed)

In [53]:
# Testing created Pipe
pickle_model = pickle.load(open('models/pipe_reg2.pkl','rb'))

In [56]:
test_input = np.array([1,6,57,18.0,0.00,65.7000,3.4,7.6,1.3,3.4,0.5,0],dtype=object).reshape(1,12)

In [60]:
pipe.predict(test_input)



array([30.])