In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [None]:
df = pd.read_csv('./fire_archive.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
df.describe()

In [None]:
# Correlation heatmap of the numeric columns.
# At this point `df` still contains text columns (e.g. 'satellite' and 'daynight' are
# only mapped from strings to numbers in a later cell). DataFrame.corr() in
# pandas >= 2.0 raises on non-numeric data instead of silently skipping it, so the
# numeric columns are selected explicitly; this behaves the same on older versions.
plt.figure(figsize=(8, 8), dpi=120)
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='gist_rainbow')
plt.show()

In [None]:
df.drop(['track'], axis=1, inplace=True)

In [None]:
df.info()

In [None]:
# finding categorical data
df['scan'].value_counts()

In [None]:
df['acq_time'].value_counts()

In [None]:
df['satellite'].value_counts()

In [None]:
df['instrument'].value_counts()

In [None]:
df['version'].value_counts()

In [None]:
df['daynight'].value_counts()

In [None]:
df.drop(['instrument', 'version'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
df['daynight'] = df['daynight'].map({'D': 1, 'N':0})
df['satellite'] = df['satellite'].map({"Terra": 1, "Aqua": 0})

In [None]:
df.head()

In [None]:
df = pd.get_dummies(data=df, columns=['type'], drop_first=True)

In [None]:
df.head()

In [None]:
# binning the scan columns

bins= [0, 1, 2, 3, 4, 5]
labels = [1, 2, 3, 4, 5]

df['scan_bin'] = pd.cut(df['scan'], bins=bins, labels=labels)

In [None]:
df.head()

In [None]:
df['acq_date'] = pd.DatetimeIndex(df['acq_date'])
df['month'] = df['acq_date'].apply(lambda x: x.month)

In [None]:
df.head()

In [None]:
# Drop the columns that are no longer used as features ('scan' has been binned into
# 'scan_bin' and 'acq_date' reduced to 'month').
# `columns=` replaces the former `axis=True`, which only selected the column axis
# because True compares equal to 1.
df.drop(columns=['scan', 'acq_date', 'acq_time', 'bright_t31'], inplace=True)

In [None]:
plt.figure(figsize=(15, 8), dpi=90)
sns.heatmap(df.corr(), annot=True, cmap='gist_rainbow')
plt.show()

In [None]:
df.head()

In [None]:
# Target vector: the 'confidence' column. Feature matrix: every other column.
y = df['confidence'].to_numpy()
x = df.drop(columns=['confidence']).to_numpy()

In [None]:
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [None]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Estimator classes (not instances) to benchmark.
models = [
    LinearRegression,
    RandomForestRegressor,
]

In [None]:
def get_model_score(x):
    """Fit a freshly built estimator on the training split and print its scores.

    Parameters
    ----------
    x : callable
        Zero-argument factory (for example an estimator class). The object it
        returns must provide ``fit``, ``score`` and ``predict``.

    Returns
    -------
    The fitted estimator.

    Notes
    -----
    Reads ``x_train``, ``y_train``, ``x_test`` and ``y_test`` from the notebook's
    global namespace. Prints the estimator's class name, its ``score`` on the
    training split, and the R^2 of its predictions on the test split.
    """
    estimator = x()
    estimator.fit(x_train, y_train)
    print(estimator.__class__.__name__)

    train_score = estimator.score(x_train, y_train)
    print('the training score is', train_score)

    test_score = metrics.r2_score(y_test, estimator.predict(x_test))
    print('the Testing score is', test_score)
    return estimator

In [None]:
all_models = {}
for i in models:
  model = get_model_score(i)
  all_models[type(model).__name__] = model

In [None]:
# hyperparameter tuning for random forest regression
model = all_models['RandomForestRegressor']

In [None]:
model.get_params()

In [None]:
# Candidate values for the randomised search over RandomForestRegressor.
# Number of trees: 35 evenly spaced values from 300 to 1000, truncated to int.
n_estimators = [int(v) for v in np.linspace(start=300, stop=1000, num=35)]
# 'auto' was removed for RandomForestRegressor in scikit-learn 1.3; for regressors it
# meant "use all features", which is spelled 1.0 and works on old and new versions.
max_features = [1.0, 'sqrt', 'log2']
# Tree depth: 7 evenly spaced values from 15 to 35, plus None (unbounded depth).
max_depth = [int(v) for v in np.linspace(15, 35, num=7)]
max_depth.append(None)
min_samples_split = [2, 3, 5, 6]
min_samples_leaf = [1, 3, 5, 7]


In [None]:
# Parameter distributions handed to the randomised search, keyed by the
# RandomForestRegressor argument each list applies to.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)


In [None]:
rf = RandomizedSearchCV(model, random_grid, n_iter=50, cv=3, verbose=1, random_state=42)

In [None]:
rf.fit(x_test, y_test)

In [None]:
rf.best_params_

In [None]:
model = RandomForestRegressor(n_estimators=315, min_samples_split=6, min_samples_leaf = 1,
 max_features='sqrt', max_depth = None)

In [None]:
model.fit(x_train, y_train)

In [None]:
model.score(x_train, y_train)

In [None]:
model.score(x_test, y_test)