In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample
import pickle

In [86]:
dataset = pd.read_csv("/content/Rainfall.csv")

In [87]:
dataset.head()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,yes,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,yes,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,yes,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,yes,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,yes,0.0,40.0,13.7


In [88]:
dataset.isnull().sum()

Unnamed: 0,0
day,0
pressure,0
maxtemp,0
temparature,0
mintemp,0
dewpoint,0
humidity,0
cloud,0
rainfall,0
sunshine,0


In [89]:
dataset.shape


(366, 12)

In [90]:
dataset.columns = dataset.columns.str.strip()

In [91]:
dataset['winddirection'] = dataset['winddirection'].fillna(dataset['winddirection'].mode()[0])
dataset['windspeed'] = dataset['windspeed'].fillna(dataset['windspeed'].median())

In [92]:
dataset.isnull().sum()

Unnamed: 0,0
day,0
pressure,0
maxtemp,0
temparature,0
mintemp,0
dewpoint,0
humidity,0
cloud,0
rainfall,0
sunshine,0


In [93]:
dataset.shape

(366, 12)

In [94]:
dataset['rainfall'].replace({'yes':1,'no':0}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['rainfall'].replace({'yes':1,'no':0}, inplace=True)
  dataset['rainfall'].replace({'yes':1,'no':0}, inplace=True)


In [95]:
dataset.head()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,1,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,1,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,1,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,1,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,1,0.0,40.0,13.7


In [96]:
dataset.columns = dataset.columns.str.strip()

In [97]:
dataset.columns


Index(['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'humidity', 'cloud', 'rainfall', 'sunshine', 'winddirection',
       'windspeed'],
      dtype='object')

In [98]:
dataset = dataset.drop(columns='day')

In [99]:
dataset.head()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1025.9,19.9,18.3,16.8,13.1,72,49,1,9.3,80.0,26.3
1,1022.0,21.7,18.9,17.2,15.6,81,83,1,0.6,50.0,15.3
2,1019.7,20.3,19.3,18.0,18.4,95,91,1,0.0,40.0,14.2
3,1018.9,22.3,20.6,19.1,18.8,90,88,1,1.0,50.0,16.9
4,1015.9,21.3,20.7,20.2,19.9,95,81,1,0.0,40.0,13.7


In [100]:
print(dataset['rainfall'].value_counts())

rainfall
1    249
0    117
Name: count, dtype: int64


In [101]:
majority = dataset[dataset['rainfall']==1]
minority = dataset[dataset['rainfall']==0]

In [105]:
minority.shape

(117, 11)

In [106]:
majority.shape

(249, 11)

In [107]:
maj_downsampled = resample(majority, replace=False, n_samples=117, random_state=2)

In [108]:
dataset = pd.concat([maj_downsampled, minority])

In [109]:
dataset.shape

(234, 11)

In [110]:
dataset.head()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
269,1006.7,32.2,28.7,26.9,26.3,87,71,1,4.7,90.0,14.0
124,1011.4,28.2,25.8,23.2,23.4,87,82,1,2.7,40.0,9.2
235,1000.0,32.9,30.6,28.0,24.2,69,30,1,7.6,220.0,11.7
326,1017.1,27.6,25.4,24.0,21.3,78,65,1,8.3,80.0,29.2
130,1008.1,28.4,26.6,23.7,24.3,87,84,1,0.3,220.0,12.0


In [111]:
dataset = dataset.sample(frac=1,random_state =42).reset_index(drop=True)

In [112]:
dataset.head()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1006.5,32.4,30.3,29.0,26.5,80,77,1,5.2,220.0,28.8
1,1013.4,30.8,26.2,23.6,19.5,69,17,0,10.5,70.0,12.4
2,1006.1,33.3,29.6,27.6,24.4,74,27,0,10.8,220.0,8.7
3,1014.4,26.7,23.1,20.1,20.8,87,81,1,1.2,80.0,19.6
4,1021.2,18.6,14.8,12.3,8.4,66,18,0,10.1,20.0,24.4


In [113]:
x = dataset.drop(columns='rainfall')
y = dataset['rainfall']

In [114]:
x_train, x_test, y_train ,y_test = train_test_split(x,y,test_size=.2,random_state=42)

In [132]:
model = RandomForestClassifier(random_state = 42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [136]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [138]:
best_rf_model = grid_search.best_estimator_
print(grid_search.best_params_)

{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [139]:
cv_scores = cross_val_score(best_rf_model, x_train, y_train, cv=5)
print(cv_scores)
print(np.mean(cv_scores))

[0.76315789 0.84210526 0.89189189 0.83783784 0.83783784]
0.8345661450924607


In [142]:
prediction = best_rf_model.predict(x_train)
acc = accuracy_score(y_train,prediction)
print(acc)

1.0


In [145]:
prediction = best_rf_model.predict(x_test)
acc = accuracy_score(y_test,prediction)
cff = confusion_matrix(y_test,prediction)
cfr = classification_report(y_test,prediction)
print(cfr)
print(cff)
print(acc)

              precision    recall  f1-score   support

           0       0.84      0.67      0.74        24
           1       0.71      0.87      0.78        23

    accuracy                           0.77        47
   macro avg       0.78      0.77      0.76        47
weighted avg       0.78      0.77      0.76        47

[[16  8]
 [ 3 20]]
0.7659574468085106


In [147]:
new_data = x_test.iloc[[19]]
predict = best_rf_model.predict(new_data)
if predict == 1:
  print("It will rain")
else:
  print("It will not rain")

It will not rain


In [148]:
model_data = {"model":best_rf_model, "feature_names":x.columns.tolist()}
with open("model.pkl", "wb") as f:
  pickle.dump(model_data, f)

In [149]:
with open("model.pkl", "rb") as f:
  model_data = pickle.load(f)

In [150]:
modela = model_data["model"]
feature_names = model_data["feature_names"]

In [152]:
new_data = x_test.iloc[[19]]
predict = modela.predict(new_data)
if predict == 1:
  print("It will rain")
else:
  print("It will not rain")

It will not rain
