In [3]:
import numpy as np
import pandas as pd 
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

In [4]:
# Load cleaned_data.csv (path is relative to the working directory) into a DataFrame
ds = pd.read_csv("cleaned_data.csv")

In [6]:
# Drop the three temperature columns, which are highly correlated.
# ('temparature' is spelled exactly as the column name in the data.)
correlated_cols = ['maxtemp', 'temparature', 'mintemp']
ds = ds.drop(correlated_cols, axis=1)

In [9]:
# Preview the first rows of ds
ds.head()

Unnamed: 0.1,Unnamed: 0,pressure,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,0,1025.9,13.1,72,49,1,9.3,80.0,26.3
1,1,1022.0,15.6,81,83,1,0.6,50.0,15.3
2,2,1019.7,18.4,95,91,1,0.0,40.0,14.2
3,3,1018.9,18.8,90,88,1,1.0,50.0,16.9
4,4,1015.9,19.9,95,81,1,0.0,40.0,13.7


In [11]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV — confirm)
ds = ds.drop(columns=['Unnamed: 0'])

In [13]:
# Count how many rows carry each rainfall label
rainfall_counts = ds["rainfall"].value_counts()
print(rainfall_counts)

rainfall
1    249
0    117
Name: count, dtype: int64


In [15]:
# Partition the rows by label: rainfall == 1 is treated as the majority
# class and rainfall == 0 as the minority class.
ds_majority = ds.loc[ds["rainfall"].eq(1)]
ds_minority = ds.loc[ds["rainfall"].eq(0)]

In [17]:
# Show (rows, columns) for the majority subset, then the minority subset
for subset in (ds_majority, ds_minority):
    print(subset.shape)

(249, 8)
(117, 8)


In [19]:
# Randomly keep as many majority rows as there are minority rows
# (sampling without replacement, fixed seed for repeatability).
ds_majority_downsampled = resample(
    ds_majority,
    replace=False,
    n_samples=ds_minority.shape[0],
    random_state=42,
)

In [20]:
# (rows, columns) of the downsampled majority subset
ds_majority_downsampled.shape

(117, 8)

In [23]:
# Stack the downsampled majority rows on top of the minority rows
balanced_parts = (ds_majority_downsampled, ds_minority)
ds_downsampled = pd.concat(balanced_parts, axis=0)

In [25]:
# (rows, columns) of the combined, balanced frame
ds_downsampled.shape

(234, 8)

In [27]:
# Preview the first rows of the combined frame
ds_downsampled.head()

Unnamed: 0,pressure,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
188,1005.9,25.6,77,53,1,10.5,270.0,11.3
9,1017.5,15.5,85,91,1,0.0,70.0,37.7
137,1012.3,20.1,80,86,1,0.3,80.0,39.5
89,1018.3,16.3,79,89,1,2.4,40.0,14.8
157,1008.8,24.7,91,80,1,2.2,20.0,11.2


In [29]:
# Shuffle all rows with a fixed seed, then renumber the index from 0
shuffled = ds_downsampled.sample(frac=1, random_state=42)
ds_downsampled = shuffled.reset_index(drop=True)

In [31]:
# Preview the first rows of ds_downsampled
ds_downsampled.head()

Unnamed: 0,pressure,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1022.2,14.1,78,90,1,0.0,30.0,28.5
1,1013.4,19.5,69,17,0,10.5,70.0,12.4
2,1006.1,24.4,74,27,0,10.8,220.0,8.7
3,1007.6,24.8,85,84,1,1.8,70.0,34.8
4,1021.2,8.4,66,18,0,10.1,20.0,24.4


In [33]:
# Per-label row counts in ds_downsampled
ds_downsampled["rainfall"].value_counts()

rainfall
1    117
0    117
Name: count, dtype: int64

In [35]:
# X holds every column except the label; y holds the label column
target_col = "rainfall"
X = ds_downsampled.drop(target_col, axis=1)
y = ds_downsampled[target_col]

In [37]:
# Display the feature frame
X

Unnamed: 0,pressure,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,1022.2,14.1,78,90,0.0,30.0,28.5
1,1013.4,19.5,69,17,10.5,70.0,12.4
2,1006.1,24.4,74,27,10.8,220.0,8.7
3,1007.6,24.8,85,84,1.8,70.0,34.8
4,1021.2,8.4,66,18,10.1,20.0,24.4
...,...,...,...,...,...,...,...
229,1008.1,25.4,86,75,5.7,20.0,9.5
230,1010.1,19.9,91,89,0.0,70.0,31.8
231,1020.6,14.7,91,88,0.3,50.0,24.4
232,1008.3,24.1,74,29,5.7,10.0,4.4


In [39]:
# Feature column names, in the order they appear in X
X.columns

Index(['pressure', 'dewpoint', 'humidity', 'cloud', 'sunshine',
       'winddirection', 'windspeed'],
      dtype='object')

In [41]:
# Display the label series
y

0      1
1      0
2      0
3      1
4      0
      ..
229    1
230    1
231    1
232    0
233    1
Name: rainfall, Length: 234, dtype: int64

In [43]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [45]:
# Base estimator handed to the grid search; seeded for repeatable fits
rf_model = RandomForestClassifier(random_state=42)

# Candidate values per hyperparameter (3 * 2 * 4 * 3 * 3 = 216 combinations)
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_features=["sqrt", "log2"],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [47]:
# Exhaustive hyperparameter search with 5-fold cross-validation,
# using all available cores (n_jobs=-1) and per-fit progress logging.
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [48]:
# Keep the estimator the search selected, and show its hyperparameters
best_rf_model = grid_search_rf.best_estimator_
best_params = grid_search_rf.best_params_
print("best parameters for Random Forest:", best_params)

best parameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}


In [49]:
# Re-run 5-fold cross-validation on the training split with the selected model
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5)
mean_cv_score = cv_scores.mean()
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", mean_cv_score)

Cross-validation scores: [0.68421053 0.81578947 0.83783784 0.83783784 0.91891892]
Mean cross-validation score: 0.818918918918919


In [50]:
# Score the selected model on the held-out test split
y_pred = best_rf_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Test set Accuracy:", test_accuracy)
print("Test set Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

Test set Accuracy: 0.7446808510638298
Test set Confusion Matrix:
 [[17  7]
 [ 5 18]]
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.71      0.74        24
           1       0.72      0.78      0.75        23

    accuracy                           0.74        47
   macro avg       0.75      0.75      0.74        47
weighted avg       0.75      0.74      0.74        47



In [51]:
# Build a one-row input frame whose columns match the training features.
# Defining it here keeps the cell runnable on a fresh kernel instead of
# relying on `myinput` being created by a later cell.
myinput = pd.DataFrame(
    [[1015.9, 19.9, 95, 81, 0.0, 40.0, 13.7]],
    columns=X.columns,
)
prediction = best_rf_model.predict(myinput)

In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Initialize the scaler and the random forest model.
# The forest is seeded with the same value used elsewhere in this notebook,
# so re-running the notebook gives the same fitted pipeline.
scaler = StandardScaler()
rf = RandomForestClassifier(random_state=42)

# Create the pipeline: standardize the features, then fit the forest
pipe = Pipeline([
    ('scaler', scaler),
    ('rf', rf),
])


In [60]:
# Fit the scaler + forest pipeline on the training split
pipe.fit(X_train, y_train)

# One hand-entered observation; values are listed in the order of X.columns
data = [[1015.9, 19.9, 95, 81, 0.0, 40.0, 13.7]]
myinput = pd.DataFrame(data, columns=X.columns)

# `result` is the pipeline's prediction.
# NOTE(review): it is never displayed — the message below reports the
# prediction of the grid-searched model, not the pipeline.
result = pipe.predict(myinput)
prediction = best_rf_model.predict(myinput)

if prediction[0] == 1:
    outcome = "Rainfall"
else:
    outcome = "No Rainfall"
print("Prediction result:", outcome)

Prediction result: Rainfall


In [64]:
# Persist the selected model together with the feature column order it was
# trained on. Only unpickle this file from a trusted source.
import pickle

model_data = {
    "model": best_rf_model,
    "feature_names": X.columns.to_list(),
}

with open("rainfall_prediction_model.pkl", mode="wb") as pkl_file:
    pickle.dump(model_data, pkl_file)