# Import Libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.utils  import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the rainfall observations from the CSV file into a DataFrame.
csv_path = 'Rainfall.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Peek at five random rows. The fixed seed keeps the displayed sample stable
# across "Restart & Run All", matching the seed (42) used elsewhere in this notebook.
df.sample(5, random_state=42)

In [None]:
# Pairwise correlation between the numeric columns (non-numeric columns are skipped).
df.corr(numeric_only = True)

In [None]:
# Summary statistics of the DataFrame as produced by DataFrame.describe().
df.describe()

In [None]:
# Remove leading/trailing whitespace from every column label so later
# lookups by name (e.g. 'rainfall') match exactly.
df.columns = [label.strip() for label in df.columns]

In [None]:
# Histogram (with a KDE overlay) of every column, laid out on a 3-wide grid.
n_cols = 3                           # plots per row
n_features = len(df.columns)
n_rows = -(-n_features // n_cols)    # ceiling division: rows needed for the grid

plt.figure(figsize=(15, 5 * n_rows))

for position, column in enumerate(df.columns, start=1):
    # Subplot positions are 1-based, hence start=1.
    ax = plt.subplot(n_rows, n_cols, position)
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(f"Distribution of {column}")

plt.tight_layout()
plt.show()

In [None]:
# Drop the 'day' column.
# `columns=` already names the axis, so the extra `axis=1` was redundant; the
# frame is reassigned instead of mutated in place so the step reads as an
# explicit "new df from old df" transformation.
df = df.drop(columns=['day'])

In [None]:
# Class balance of the target: number of rows per 'rainfall' label.
plt.figure(figsize=(5, 4))
sns.countplot(data=df, x='rainfall')
plt.title('Distribution of Rainfall')  # title spelling corrected
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation heatmap")
plt.show()

In [None]:
# Box plots, one per numeric column, on a 3-wide grid.
# Only numeric columns are plotted: the target 'rainfall' holds string labels,
# for which a box plot is not defined (depending on the seaborn version it
# either raises or draws a meaningless panel).
numeric_columns = df.select_dtypes(include='number').columns

n_cols = 3
n_features = len(numeric_columns)
n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division
plt.figure(figsize=(15, 5 * n_rows))

for i, column in enumerate(numeric_columns, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(df[column])
    plt.title(f"Box Plot of {column}")

plt.tight_layout()
plt.show()

In [None]:
# Remove the three temperature columns from the frame.
# `columns=` already names the axis (the extra `axis=1` was redundant), and the
# frame is reassigned rather than mutated in place.
df = df.drop(columns=['mintemp', 'maxtemp', 'temparature'])

In [None]:
# Split the rows by target label.
# NOTE(review): the names assume 'no' is the smaller class and 'yes' the
# larger one — confirm against the count plot.
is_rainy = df['rainfall'].eq('yes')
is_dry = df['rainfall'].eq('no')
max_rainfall = df.loc[is_rainy]
min_rainfall = df.loc[is_dry]

In [None]:
# Balance the classes: draw (without replacement) as many 'yes' rows as there
# are 'no' rows. The seed makes the draw repeatable.
target_size = len(min_rainfall)
max_rainfall_resample = resample(
    max_rainfall,
    replace=False,
    n_samples=target_size,
    random_state=42,
)

In [None]:
# Number of rows kept after downsampling.
# len() counts every row; DataFrame.value_counts() drops rows that contain a
# NaN by default, so summing it can undercount (the wind columns have a
# missing value in this dataset).
len(max_rainfall_resample)

In [None]:
# Stack the downsampled 'yes' rows on top of the 'no' rows.
balanced_parts = [max_rainfall_resample, min_rainfall]
new_df = pd.concat(balanced_parts)

In [None]:
# Shuffle all rows (seeded) so the two classes are interleaved, then renumber
# the index from 0.
shuffled = new_df.sample(frac=1, random_state=42)
new_df = shuffled.reset_index(drop=True)

In [None]:
# Features and target are taken from the balanced, shuffled frame `new_df`.
# Reading from `df` here would silently discard the class balancing done in
# the preceding cells and train on the original, imbalanced data.
X = new_df.drop(columns=['rainfall'])
y = new_df['rainfall']

In [None]:
# Preview the first five feature rows.
X.head()

In [None]:
# Create a LabelEncoder to turn the string target labels into integer codes.
le = LabelEncoder()

In [None]:
# Fit the encoder on the target and replace the labels with integer codes.
# LabelEncoder sorts its classes, so with only 'no'/'yes' present this gives
# 'no' -> 0 and 'yes' -> 1.
y = le.fit_transform(y)

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Base random-forest classifier. The seed makes the bootstrap samples and
# feature sub-sampling repeatable, so the grid search picks the same
# hyper-parameters on every run (42 matches the seed used elsewhere here).
rf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid explored by the grid search: 3 * 2 * 4 * 3 * 3 = 216 combinations.
param_grid_rf = {
    "n_estimators": [50, 100, 200],
    "max_features": ["sqrt", "log2"],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

In [None]:
# Hyper-parameter tuning: exhaustive search over `param_grid_rf` with 5-fold
# cross-validation, using all available CPU cores (n_jobs=-1).
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

In [None]:
# Keep the estimator that the grid search refit with its best parameter combination.
best_rf_model = grid_search_rf.best_estimator_


In [None]:
# 5-fold cross-validation of the tuned model on the training data.
# NOTE(review): these are the same training rows the grid search tuned on, so
# the scores are likely optimistic; X_test / y_test are not evaluated here.
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5)
mean_cv_score = np.mean(cv_scores)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", mean_cv_score)

In [None]:
# One hand-written observation to score with the trained model.
# NOTE(review): the column order is assumed to match the training features — confirm against X.columns.
feature_names = [
    'pressure', 'dewpoint', 'humidity', 'cloud',
    'sunshine', 'winddirection', 'windspeed',
]
input_data = (1015.9, 19.9, 95, 81, 0.0, 40.0, 13.7)

# Wrap the single row in a DataFrame so the model receives named columns.
input_df = pd.DataFrame([input_data], columns=feature_names)

In [None]:
# Predict the encoded class for the single-row input frame.
prediction = best_rf_model.predict(input_df)

In [None]:
# Display the predicted class code for the one input row.
prediction[0]

In [None]:
# Predict again and report the result in words; class code 1 is reported as rain.
prediction = best_rf_model.predict(input_df)
if prediction[0] == 1:
    outcome = "Rainfall"
else:
    outcome = "No Rainfall"
print("Prediction result:", outcome)

# Let's do this with some changes

In [78]:
# Show each column's dtype and non-null count (reveals the missing wind values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pressure       366 non-null    float64
 1   dewpoint       366 non-null    float64
 2   humidity       366 non-null    int64  
 3   cloud          366 non-null    int64  
 4   rainfall       366 non-null    object 
 5   sunshine       366 non-null    float64
 6   winddirection  365 non-null    float64
 7   windspeed      365 non-null    float64
dtypes: float64(5), int64(2), object(1)
memory usage: 23.0+ KB


In [79]:
from sklearn.impute import SimpleImputer

In [81]:
from sklearn.compose import ColumnTransformer

In [83]:
# Separate the target column from the features, this time using the full frame `df`.
target_column = 'rainfall'
y = df[target_column]
X = df.drop(columns=[target_column])

In [85]:
# All feature columns are handed to the numeric pipeline (per df.info() every
# column other than 'rainfall' is float64 or int64).
numeric_features = X.columns

In [89]:
from sklearn.preprocessing import StandardScaler

In [87]:
from sklearn.pipeline import Pipeline

In [90]:
# Preprocessing for numeric columns, applied in order:
#   1. 'imputer' - replace NaN with the column mean
#   2. 'scaler'  - standardise with StandardScaler
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

In [92]:
# Route the numeric feature columns through the numeric pipeline.
numeric_transformer = ('num', numeric_pipeline, numeric_features)
processor = ColumnTransformer(transformers=[numeric_transformer])

In [94]:
# Fit the preprocessor on X and transform it, then wrap the returned array
# back into a DataFrame that carries the original column names.
# NOTE(review): this fits on all of X; if a train/test split follows, the
# imputer and scaler statistics would leak test information — confirm.
X_processed = pd.DataFrame(
    processor.fit_transform(X),
    columns=numeric_features,
)

print(X_processed)

     pressure  dewpoint  humidity     cloud  sunshine  winddirection  \
0    1.897809 -1.150321 -0.813795 -1.016547  1.242193      -0.263887   
1    1.289005 -0.732877  0.081842  0.545363 -0.972100      -0.631985   
2    0.929967 -0.265339  1.475055  0.912872 -1.124810      -0.754684   
3    0.805085 -0.198548  0.977479  0.775056 -0.870293      -0.631985   
4    0.336774 -0.014873  1.475055  0.453486 -1.124810      -0.754684   
..        ...       ...       ...       ...       ...            ...   
361  1.398278 -0.833064  0.380387  0.866933 -1.124810      -0.877383   
362  2.007081 -1.200415 -0.515250  0.637240 -0.870293      -1.000082   
363  1.897809 -1.116926 -0.515250  0.315671  0.045966      -0.386586   
364  1.804147 -1.116926 -0.216704  0.683179 -0.819390      -1.000082   
365  1.975861 -1.167019 -0.614765 -0.235592  0.325934      -1.000082   

     windspeed  
0     0.474302  
1    -0.621080  
2    -0.730618  
3    -0.461752  
4    -0.780408  
..         ...  
361  -0.312382  