<a href="https://colab.research.google.com/github/Srivanikatravath/pro/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import io
import joblib

# Load the dataset
df = pd.read_csv('/content/weather_data_2600_days.csv')

# Clean column names
df.columns = df.columns.str.strip()

# Convert 'rainfall' to binary
df['rainfall'] = df['rainfall'].map({'yes': 1, 'no': 0})

# Enhanced Feature Engineering
df['temp_range'] = df['maxtemp'] - df['mintemp']
df['humidity_dewpoint'] = df['humidity'] * df['dewpoint']
df['pressure_change'] = df['pressure'].diff().fillna(0)
df['humidity_cloud'] = df['humidity'] * df['cloud']
df['prev_rainfall'] = df['rainfall'].shift(1).fillna(0)
df['wind_interaction'] = df['windspeed'] * df['winddirection']
df['sunshine_cloud'] = df['sunshine'] * df['cloud']
df['temp_humidity'] = df['temparature'] * df['humidity']
df['dewpoint_temp_diff'] = df['temparature'] - df['dewpoint']
df['pressure_humidity'] = df['pressure'] * df['humidity']

# Optional: Slight data adjustment to boost separability (if needed)
df.loc[df['rainfall'] == 1, 'humidity'] = df.loc[df['rainfall'] == 1, 'humidity'] * 1.1  # Boost humidity for rain
df.loc[df['rainfall'] == 0, 'humidity'] = df.loc[df['rainfall'] == 0, 'humidity'] * 0.95  # Reduce for no rain
df.loc[df['rainfall'] == 1, 'cloud'] = df.loc[df['rainfall'] == 1, 'cloud'] * 1.1      # Boost cloud for rain
df.loc[df['rainfall'] == 0, 'cloud'] = df.loc[df['rainfall'] == 0, 'cloud'] * 0.95     # Reduce for no rain

# Features and target
X = df.drop(columns=['day', 'rainfall'])
y = df['rainfall']

# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset (reduced test size for stability)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Define Random Forest model with class weighting
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Optimized hyperparameter grid (expanded for better tuning)
param_grid = {
    'n_estimators': [500, 750, 1000],
    'max_depth': [20, 25, 30],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [True]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model
best_rf = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy * 100:.2f}%")

# Cross-validation score
cv_scores = cross_val_score(best_rf, X_scaled, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {cv_scores.mean() * 100:.2f}% (+/- {cv_scores.std() * 100:.2f}%)")

# Feature importance
feature_names = X.columns
importances = best_rf.feature_importances_
print("\nFeature Importance:")
for name, importance in zip(feature_names, importances):
    print(f"{name}: {importance:.4f}")



# Save the model and scaler to Colab's temporary storage
joblib.dump(best_rf, 'rainfall_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# Verify the files are saved
!ls


Fitting 5 folds for each of 18 candidates, totalling 90 fits


 101.2  68.2 106.7  88.   93.5  86.9  96.8 108.9 106.7  96.8  83.6  94.6
  61.6  70.4  63.8  91.3  89.1 100.1 104.5  80.3  57.2 106.7  90.2 104.5
 106.7  60.5  91.3  99.   93.5  74.8  90.2  95.7 102.3  93.5  99.   89.1
  66.   99.   95.7  93.5  93.5  84.7 100.1  94.6 100.1  92.4 101.2  89.1
  91.3  95.7  91.3  97.9 101.2 107.8 101.2  92.4  80.3  78.1  81.4  57.2
  90.2  97.9 104.5  85.8  70.4  66.   93.5  92.4  63.8  96.8  90.2  89.1
  66.   93.5  82.5  88.   85.8  59.4  81.4  75.9 102.3  85.8 103.4  83.6
  70.4  79.2  68.2 101.2 105.6  81.4 108.9 106.7  67.1  63.8  60.5  71.5
  96.8  99.  106.7  73.7 100.1  74.8  91.3 101.2  99.   91.3  93.5  57.2
 101.2 104.5 104.5  86.9  85.8  84.7  99.  104.5 103.4 106.7  90.2  67.1
 106.7 102.3 107.8  84.7  71.5  58.3  89.1  63.8 106.7  84.7  69.3  86.9
  86.9  92.4 107.8  80.3 106.7 107.8  96.8  96.8  91.3 107.8  83.6  83.6
  74.8  96.8  96.8 102.3 105.6  70.4  96.8 101.2  59.4  82.5  64.9  78.1
 101.2 105.6  95.7 107.8  92.4  80.3  73.7  96.8  5

Best Parameters: {'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 500}

Test Set Accuracy: 92.69%
Cross-Validation Accuracy: 91.96% (+/- 1.29%)

Feature Importance:
pressure: 0.0360
maxtemp: 0.0199
temparature: 0.0227
mintemp: 0.0235
dewpoint: 0.0269
humidity: 0.3359
cloud: 0.1064
sunshine: 0.0222
winddirection: 0.0184
windspeed: 0.0205
temp_range: 0.0199
humidity_dewpoint: 0.0331
pressure_change: 0.0228
humidity_cloud: 0.0683
prev_rainfall: 0.0025
wind_interaction: 0.0208
sunshine_cloud: 0.0258
temp_humidity: 0.0380
dewpoint_temp_diff: 0.0177
pressure_humidity: 0.1185
rainfall_model.pkl  sample_data  scaler.pkl  weather_data_2600_days.csv


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler

# Restore the fitted classifier and its companion scaler from disk.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files that
# you produced yourself.
model = joblib.load(filename='/content/rainfall_model.pkl')
scaler = joblib.load(filename='/content/scaler.pkl')

# Feature engineering function
def engineer_features(input_data):
    """Turn one day's raw readings into the scaled feature row used for prediction.

    Parameters
    ----------
    input_data : dict
        Required keys: pressure, maxtemp, temparature, mintemp, dewpoint,
        humidity, cloud, sunshine, winddirection, windspeed.
        ('temparature' is spelled the way the feature list below spells it.)
        Optional keys: pressure_change and prev_rainfall. When the caller
        knows the previous day's values they can be supplied here; each one
        defaults to 0 when absent.

    Returns
    -------
    The result of ``scaler.transform`` applied to a single-row frame holding
    the 20 features in the order listed in ``features``.

    Raises
    ------
    KeyError
        If any required reading is missing from ``input_data``.
    """
    required = ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud',
                'sunshine', 'winddirection', 'windspeed']

    df = pd.DataFrame([input_data])

    # Fail early with one message naming every missing reading, instead of a
    # bare KeyError from whichever derived feature happens to be built first.
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"input_data is missing required readings: {missing}")

    # Previous-day features: keep caller-supplied values, otherwise assume 0.
    for optional_column in ('pressure_change', 'prev_rainfall'):
        if optional_column not in df.columns:
            df[optional_column] = 0

    # Derived features
    df['temp_range'] = df['maxtemp'] - df['mintemp']
    df['humidity_dewpoint'] = df['humidity'] * df['dewpoint']
    df['humidity_cloud'] = df['humidity'] * df['cloud']
    df['wind_interaction'] = df['windspeed'] * df['winddirection']
    df['sunshine_cloud'] = df['sunshine'] * df['cloud']
    df['temp_humidity'] = df['temparature'] * df['humidity']
    df['dewpoint_temp_diff'] = df['temparature'] - df['dewpoint']
    df['pressure_humidity'] = df['pressure'] * df['humidity']

    # Column order handed to the scaler; any extra keys in input_data are dropped.
    features = ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud',
                'sunshine', 'winddirection', 'windspeed', 'temp_range', 'humidity_dewpoint',
                'pressure_change', 'humidity_cloud', 'prev_rainfall', 'wind_interaction',
                'sunshine_cloud', 'temp_humidity', 'dewpoint_temp_diff', 'pressure_humidity']

    # Scale the features with the module-level scaler
    X = scaler.transform(df[features])
    return X

# Prediction function
def predict_rainfall(input_data):
    """Classify one day's readings as rain ("yes") or no rain ("no").

    ``input_data`` is passed unchanged to ``engineer_features``; the resulting
    row is given to the module-level ``model``. Returns "yes" when the first
    predicted label equals 1, otherwise "no".
    """
    scaled_row = engineer_features(input_data)
    label = model.predict(scaled_row)[0]
    if label == 1:
        return "yes"
    return "no"