In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Read the raw weather observations from disk.
data = pd.read_csv('weather_data.csv')

# Fill gaps in the three numeric measurements with each column's mean.
# NOTE(review): the imputer is fit on the full dataset, i.e. before any
# train/test split, so held-out rows contribute to the fill values — confirm
# this is acceptable for the evaluation.
numeric_cols = ['avg_temperature', 'humidity', 'avg_wind_speed']
imputer = SimpleImputer(strategy='mean')
data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

# Replace the 'rain_or_not' target with integer codes. The fitted encoder is
# kept around so the codes can be mapped back via label_encoder.classes_.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(data['rain_or_not'])
data['rain_or_not'] = encoded_target

# Parse 'date', derive the calendar features (year, month, day of week) and
# then discard the original 'date' column.
data['date'] = pd.to_datetime(data['date'])
calendar = data['date'].dt
data = data.assign(
    year=calendar.year,
    month=calendar.month,
    day_of_week=calendar.dayofweek,
).drop(columns=['date'])

# Quick look at the prepared frame
print(data.head(5))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlation heatmap.
# numeric_only=True restricts the matrix to numeric columns: on pandas >= 2.0
# DataFrame.corr() raises a ValueError if any non-numeric column is present.
# When every column is numeric the result is unchanged.
plt.figure(figsize=(10, 6))
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

# Class balance of the encoded target 'rain_or_not'
sns.countplot(x='rain_or_not', data=data)
plt.title('Distribution of Rain vs No Rain')
plt.show()

# Pairwise relationships between the three numeric measurements, coloured by
# the target so class separation (or lack of it) is visible.
sns.pairplot(data, hue='rain_or_not', vars=['avg_temperature', 'humidity', 'avg_wind_speed'])
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

def report_model(name, y_true, y_pred):
    """Print the accuracy and classification report for one model.

    Parameters
    ----------
    name : str
        Label used as the prefix of both printed lines.
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predictions to compare against ``y_true``.
    """
    print(f"{name} Accuracy:", accuracy_score(y_true, y_pred))
    print(f"{name} Classification Report:\n", classification_report(y_true, y_pred))


# Features and target
X = data[['avg_temperature', 'humidity', 'avg_wind_speed', 'day_of_week', 'month', 'year']]
y = data['rain_or_not']

# Train-test split (80/20). stratify=y keeps the rain / no-rain proportions
# the same in both parts, so the test metrics are not skewed by an unlucky
# draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random Forest Classifier (seeded for reproducibility)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluate Random Forest
report_model("Random Forest", y_test, rf_predictions)

# Logistic Regression (example of another model).
# max_iter is raised from the default of 100: the features are unscaled
# (e.g. 'year' is orders of magnitude larger than the others), and the solver
# can stop before converging within the default iteration budget.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
log_predictions = log_model.predict(X_test)

# Evaluate Logistic Regression
report_model("Logistic Regression", y_test, log_predictions)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate Random Forest settings: 3 x 4 x 3 = 36 combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid with 3-fold cross-validation, fitted on the
# training split only.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
)
grid_search.fit(X_train, y_train)

# Report the winning combination
print("Best Parameters:", grid_search.best_params_)

# Score the selected estimator on the held-out test split
best_rf_model = grid_search.best_estimator_
best_rf_predictions = best_rf_model.predict(X_test)
optimized_accuracy = accuracy_score(y_test, best_rf_predictions)
print("Optimized Random Forest Accuracy:", optimized_accuracy)

In [None]:
# Per-class probabilities for the test split.
# NOTE(review): this uses the baseline rf_model, not the tuned best_rf_model —
# confirm that is intended.
probabilities = rf_model.predict_proba(X_test)

# Column 1 is read as the probability of rain.
# NOTE(review): assumes encoded label 1 means "rain"; verify against
# label_encoder.classes_.
rain_column = 1
prob_of_rain = probabilities[:, rain_column]

# Show the rain probability for the first 5 test samples
first_five = prob_of_rain[:5]
print("Probability of Rain for First 5 Samples:", first_five)