# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Data Loading

In [None]:
# Read the rainfall dataset from its relative CSV path into a DataFrame.
csv_path = "data/rainfall.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded dataset (pandas default row count).
data.head()

In [None]:
# Preview the last rows of the dataset (pandas default row count).
data.tail()

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
data.shape

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
data.describe(include='all')

In [None]:
# Count the missing values in each column.
missing_mask = data.isna()
missing_mask.sum()

In [None]:
# Data type pandas inferred for each column.
data.dtypes

In [None]:
# Count rows that are exact duplicates of an earlier row.
duplicate_flags = data.duplicated()
duplicate_flags.sum()

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
# Re-check the last rows now that rows with missing values have been dropped.
data.tail()

# 2. Data Visualizations

In [None]:
# Parse the 'date' column into datetime values when the dataset provides one.
if 'date' in data.columns:
    parsed_dates = pd.to_datetime(data['date'])
    data['date'] = parsed_dates

In [None]:
# Visualization 1: Time Series of Rainfall
# A line plot joins points in row order, so plot the rows in chronological
# order; otherwise out-of-order dates in the CSV make the line double back.
# The stable sort keeps the file order for rows that share a date.
rainfall_by_date = data.sort_values('date', kind='stable')

plt.figure(figsize=(10, 6))
plt.plot(rainfall_by_date['date'], rainfall_by_date['rainfall'], marker='o', label='Rainfall')
plt.title('Rainfall Over Time')
plt.xlabel('Date')
plt.ylabel('Rainfall (mm)')
plt.grid()
plt.legend()
plt.show()

In [None]:
# Visualization 2: Temperature vs Humidity (Scatter Plot)
# One point per observation, coloured by its weather condition.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data,
    x='temperature',
    y='humidity',
    hue='weather_condition',
    palette='coolwarm',
    s=100,
    ax=ax,
)
ax.set_title('Temperature vs Humidity')
ax.set_xlabel('Temperature (°C)')
ax.set_ylabel('Humidity (%)')
ax.legend(title='Weather Condition')
plt.show()

In [None]:
# Visualization 3: Rainfall Distribution by Weather Condition (Box Plot)
# One box per weather condition, showing the spread of rainfall values.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, x='weather_condition', y='rainfall', palette='viridis', ax=ax)
ax.set_title('Rainfall Distribution by Weather Condition')
ax.set_xlabel('Weather Condition')
ax.set_ylabel('Rainfall (mm)')
plt.show()

In [None]:
# Visualization 4: Wind Speed Distribution (Histogram)
# Wind-speed values grouped into five equal-width bins.
fig, ax = plt.subplots(figsize=(8, 6))
wind_speeds = data['wind_speed']
ax.hist(wind_speeds, bins=5, color='skyblue', edgecolor='black')
ax.set_title('Wind Speed Distribution')
ax.set_xlabel('Wind Speed (m/s)')
ax.set_ylabel('Frequency')
ax.grid()
plt.show()

In [None]:
# Pairwise relationships between the features, coloured by weather condition,
# with kernel-density estimates on the diagonal.
pairplot_options = {
    'hue': 'weather_condition',
    'diag_kind': 'kde',
    'palette': 'coolwarm',
}
sns.pairplot(data, **pairplot_options)
# The pairplot figure is the current figure; y=1.02 places the title just above it.
plt.gcf().suptitle('Pairplot of All Features', y=1.02)
plt.show()

# 3. Data Analysis

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree

In [None]:
# Encode the target labels as integers: Sunny -> 0, Rainy -> 1.
# Series.map turns any value missing from the mapping into NaN, so running this
# cell a second time would map the already-encoded 0/1 values to NaN. Only
# encode while the column is still non-numeric so the cell is safe to re-run.
# Any label other than 'Sunny'/'Rainy' still becomes NaN.
if not pd.api.types.is_numeric_dtype(data['weather_condition']):
    data['weather_condition'] = data['weather_condition'].map({'Sunny': 0, 'Rainy': 1})

In [None]:
# Preview the first rows to check the encoded weather_condition column.
data.head()

## Tree-Based Models

In [None]:
# Feature matrix and target for the weather-condition classifier.
feature_columns = ['rainfall', 'temperature', 'humidity', 'wind_speed']
target_column = 'weather_condition'

X = data[feature_columns]
y = data[target_column]

In [None]:
# Preview the first rows of the classifier's feature matrix.
X.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a decision tree on the training split; the fixed seed keeps the tree reproducible.
tree_params = {'random_state': 42}
model = DecisionTreeClassifier(**tree_params)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict weather-condition labels for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
# Display the predicted class labels for the test split.
y_pred

In [None]:
# Print scikit-learn's text report comparing test labels with the predictions.
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Draw the fitted decision tree on a very large canvas.
plt.figure(figsize=(50, 50))
tree.plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases validate feature_names as
    # a list and reject a pandas Index such as X.columns.
    feature_names=list(X.columns),
    # Class names are listed in ascending label order: 0 = Sunny, 1 = Rainy.
    class_names=['Sunny', 'Rainy'],
    filled=True,
)
plt.title("Decision Tree")
plt.show()

## Linear Models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Predictors and target for the wind-speed regression.
# NOTE: this rebinds the X and y used by the classifier cells.
predictor_columns = ['rainfall', 'temperature', 'humidity']
response_column = 'wind_speed'

X = data[predictor_columns]
y = data[response_column]

In [None]:
# Preview the first rows of the regression predictors.
X.head()

In [None]:
# Preview the first values of the regression target (wind_speed).
y.head()

In [None]:
# Hold out 20% of the rows for testing the regression, with a fixed seed.
regression_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = regression_split

In [None]:
# Fit an ordinary least-squares regression on the training split.
# NOTE: this rebinds `model`, replacing the decision tree fitted earlier.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict wind speed for the held-out test rows.
y_pred = model.predict(X=X_test)

In [None]:
# Display the predicted wind-speed values for the test split.
y_pred

In [None]:
# Score the regression on the held-out split.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

for metric_label, metric_value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(metric_label, metric_value)

In [None]:
# Report the fitted parameters of the linear model.
fitted_parameters = {
    "Coefficients:": model.coef_,
    "Intercept:": model.intercept_,
}
for heading, estimate in fitted_parameters.items():
    print(heading, estimate)

In [None]:
# Scatter actual against predicted wind speed for the test split.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.7, color='b')

# Dashed red diagonal marks where predicted equals actual.
lowest, highest = y_test.min(), y_test.max()
ax.plot([lowest, highest], [lowest, highest], '--r', linewidth=2)

ax.set_title('Actual vs Predicted Wind Speed')
ax.set_xlabel('Actual Wind Speed')
ax.set_ylabel('Predicted Wind Speed')
ax.grid()
plt.show()