# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

# Load the raw weather observations from the CSV file.
df = pd.read_csv('GlobalWeather.csv')

# Quick inspection of the raw data. The results are printed explicitly:
# a bare expression such as `df.head()` produces no output when the code
# is not the last line of an interactive cell.
print(df.shape)
print(df.head())
df.info()  # info() writes its report itself
print(df.isnull().sum())
print(df.duplicated().sum())
print(df.describe())
print(df.columns)

# Columns kept for the analysis: the timestamp plus the weather measurements.
weather_columns = [
    'last_updated',
    'temperature_celsius',
    'wind_kph',
    'precip_mm',
    'humidity',
    'pressure_mb',
    'cloud',
    'uv_index',
]

# Take an explicit copy so that adding columns below modifies an independent
# frame instead of a slice of the raw data (avoids SettingWithCopyWarning).
df = df[weather_columns].copy()

# dropna() returns a new frame, so the result has to be assigned to take
# effect. It is applied after the column selection so that gaps in columns
# that are not used do not discard otherwise complete rows.
df = df.dropna()

df['last_updated'] = pd.to_datetime(df['last_updated'])

# Calendar features derived from the timestamp.
df['hour'] = df['last_updated'].dt.hour
df['day'] = df['last_updated'].dt.day
df['month'] = df['last_updated'].dt.month
df['year'] = df['last_updated'].dt.year

def get_season(month):
    """Return the season name for a month number.

    Months 12, 1 and 2 map to 'Winter', 3-5 to 'Spring' and 6-8 to
    'Summer'. Every other value (including values that are not valid
    month numbers) maps to 'Fall'.

    NOTE(review): this grouping looks like Northern Hemisphere seasons;
    confirm that is intended for every location in the data.
    """
    season_table = (
        ((12, 1, 2), 'Winter'),
        ((3, 4, 5), 'Spring'),
        ((6, 7, 8), 'Summer'),
    )
    for months, label in season_table:
        if month in months:
            return label
    # Anything not matched above falls through to the default season.
    return 'Fall'

# Label every row with the season derived from its month number.
df['season'] = df['month'].apply(get_season)

# Swap the text label for one indicator column per season
# (columns are named season_<label>).
season_dummies = pd.get_dummies(df['season'], prefix='season')
df = pd.concat((df, season_dummies), axis='columns').drop(columns='season')

# Pairwise correlations between all numeric columns.
numerical_cols = df.select_dtypes(include='number').columns
corr_matrix = df.loc[:, numerical_cols].corr()

# Correlation of each numeric column with the temperature, strongest
# positive correlation first.
temp_corr = corr_matrix.loc[:, 'temperature_celsius'].sort_values(ascending=False)
print("Correlations with Temperature (Celsius):", temp_corr, sep="\n")


# Histogram of the temperature column with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['temperature_celsius'], kde=True, ax=ax)
ax.set(
    title="Distribution of Temperature (°C)",
    xlabel="Temperature (°C)",
    ylabel="Frequency",
)
plt.show()

# Mean temperature per month number (rows from all years are pooled).
monthly_temp = df['temperature_celsius'].groupby(df['month']).mean()

# Line chart of the monthly means, one marker per month.
fig, ax = plt.subplots(figsize=(8, 5))
monthly_temp.plot(kind='line', marker='o', ax=ax)
ax.set(
    title="Average Monthly Temperature Trend",
    xlabel="Month",
    ylabel="Temperature (°C)",
)
ax.grid(True)
plt.show()


# Precipitation plotted against the observation timestamp, in row order.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(df['last_updated'], df['precip_mm'])
ax.set(
    title="Rainfall Distribution (mm)",
    xlabel='Date',
    ylabel='Rainfall',
)
plt.show()

# Scatter of temperature against humidity; points are semi-transparent so
# that dense regions stay readable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='humidity', y='temperature_celsius', alpha=0.6, ax=ax)
ax.set(
    title="Temperature vs Humidity",
    xlabel="Humidity (%)",
    ylabel="Temperature (°C)",
)
plt.show()


# Work on a copy that carries a text season label, rebuilt from the month
# number, so the box plot can group by it without touching df.
df_plot = df.copy()
df_plot['season'] = df_plot['month'].apply(get_season)

# One box per season; the figure is written to disk rather than shown here.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(x=df_plot['season'], y=df_plot['temperature_celsius'], ax=ax)
ax.set_title('Temperature Distribution by Season (Anomalies are outliers)')
fig.savefig('temp_season_boxplot.png')

fig, ax = plt.subplots(figsize=(8, 5))

# Force precip_mm to a numeric dtype; entries that cannot be parsed become
# NaN. This makes sure the column takes part in the correlation below.
df['precip_mm'] = pd.to_numeric(df['precip_mm'], errors='coerce')

# Correlation matrix over the numeric columns only.
numerical_cols_for_corr = df.select_dtypes(include='number').columns
corr = df.loc[:, numerical_cols_for_corr].corr()

# Annotated heatmap with two-decimal cell labels.
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Weather Correlation Heatmap")
plt.show()


# LinearRegression cannot be fitted on missing values, so drop every row
# that has a NaN in any column (features or target), not only in precip_mm.
# When no other column contains NaN this selects exactly the same rows as
# dropping on precip_mm alone.
df_cleaned = df.dropna()

# Features: every column except the target and the raw timestamp (the
# timestamp's hour/day/month/year parts are already separate columns).
x = df_cleaned.drop(columns=['temperature_celsius', 'last_updated'])
y = df_cleaned['temperature_celsius']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit ordinary least squares and predict the held-out temperatures.
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Evaluate on the held-out set.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Mean Absolute Error: {mae}')

# Side-by-side view of actual and predicted values, indexed like y_test.
compare = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(compare)

# Random forest with 200 trees on the same train/test split; the fixed seed
# makes the fitted model reproducible.
rf = RandomForestRegressor(n_estimators=200, random_state=42)

# fit() returns the estimator itself, so prediction can be chained.
rf_pred = rf.fit(x_train, y_train).predict(x_test)

rf_r2 = r2_score(y_test, rf_pred)
print("RF R²:", rf_r2)