# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Load the raw Bangalore weather observations from disk.
bangalore_df = pd.read_csv('Bangalore.csv')

# Parse the 'time' strings into timestamps and promote that column to the
# index, so the time-based interpolation below has timestamps to work with.
bangalore_df = bangalore_df.assign(
    time=pd.to_datetime(bangalore_df['time'])
).set_index('time')

# Fill gaps by interpolating according to the time elapsed between rows.
bangalore_df = bangalore_df.interpolate(method='time')

# Report how many NaN values each column still holds after interpolation.
remaining_nan_counts = bangalore_df.isna().sum()
print(remaining_nan_counts)

# Whatever interpolation could not fill is replaced by the column mean.
column_means = bangalore_df.mean()
bangalore_df = bangalore_df.fillna(column_means)

# Machine learning models
# Imported here so this section stays self-contained within the script.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

features = bangalore_df[['tavg', 'tmax', 'tmin']]
target_temp = bangalore_df['tavg']
target_prcp = bangalore_df['prcp']

# Train-test split
# A single call splits the features and both targets with the same row
# permutation (fixed by random_state), so X_train/X_test line up with the
# temperature and precipitation targets alike.
(X_train, X_test,
 y_train_temp, y_test_temp,
 y_train_prcp, y_test_prcp) = train_test_split(
    features, target_temp, target_prcp, test_size=0.2, random_state=42)

# Standardize the features (the scaler is fitted on the training rows only).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _drop_tavg_column(X):
    """Return X without its first column.

    Column 0 of the feature matrix is 'tavg' (see `features` above), which is
    also the temperature target; it must not be visible to temperature models.
    """
    return np.asarray(X)[:, 1:]


def _temperature_model(estimator):
    """Wrap `estimator` so it is trained and queried without the tavg column.

    The wrapper still accepts the full 3-column scaled matrix, so the same
    X_train_scaled / X_test_scaled can be passed to every model.
    """
    return make_pipeline(FunctionTransformer(_drop_tavg_column), estimator)


# Linear Regression
lr_temp = _temperature_model(LinearRegression())
lr_temp.fit(X_train_scaled, y_train_temp)
lr_prcp = LinearRegression()
lr_prcp.fit(X_train_scaled, y_train_prcp)

# Ridge Regression
ridge_temp = _temperature_model(Ridge(alpha=1.0))
ridge_temp.fit(X_train_scaled, y_train_temp)
ridge_prcp = Ridge(alpha=1.0)
ridge_prcp.fit(X_train_scaled, y_train_prcp)

# Lasso Regression
lasso_temp = _temperature_model(Lasso(alpha=0.01))
lasso_temp.fit(X_train_scaled, y_train_temp)
lasso_prcp = Lasso(alpha=0.01)
lasso_prcp.fit(X_train_scaled, y_train_prcp)

# Random Forest Regressor
rf_temp = _temperature_model(
    RandomForestRegressor(n_estimators=100, random_state=42))
rf_temp.fit(X_train_scaled, y_train_temp)
rf_prcp = RandomForestRegressor(n_estimators=100, random_state=42)
rf_prcp.fit(X_train_scaled, y_train_prcp)

# Evaluate models
def evaluate_model(model, X_test_scaled, y_test, target):
    """Print error metrics for a fitted model and plot predicted vs. actual.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_scaled: Feature matrix passed to ``model.predict``.
        y_test: True target values matching the rows of ``X_test_scaled``.
        target: Label used in the printed header and in the plot title.

    Returns:
        None. The metrics are printed and the scatter plot is shown.
    """
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One print call, newline-separated, emits the same three lines
    # (the last followed by a blank line) as separate print statements.
    print(
        f'{target} Model:',
        f'Mean Squared Error: {mse:.2f}',
        f'R2 Score: {r2:.2f}\n',
        sep='\n',
    )

    # Actual values on the x-axis, predictions on the y-axis.
    plt.scatter(y_test, y_pred)
    plt.title(f'{target} Model: Predictions vs Actual Values')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.show()
# Score every fitted model on the shared test split, alternating temperature
# and precipitation for each algorithm.
evaluation_runs = [
    (lr_temp, y_test_temp, 'Linear Regression (Temperature)'),
    (lr_prcp, y_test_prcp, 'Linear Regression (Precipitation)'),
    (ridge_temp, y_test_temp, 'Ridge Regression (Temperature)'),
    (ridge_prcp, y_test_prcp, 'Ridge Regression (Precipitation)'),
    (lasso_temp, y_test_temp, 'Lasso Regression (Temperature)'),
    (lasso_prcp, y_test_prcp, 'Lasso Regression (Precipitation)'),
    (rf_temp, y_test_temp, 'Random Forest Regressor (Temperature)'),
    (rf_prcp, y_test_prcp, 'Random Forest Regressor (Precipitation)'),
]
for fitted_model, y_true, label in evaluation_runs:
    evaluate_model(fitted_model, X_test_scaled, y_true, label)
# Percentage of rows that have a NaN in at least one column.
# NOTE(review): `drop_df` was referenced here without being defined anywhere
# above, which raises NameError. It is assumed, from the message printed
# below, to be the subset of rows containing NaN values -- confirm.
drop_df = bangalore_df[bangalore_df.isna().any(axis=1)]
missing_pctg = round((drop_df.shape[0]/bangalore_df.shape[0])*100,2)
print(missing_pctg,'% of data contain NaN values')

# Rows with more than three NaN entries (bare expression: notebook display only).
bangalore_df[bangalore_df.isna().sum(axis=1) > 3]
bangalore_df = bangalore_df.dropna(how='all') #dropping rows with no data in all columns
bangalore_df = bangalore_df.interpolate(method='time') #using interpolate to fill in NaN values

# Per-column NaN counts and a preview of the cleaned frame (display only).
bangalore_df.isna().sum()
bangalore_df.head()

# Each lookup keeps every row that ties for the extreme value. Series.max()
# and Series.min() skip NaN, whereas the builtin max()/min() compare element
# by element and can return an order-dependent result when NaN is present.

#date at which temperature(tmax) was highest
bangalore_df[bangalore_df['tmax'] == bangalore_df['tmax'].max()]

#date at which temperature(tmin) was lowest
bangalore_df[bangalore_df['tmin'] == bangalore_df['tmin'].min()]

#date at which precipitation was the highest
bangalore_df[bangalore_df['prcp'] == bangalore_df['prcp'].max()]
# Apply the 'bmh' matplotlib style to the figures drawn from here on.
plt.style.use('bmh')

# One stacked panel per variable over the whole record.
overview_columns = ['tavg','tmax','tmin','prcp']
bangalore_df[overview_columns].plot(subplots=True, figsize=(18,10));

# Collapse the daily rows into one mean row per calendar year.
bangalore_year = bangalore_df.groupby(bangalore_df.index.year).mean()
bangalore_year #yearwise averages

# Top panel: yearly temperature means; bottom panel: yearly precipitation mean.
f, axes = plt.subplots(nrows = 2, figsize =(15,9))
temp_ax, prcp_ax = axes

bangalore_year[['tavg','tmin','tmax']].plot(ax=temp_ax)
temp_ax.set(
    ylabel='Temperature (c)',
    xlabel='Year',
    title='Average Daily Temperature by Year',
)

bangalore_year['prcp'].plot(ax=prcp_ax)
prcp_ax.set(
    ylabel='Precipitation (mm)',
    xlabel='Year',
    title='Average Daily Precipitation by Year',
)
plt.tight_layout()

#by month
months = ['Jan','Feb','Mar','Apr','May','Jun',
          'Jul','Aug','Sep','Oct','Nov','Dec']

# Mean of each variable per calendar month. With as_index=False the result is
# numbered from 0, which is what the tick positions below rely on.
# (This was previously bound to the misspelled name `ata`, so the plotting
# code below did not see it.)
data = bangalore_df.groupby(bangalore_df.index.month, as_index=False)[['tavg','tmin','tmax','prcp']].mean()

# Top panel: monthly temperature means; bottom panel: monthly precipitation mean.
f, axes = plt.subplots(nrows=2, figsize=(10,10))
ax = data[['tavg','tmin','tmax']].plot(ax=axes[0])
ax.set_ylabel('Temperature (C)')
ax.set_xlabel('Month')
ax.set_xticks(np.arange(0,12))
ax.set_xticklabels(months)
ax.set_title('Average Daily Temperature by Month')
ax = data['prcp'].plot(ax=axes[1]);
ax.set_ylabel('Precipitation (mm)')
ax.set_xlabel('Month')
ax.set_xticks(np.arange(0,12))
ax.set_xticklabels(months)
ax.set_title('Average Daily Precipitation by Month')
plt.tight_layout()
# Calendar columns used for grouping here and by the plots that follow.
bangalore_df['year'] = bangalore_df.index.year
bangalore_df['mnth'] = bangalore_df.index.month

# Mean temperature and precipitation for every (year, month) pair.
month_df = bangalore_df.groupby(['year','mnth'], as_index=False)[['tavg','prcp']].mean()

# One year-by-month heatmap per variable: (value column, colormap, title).
for value_col, cmap, title in (
    ('tavg', 'YlOrRd', 'Average Daily Temperature (C) by Month'),
    ('prcp', 'Blues', 'Average Daily Precipitation (mm) by Month'),
):
    # Keyword arguments are required: positional pivot() arguments raise
    # TypeError on pandas 2.x.
    data = month_df.pivot(index='year', columns='mnth', values=value_col)
    # Translate month numbers (1-12) to names per column, so a month that is
    # absent from the data cannot cause a length mismatch.
    data.columns = [months[m - 1] for m in data.columns]
    plt.subplots(figsize=(10,10))
    sns.heatmap(data, cmap=cmap, annot=True, fmt='.1f')
    plt.title(title)
    plt.yticks(rotation=0)
    plt.show()
# Month-by-month comparison of the years 1990 and 2016.
# This stanza previously appeared twice in a row; the first copy carried the
# misspelled axis label 'Temerature (C)'. Only the corrected copy is kept, so
# each figure is drawn once.
data = bangalore_df[bangalore_df['year'].isin([1990, 2016])]

# Distribution of average temperature per month, one box per year.
plt.subplots(figsize=(15, 6))
sns.boxplot(x='mnth', y='tavg', hue='year', data=data, saturation=1)
plt.xlabel('Month')
plt.ylabel('Temperature (C)')
plt.title('Temperature 1990 vs 2016');

# Mean precipitation per month, one bar per year; ci=None disables the
# error bars.
plt.subplots(figsize=(15, 6))
sns.barplot(x='mnth', y='prcp', hue='year', data=data, ci=None)
plt.xlabel('Month')
plt.ylabel('Precipitation (mm)')
plt.title('Precipitation 1990 vs 2016');
plt.show()
