## Global Horizontal Irradiance (GHI)
>### The radiation reaching the earth's surface can be represented in a number of different ways. Global Horizontal Irradiance (GHI) is the total amount of shortwave radiation received from above by a surface horizontal to the ground.

### Task: Given half-hourly features for 2018 and 2019, predict the GHI generated 48 hours later

In [None]:
!pip install pycaret

In [None]:
# Data handling
import pandas as pd 
import numpy as np
# Register pandas' formatters/converters with matplotlib
pd.plotting.register_matplotlib_converters()
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Used for the 50/50 split in the model-building section
from sklearn.model_selection import train_test_split
%matplotlib inline
# Seaborn defaults, then a 10-colour viridis palette for the plots that follow
sns.set(color_codes=True)
pal = sns.color_palette("viridis", 10)
sns.set_palette(pal)

In [None]:
# Load the weather dataset; the relative path expects the CSV under ../input/weather-data/.
df = pd.read_csv('../input/weather-data/combined_data_from_2016_2019_with_out_extra_features.csv')

In [None]:
df.head()

## Checking for null values

In [None]:
df.isnull().sum()

### No null values

In [None]:
df.shape

In [None]:
df.head()

### Constructing Future GHI Feature(after 48 h)

In [None]:
# Target construction: for every row, the GHI observed HORIZON_STEPS rows later.
# Assumes rows are in chronological order at a constant 30-minute step with no
# gaps, so 96 rows == 48 hours -- TODO confirm against the timestamp columns.
HORIZON_STEPS = 96

# shift(-k) aligns row i with the GHI of row i + k. The last k rows have no
# future value (NaN), so they are removed, leaving n - k rows as before.
future_ghi = df['GHI'].shift(-HORIZON_STEPS)
df = df.iloc[:-HORIZON_STEPS].copy()
# Cast back to the source dtype: shift() promotes integers to float for the NaNs.
df['GHI_after_48_hours'] = future_ghi.iloc[:-HORIZON_STEPS].astype(df['GHI'].dtype)

In [None]:
# Keep only rows from 2018 onwards. The explicit copy makes this an independent
# frame, so the column assignment and drop in the following cells do not operate
# on a slice of the full dataset (avoids SettingWithCopyWarning).
df = df[df['Year'] > 2017].copy()

In [None]:
df

In [None]:
# Fold the minute component into a fractional hour (e.g. 13:30 -> 13.5).
# Minute / 60 yields exactly the 0 / 0.5 offsets used for half-hourly rows and
# remains correct for any other minute value.
# NOTE: not idempotent -- re-running this cell adds the offset a second time.
df['Hour'] = df['Hour'] + df['Minute'] / 60

In [None]:
# Remove the clear-sky GHI, the current GHI and the minute column
# (the minute has already been folded into 'Hour' above).
unused_columns = ['Clearsky GHI', 'GHI', 'Minute']
df = df.drop(columns=unused_columns)

## Check Correlation

In [None]:
# Pairwise correlation of all columns; colour scale pinned to the full [-1, 1] range
plt.figure(figsize=(15,10))
sns.heatmap(data= df.corr(), vmin=-1,vmax=1, cmap='coolwarm')

In [None]:
sns.scatterplot(x='Hour',y='GHI_after_48_hours',data=df)

In [None]:
# Mean future GHI by hour of day, one line per calendar month.
# Labels are listed in month order (index 0 -> month 1).
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
                'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

plt.figure(figsize=(10,8))
for month_number, month_label in enumerate(month_labels, start=1):
    hourly_mean = df[df.Month == month_number].groupby(['Hour']).GHI_after_48_hours.mean()
    if hourly_mean.empty:
        # Skip months with no rows instead of plotting an empty series.
        continue
    hourly_mean.plot(ylabel='Mean GHI', label=month_label)
plt.legend()
plt.title('Avg Future Hourly GHI in different Months')

### Month

In [None]:
sns.violinplot(data=df,x='Month',y='GHI_after_48_hours')

### July shows more zero and low values, possibly because of the rainy season and cloud cover

In [None]:
df.groupby('Month').GHI_after_48_hours.mean().plot(kind='bar')

### Trends over the years

In [None]:
df[df['Year']==2018].groupby('Month').GHI_after_48_hours.mean().plot(kind='bar',title='2018')

In [None]:
df[df['Year']==2019].groupby('Month').GHI_after_48_hours.mean().plot(kind='bar',title='2019')

### So over the years the highest mean GHI is recorded in June-July and lowest in December and January

## Cloud Type

In [None]:
df.groupby('Cloud Type')['Cloud Type'].count().plot(kind='pie')

In [None]:
df.groupby('Cloud Type')['Cloud Type'].count().plot(kind='bar',ylabel='Frequency')

In [None]:
df.groupby('Cloud Type').GHI_after_48_hours.mean().plot(kind='bar',ylabel='Mean GHI')

> ### Cloud Type 2 is impacting the GHI very much

In [None]:
sns.violinplot(data=df,x='Cloud Type',y='GHI_after_48_hours')

> ### cloud type 2 has more impact on future GHI values 

## DEW POINT

In [None]:
sns.displot(kind='hist', data=df, x= 'Dew Point')

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(data=df,x='Month', y='Dew Point')

In [None]:
# Mean future GHI for each distinct Dew Point value, 20 highest means first.
# NOTE(review): every distinct value is its own group, so a group with very few
# rows can top this list -- consider binning before drawing conclusions.
df.groupby('Dew Point').GHI_after_48_hours.mean().sort_values(ascending=False).head(20)

> ### Negative Dew Points tend to give High GHI after 48 Hours

## Solar Zenith Angle

In [None]:
sns.displot(kind='hist',data=df, x='Solar Zenith Angle')

In [None]:
# Mean future GHI for each distinct Solar Zenith Angle value, 10 highest means first.
# NOTE(review): groups are per distinct value, so sparsely populated angles can dominate.
df.groupby('Solar Zenith Angle')['GHI_after_48_hours'].mean().sort_values(ascending=False).head(10)

> ### 13-14 degrees has high impact

## Fill Flag

In [None]:
df.groupby('Fill Flag')['Fill Flag'].count().plot(kind='pie')

In [None]:
# Mean future GHI per Fill Flag value. The aggregate used to be .max() while the
# axis label said 'Mean GHI'; .mean() makes the bars match their label and the
# corresponding Cloud Type chart.
df.groupby('Fill Flag').GHI_after_48_hours.mean().plot(kind='bar',ylabel='Mean GHI')

In [None]:
sns.violinplot(data=df,x='Fill Flag',y='GHI_after_48_hours')

## Surface Albedo

In [None]:
sns.displot(kind='kde',data=df,x='Surface Albedo')

In [None]:
sns.boxplot(data=df,y='Surface Albedo')

### As we can see due to skewed data it is showing noisy outliers

## Wind Speed

In [None]:
sns.displot(kind='hist',data=df,x='Wind Speed')

In [None]:
sns.boxplot(data=df, y='Wind Speed')

In [None]:
sns.boxplot(data=df, x='Month',y='Wind Speed')

## Precipitable Water

In [None]:
sns.displot(kind='hist',data=df,x='Precipitable Water')

In [None]:
sns.boxplot(data=df,y='Precipitable Water')

In [None]:
sns.boxplot(data=df,y='Precipitable Water',x='Month')

> ### As expected, most precipitation is in July

## Wind Direction

In [None]:
sns.displot(data=df,kind='hist',x='Wind Direction')

In [None]:
# Mean future GHI for each distinct Wind Direction value, 10 highest means first.
# NOTE(review): groups are per distinct value, so sparsely populated directions can dominate.
df.groupby('Wind Direction')['GHI_after_48_hours'].mean().sort_values(ascending=False).head(10)

### Top 10 Mean GHI Forecast Recorded with **130-150** angles

## Relative Humidity

In [None]:
sns.displot(kind='hist',data=df,x='Relative Humidity')

In [None]:
sns.boxplot(data=df,y='Relative Humidity')

In [None]:
sns.boxplot(data=df,x='Month',y='Relative Humidity')

In [None]:
# Mean future GHI for each distinct Relative Humidity value, 10 highest means first.
# NOTE(review): groups are per distinct value, so sparsely populated values can dominate.
df.groupby('Relative Humidity')['GHI_after_48_hours'].mean().sort_values(ascending=False).head(10)

> ### This Shows That Lower Humidities will Have positive impact on GHI in Future

## Temperature

In [None]:
sns.displot(data=df,kind='hist',x='Temperature')

In [None]:
sns.boxplot(data = df,y = 'Temperature')

In [None]:
sns.boxplot(data = df,x='Month',y = 'Temperature')

### As you can see, the temperature trend is normal; most fluctuations are in July due to the monsoons

In [None]:
# Mean future GHI for each distinct Temperature value, 10 highest means first.
# NOTE(review): groups are per distinct value, so sparsely populated values can dominate.
df.groupby('Temperature')['GHI_after_48_hours'].mean().sort_values(ascending=False).head(10)

### Higher temperatures are mainly responsible for higher GHI, which is as expected.
### That's why most GHI is recorded in April-May, because those months recorded the highest average temperatures

## Pressure

In [None]:
sns.displot(data=df,x='Pressure',kind='hist')

In [None]:
# One bar per distinct Pressure value, showing the mean future GHI for that value
plt.figure(figsize=(10,8))
df.groupby('Pressure')['GHI_after_48_hours'].mean().plot(kind='bar',ylabel='Mean GHI',title='Mean GHI vs Pressure')

### EDA Finished

# Model Building 

### For Model building and Selection we will use automated Library Pycaret 🥕

### Setup (Dividing the dataset into 50-50 test train ratio for efficient results)

In [None]:

# Modelling frame: every column except 'Year'. The target column stays inside X
# on purpose -- the PyCaret setup() call below reads it from the same frame.
X = df.drop(columns=['Year'])
y = df['GHI_after_48_hours']

# Seeded random 50/50 split.
# NOTE(review): rows are shuffled, so neighbouring half-hours can fall on both
# sides of the split -- confirm this is acceptable for a forecasting task.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.5, random_state=42
)

### Further dividing the train set into train and dev set into 80-20 ratio

In [None]:
# Explicit imports instead of `from pycaret.regression import *`, so it is
# clear which names used in the rest of the notebook come from PyCaret.
from pycaret.regression import (
    blend_models,
    compare_models,
    create_model,
    interpret_model,
    plot_model,
    predict_model,
    save_model,
    setup,
    stack_models,
)

# Initialise the PyCaret experiment on the training half: 80% is used for
# fitting and 20% is held out as the dev set.
# session_id fixes PyCaret's random seed so the internal split and the model
# scores are reproducible across runs.
log = setup(data=X_train, target='GHI_after_48_hours', train_size=0.8,
            silent=True, feature_selection=True, create_clusters=True,
            session_id=42)

### Model Selection

In [None]:
compare_models()

> ## Catboost Regressor is the Most accurate model according to R2 value

### Building individual models and a blend of the top models

In [None]:
cb = create_model('catboost')

In [None]:
lgbm = create_model('lightgbm')

In [None]:
gbr = create_model('gbr')

In [None]:
et = create_model('et')

In [None]:
xgb = create_model('xgboost')

## Blend of CatBoost, LGBM , XGB

In [None]:
blender = blend_models(estimator_list=[cb,lgbm,xgb])

## Stack Model

In [None]:
stacker = stack_models(estimator_list=[cb,lgbm,xgb])

# Analysing Models

## Catboost

In [None]:
plot_model(cb)

In [None]:
plot_model(cb, plot='error')

In [None]:
plot_model(cb, plot='feature')

## LGBM

In [None]:
plot_model(lgbm)

In [None]:
plot_model(lgbm, plot='error')

## GBoost

In [None]:
plot_model(gbr)

## XGboost

In [None]:
plot_model(xgb)

### It's overfitting train data a little

## Extra Tree

In [None]:
plot_model(et)

### We can see that this model has overfitted the data a lot, so we will not use it in the blend or stack model

## Blender

In [None]:
plot_model(blender)

## Stacker

In [None]:
plot_model(stacker)

# Interpreting Model

In [None]:
interpret_model(cb)

> ## We can understand with the help of SHAP value how high and low values of a feature affect the predictions

In [None]:
from sklearn.metrics import r2_score

# Fitted estimators to score, each paired with the short name shown in the
# result tables; both lists are derived from the pairs so they stay aligned.
named_models = [
    ('cb', cb), ('lgbm', lgbm), ('gbr', gbr), ('et', et),
    ('xgb', xgb), ('blender', blender), ('stacker', stacker),
]
models = [estimator for _, estimator in named_models]
models2 = [name for name, _ in named_models]

> ### Training set R2_score

In [None]:
# R2 of every model on the 50% training split.
r2 = []
for model in models:
    pred = predict_model(model, data=X_train.drop(columns=['GHI_after_48_hours']))
    # Negative predictions are set to 0 and the rest truncated to integers,
    # done column-wise instead of value by value.
    pred['Label'] = pred['Label'].clip(lower=0).astype(int)
    r2.append(r2_score(X_train['GHI_after_48_hours'], pred['Label']))

df3 = pd.DataFrame({'Model': models2, 'Train_R2': r2})

In [None]:
df3.sort_values(by='Train_R2',ascending=False)

> ### Test/Holdout Set R2_score

In [None]:
# R2 of every model on the 50% hold-out split.
r2 = []
for model in models:
    pred2 = predict_model(model, data=X_test.drop(columns=['GHI_after_48_hours']))
    # Negative predictions are set to 0 and the rest truncated to integers,
    # done column-wise instead of value by value.
    pred2['Label'] = pred2['Label'].clip(lower=0).astype(int)
    r2.append(r2_score(X_test['GHI_after_48_hours'], pred2['Label']))

# NOTE: rebinds df3, replacing the training-score table built earlier.
df3 = pd.DataFrame({'Model': models2, 'Test_R2': r2})

In [None]:
df3.sort_values(by='Test_R2',ascending=False)

> 

### Predicted value comparison

In [None]:
def fun(l):
    """Turn raw predictions into non-negative integers.

    Args:
        l: Iterable of numeric predictions.

    Returns:
        A new list with one entry per input value: 0 for negative values,
        otherwise the value truncated with ``int()``.
    """
    return [0 if value < 0 else int(value) for value in l]

In [None]:
# Actual vs. stacked-model predictions on the hold-out split, side by side.
test_features = X_test.drop(columns=['GHI_after_48_hours'])
stacker_predictions = predict_model(stacker, data=test_features)
df_comp = pd.DataFrame({
    'True GHI': X_test['GHI_after_48_hours'],
    'Predicted GHI': fun(stacker_predictions['Label']),
})
df_comp.head(40)

# Saving Model

In [None]:
save_model(stacker,'model')