###  Import libraries

In [1]:
import pickle
from time import time

import catboost as cb
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import KFold,train_test_split
from sklearn.model_selection import TimeSeriesSplit,KFold, StratifiedKFold

In [None]:
# Disabled alternative loader. The code sits inside a string literal, so it is
# never executed: it would read every CSV under the Drive folder below and
# concatenate them row-wise into `train`. The path is specific to the original
# author's Google Drive and must be changed before re-enabling.
'''
import glob

path = r'/content/drive/My Drive/Omdena Nigeria Challenge/DataSets/NASA_Weather_Solar_Dataset' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

train = pd.concat(li, axis=0, ignore_index=True)
'''

#### Data File available at https://drive.google.com/drive/folders/1Abvor2SMB4ByBdVnfFqYkVJXf7kzQI4i

In [2]:
# Load the dataset from a CSV expected in the working directory.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype inference on large files.
train = pd.read_csv('Nigeria_complete.csv',low_memory=False)

In [3]:
train.head()

Unnamed: 0,LAT,LON,YEAR,MO,DY,PRECTOT,RH2M,PS,T2M_RANGE,T2M_MAX,T2M_MIN,T2M,WD10M,WS10M,KT,CLRSKY_SFC_SW_DWN,ALLSKY_SFC_LW_DWN,TS,QV2M,ALLSKY_SFC_SW_DWN
0,3.75,2.75,1982,1,2,0.04,80.54,100.83,0.36,27.74,27.38,27.55,204.91,2.6,-999,-999.0,-999.0,28.81,0.018508,-999.0
1,3.75,2.75,1982,1,3,0.26,81.63,100.89,0.57,27.78,27.21,27.53,199.29,2.9,-999,-999.0,-999.0,28.94,0.018725,-999.0
2,3.75,2.75,1982,1,4,1.45,80.47,100.93,0.86,27.6,26.74,27.25,203.05,2.32,-999,-999.0,-999.0,28.88,0.01815,-999.0
3,3.75,2.75,1982,1,5,1.07,80.8,100.91,0.46,27.21,26.75,27.01,186.59,2.25,-999,-999.0,-999.0,28.8,0.017964,-999.0
4,3.75,2.75,1982,1,6,10.35,80.55,100.97,0.28,27.15,26.87,27.02,270.6,2.64,-999,-999.0,-999.0,28.74,0.017915,-999.0



## Parameter Definitions

PRECTOT = Precipitation per day (mm)

PS = Surface Pressure

TS = Earth Skin Temperature

QV2M = Specific Humidity

RH2M = Relative Humidity

KT = Insolation Clearness Index

ALLSKY_SFC_SW_DWN = All Sky Insolation Incident on a Horizontal Surface

CLRSKY_SFC_SW_DWN = Clear Sky Insolation Incident On a Horizontal Surface 


### Data Preprocessing

In [4]:
# Columns handed to preprocess() for removal before modelling.
drop_column = [
    'T2M_RANGE',
    'T2M_MAX',
    'T2M_MIN',
    'WD10M',
    'WS10M',
    'KT',
    'CLRSKY_SFC_SW_DWN',
    'ALLSKY_SFC_LW_DWN',
    'TS',
    'QV2M',
    'PRECTOT',
]


In [5]:
def preprocess(data, drop_column):
    """Filter out rows lacking a target value and remove unwanted columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``ALLSKY_SFC_SW_DWN`` column.
    drop_column : list of str
        Column labels to remove from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose ``ALLSKY_SFC_SW_DWN`` is not
        ``-999``, without the columns in ``drop_column``. The original row
        index labels are kept and ``data`` itself is not modified.
    """
    target_present = data["ALLSKY_SFC_SW_DWN"].ne(-999)
    return data.loc[target_present].drop(columns=drop_column)

In [6]:
# Remove rows whose ALLSKY_SFC_SW_DWN is -999 and drop the columns listed in
# drop_column; `train` itself is left unchanged.
train_processed = preprocess(train,drop_column)

In [7]:
# Pull the regression target out as a NumPy array, then remove it from the
# feature frame. This cell is not re-runnable on its own: a second run fails
# because the column has already been dropped, so re-run the preprocess cell
# first.
y = train_processed.ALLSKY_SFC_SW_DWN.values
train_processed = train_processed.drop(columns = ['ALLSKY_SFC_SW_DWN'])

In [8]:
train_processed.head()

Unnamed: 0,LAT,LON,YEAR,MO,DY,RH2M,PS,T2M
545,3.75,2.75,1983,7,1,76.96,101.43,25.36
546,3.75,2.75,1983,7,2,79.74,101.31,25.38
547,3.75,2.75,1983,7,3,80.32,101.27,25.42
548,3.75,2.75,1983,7,4,78.98,101.28,25.55
549,3.75,2.75,1983,7,5,76.86,101.3,25.49


### Building the KFold XGBoost Model

In [None]:
# Disabled LightGBM baseline, kept inside a string literal so it never runs.
# It references X_train / X_test / y_train / y_test, which no active cell in
# this notebook defines, so it cannot be re-enabled as-is.
'''
clf = lgb.LGBMRegressor(
    
                         n_estimators = 1000,
                         objective ='regression',
       

                        )
time_now = time()
clf.fit(X_train, y_train, eval_metric="rmse", early_stopping_rounds=100,  
                    eval_set=[(X_train, y_train), (X_test, y_test)],verbose=True)
time_new = time()
training_duration = time_new - time_now
print(f"Total Training time: {training_duration:.3f} seconds")
'''

In [None]:
# Disabled single-split XGBoost experiment, kept inside a string literal so it
# never runs: an 80/20 train_test_split followed by a CPU XGBRegressor fit with
# early stopping on the held-out split.
'''

X_train, X_test, y_train, y_test = train_test_split(train_processed, y, 
                                                    test_size = 0.2,random_state = 1)

clf = xgb.XGBRegressor(
                            
                        n_estimators=100,
                        min_child_weight = 2,
                        max_depth=6,
                        verbosity = 1,
                        n_jobs=8,                                              
                        scale_pos_weight=1.025,
                        tree_method='exact',
                        objective = 'reg:squarederror',
                        predictor='cpu_predictor',
                        colsample_bytree = 0.66,
                        subsample = 1,
                        gamma = 0,
                        learning_rate=0.15,
                        num_parallel_tree = 1,    
                       )
    
  
clf.fit(X_train, y_train, eval_metric="rmse", early_stopping_rounds=50,
                eval_set=[(X_train, y_train), (X_test, y_test)],verbose=True) 
'''

In [None]:
# K-fold cross-validation: fit one XGBoost regressor per fold and collect the
# fitted estimators in `models`.
n_fold = 5
random_state = 999
training_cycle = 10000  # passed to XGBRegressor as n_estimators
train_no = 1            # not referenced by any cell visible in this notebook
models = []

kf = KFold(n_splits=n_fold, shuffle=True, random_state=random_state)

for train_index, val_index in kf.split(train_processed, y):
    # Positional row selection for the features, plain array indexing for the target.
    train_X, val_X = train_processed.iloc[train_index], train_processed.iloc[val_index]
    train_y, val_y = y[train_index], y[val_index]

    # NOTE(review): tree_method='gpu_exact', predictor='gpu_predictor' and the
    # eval_metric / early_stopping_rounds arguments to fit() are tied to older
    # XGBoost releases -- confirm against the installed version.
    clf = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=training_cycle,
        learning_rate=0.15,
        max_depth=6,
        min_child_weight=2,
        gamma=0,
        subsample=1,
        colsample_bytree=0.66,
        scale_pos_weight=1.025,
        num_parallel_tree=1,
        tree_method='gpu_exact',
        predictor='gpu_predictor',
        n_jobs=8,
        verbosity=1,
    )

    # eval_set order matters downstream: entry 0 is the training fold and
    # entry 1 is the held-out fold.
    clf.fit(
        train_X,
        train_y,
        eval_metric="rmse",
        early_stopping_rounds=50,
        eval_set=[(train_X, train_y), (val_X, val_y)],
        verbose=True,
    )

    models.append(clf)

In [None]:
# Plot the per-round RMSE recorded while fitting the second fold's model
# (models[1]). The eval_set order used in fit() makes 'validation_0' the
# training fold and 'validation_1' the held-out fold.
fold_history = models[1].evals_result()
model_1_train_rmse = fold_history['validation_0']['rmse']
model_1_valid_rmse = fold_history['validation_1']['rmse']

# Size the x-axis from the recorded history rather than from `training_cycle`:
# early stopping ends training before the configured maximum, and columns of
# different lengths make the DataFrame constructor raise.
df = pd.DataFrame({'training cycle': range(len(model_1_train_rmse)),
                   'train_set': model_1_train_rmse,
                   'test_set': model_1_valid_rmse})

plt.figure(figsize=(20,20))

# With data=df the column names double as the legend labels.
plt.plot('training cycle', 'train_set', 'bo-', data=df)
plt.plot('training cycle', 'test_set', 'ro-', data=df)

plt.title("RMSE error vs Training Cycle", fontsize =20, pad = 20)
plt.xlabel('Number of Training cycle',fontsize = 15, labelpad = 20)
plt.ylabel('RMSE', fontsize = 15, labelpad = 20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.legend(loc='upper left')
plt.grid(color='g', linestyle='-', linewidth=0.5)
plt.show()

### Enter your location, time of year, and weather information here

In [None]:
# Edit these values to describe the location, date and weather conditions for
# which a prediction is wanted.
test_dict = {
    'LAT': 9,
    'LON': 7,
    'YEAR': 2019,
    'MO': 7,
    'DY': 2,
    'RH2M': 50,
    'PS': 100,
    'T2M': 23,
}

In [None]:
# Build a one-row feature frame with the same columns, in the same order, as
# the training frame shown earlier in the notebook.
test_df = pd.DataFrame([test_dict])
test_df = test_df[['LAT', 'LON', 'YEAR', 'MO', 'DY', 'RH2M', 'PS', 'T2M']]

# Average the per-fold predictions. The divisor is the actual number of fitted
# models rather than a hard-coded 5.0, so the mean stays correct if the number
# of folds is changed.
pred_df = sum([clf.predict(test_df) for clf in models]) / len(models)

# NOTE(review): abs() flips the sign of a negative prediction instead of
# clipping it at zero -- confirm this is the intended handling.
pred_value = abs(pred_df[0])


In [None]:
# Report the averaged prediction (pred_value) rounded to four decimal places.
print(f'The predicted All Sky Insolation on a horizontal surface for Your specified location is {pred_value:.4f} KWh per square meter per day')

## Save the Model

In [None]:
# Persist every fold model to its own file in the working directory, numbered
# by fold position (XGBoost_KFold_GPU_model_0.dat, _1.dat, ...).
# Relies on `import pickle` in the notebook's import cell. Each file is opened
# in a `with` block so the handle is flushed and closed as soon as the dump
# finishes.
for clf_no, clf in enumerate(models):
    with open(f"XGBoost_KFold_GPU_model_{clf_no}.dat", "wb") as model_file:
        pickle.dump(clf, model_file)

In [None]:
# Load the model
#loaded_model = pickle.load(open(filename, 'rb'))