In [65]:
# Libraries for data loading, data manipulation and data visulisation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for data preparation and model building
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error,r2_score

from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor, StackingRegressor

from sklearn.feature_selection import VarianceThreshold

# Setting global constants to ensure notebook results are reproducible
PARAMETER_CONSTANT = 42

## We begin with the preparation of the Genetic Algorythm with our data


In [66]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from random import randint
%matplotlib inline 
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
def split(df,label):
    X_tr, X_te, Y_tr, Y_te = train_test_split(df, label, test_size=0.25, random_state=42)
    return X_tr, X_te, Y_tr, Y_te

from sklearn import svm
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score


In [67]:
# for our purposes we need only regression models


classifiers = ['SupportVectorMachine',
               'LinearRegression', 'Ridge', 'Lasso', 'RandomForest', 
               'AdaBoost',  'DecisionTree', 'GradientBoosting']

#list of models initialized need to regressors
models = [svm.SVR(kernel='rbf', gamma='auto'),
          LinearRegression(),
          Ridge(),
          Lasso(),
          RandomForestRegressor(n_estimators=200, random_state=0),
          AdaBoostRegressor(random_state = 0),
          DecisionTreeRegressor(random_state=0),
          GradientBoostingRegressor(random_state=0)]


def acc_score(df, label):
    Score = pd.DataFrame({"Classifier": classifiers})
    j = 0
    acc = []
    X_train, X_test, Y_train, Y_test = split(df, label)
    for i in models:
        print(i)
        model = i
        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(Y_test, predictions)
        acc.append("{:.4f}".format(np.sqrt(mse)))
        j = j + 1
    Score["MSE"] = acc
    Score.sort_values(by="MSE", ascending=False, inplace=True)
    Score.reset_index(drop=True, inplace=True)
    return Score

def plot(score,x,y,c = "b"):
    gen = [1,2,3,4,5]
    plt.figure(figsize=(6,4))
    ax = sns.pointplot(x=gen, y=score,color = c )
    ax.set(xlabel="Generation", ylabel="Accuracy")
    ax.set(ylim=(x,y))

In [81]:
# this script needs to do a lot more logging and performance statiscical gathering.

def initilization_of_population(size,n_feat):
    population = []
    for i in range(size):
        chromosome = np.ones(n_feat,dtype=bool)     
        chromosome[:int(0.3*n_feat)]=False             
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population


def fitness_score(population):
    scores = []
    for chromosome in population:
        logmodel.fit(X_train.iloc[:,chromosome],Y_train)         
        predictions = logmodel.predict(X_test.iloc[:,chromosome])
        mse = mean_squared_error(Y_test,predictions)
        scores.append(np.sqrt(mse))
    scores, population = np.array(scores), np.array(population) 
    inds = np.argsort(scores)                                    
    return list(scores[inds]), list(population[inds,:]) 


def selection(pop_after_fit,n_parents):
    population_nextgen = []
    for i in range(n_parents):
        population_nextgen.append(pop_after_fit[i])
    return population_nextgen


def crossover(pop_after_sel):
    pop_nextgen = pop_after_sel
    for i in range(0,len(pop_after_sel),2):
        new_par = []
        child_1 , child_2 = pop_nextgen[i] , pop_nextgen[i+1]
        new_par = np.concatenate((child_1[:len(child_1)//2],child_2[len(child_1)//2:])) # We need a better crossover function.
        pop_nextgen.append(new_par)
    return pop_nextgen

def uniform_crossover(pop_after_sel):
    pop_nextgen = pop_after_sel
    for i in range(0, len(pop_after_sel), 2):
        new_par = []
        child_1, child_2 = pop_nextgen[i], pop_nextgen[i+1]
        for j in range(len(child_1)):
            if np.random.rand() < 0.5:
                new_par.append(child_1[j])
            else:
                new_par.append(child_2[j])
        pop_nextgen.append(new_par)


def mutation(pop_after_cross,mutation_rate,n_feat):   
    mutation_range = int(mutation_rate*n_feat)
    pop_next_gen = []
    for n in range(0,len(pop_after_cross)):
        chromo = pop_after_cross[n]
        rand_posi = [] 
        for i in range(0,mutation_range):
            pos = randint(0,n_feat-1)
            rand_posi.append(pos)
        for j in rand_posi:
            chromo[j] = not chromo[j]  
        pop_next_gen.append(chromo)
    return pop_next_gen


def generations(df,label,size,n_feat,n_parents,mutation_rate,n_gen,X_train,
                                   X_test, Y_train, Y_test):
    best_chromo= []
    best_score= []
    population_nextgen=initilization_of_population(size,n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print('Best score in generation',i+1,':',scores[:1])  #2
        pop_after_sel = selection(pop_after_fit,n_parents)
        # pop_after_cross = crossover(pop_after_sel)
        pop_after_cross = uniform_crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross,mutation_rate,n_feat)
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])
        print('Generation', i+1, 'completed.')
    print('Genetic algorithm completed.')
    return best_chromo,best_score

## Lets work with our data a little bit


In [68]:
# Read our data

df_test = pd.read_csv('df_test.csv')
df_train = pd.read_csv('df_train.csv')

# Convert 'time' in to datetime format
df_train['time'] = pd.to_datetime(df_train['time'])
df_test['time'] = pd.to_datetime(df_test['time'])

# Break down 'time' into year, month, day, hour, minute, second
# df_train['year'] = df_train['time'].dt.year
# df_train['month'] = df_train['time'].dt.month
# df_train['day'] = df_train['time'].dt.day
# df_train['hour'] = df_train['time'].dt.hour
# df_train['minute'] = df_train['time'].dt.minute

# df_test['year'] = df_test['time'].dt.year
# df_test['month'] = df_test['time'].dt.month
# df_test['day'] = df_test['time'].dt.day
# df_test['hour'] = df_test['time'].dt.hour
# df_test['minute'] = df_test['time'].dt.minute


In [71]:
display(df_train.head())



Unnamed: 0.1,Unnamed: 0,time,Madrid_wind_speed,Valencia_wind_deg,Bilbao_rain_1h,Valencia_wind_speed,Seville_humidity,Madrid_humidity,Bilbao_clouds_all,Bilbao_wind_speed,...,Madrid_temp_max,Barcelona_temp,Bilbao_temp_min,Bilbao_temp,Barcelona_temp_min,Bilbao_temp_max,Seville_temp_min,Madrid_temp,Madrid_temp_min,load_shortfall_3h
0,0,2015-01-01 03:00:00,0.666667,level_5,0.0,0.666667,74.333333,64.0,0.0,1.0,...,265.938,281.013,269.338615,269.338615,281.013,269.338615,274.254667,265.938,265.938,6715.666667
1,1,2015-01-01 06:00:00,0.333333,level_10,0.0,1.666667,78.333333,64.666667,0.0,1.0,...,266.386667,280.561667,270.376,270.376,280.561667,270.376,274.945,266.386667,266.386667,4171.666667
2,2,2015-01-01 09:00:00,1.0,level_9,0.0,1.0,71.333333,64.333333,0.0,1.0,...,272.708667,281.583667,275.027229,275.027229,281.583667,275.027229,278.792,272.708667,272.708667,4274.666667
3,3,2015-01-01 12:00:00,1.0,level_8,0.0,1.0,65.333333,56.333333,0.0,1.0,...,281.895219,283.434104,281.135063,281.135063,283.434104,281.135063,285.394,281.895219,281.895219,5075.666667
4,4,2015-01-01 15:00:00,1.0,level_7,0.0,1.0,59.0,57.0,2.0,0.333333,...,280.678437,284.213167,282.252063,282.252063,284.213167,282.252063,285.513719,280.678437,280.678437,6620.666667


In [72]:
column_var=df_train.describe().columns.to_list()
column_names = df_train.columns.to_list()
catergorical =set(column_names) -set(column_var)

print(catergorical)

{'Seville_pressure', 'Valencia_wind_deg'}


In [73]:
target_df = df_train['load_shortfall_3h']

In [74]:
# lets drop all categorical columns

# Note we had to remove Valencia_pressure as there are nulls here
df_train.drop(['Unnamed: 0', 'time', 'Seville_pressure', 'Valencia_wind_deg', 'load_shortfall_3h', 'Valencia_pressure'], axis=1, inplace=True)
df_train = df_train.iloc[:, 1:]  # Drop the first column
df_train

Unnamed: 0,Bilbao_rain_1h,Valencia_wind_speed,Seville_humidity,Madrid_humidity,Bilbao_clouds_all,Bilbao_wind_speed,Seville_clouds_all,Bilbao_wind_deg,Barcelona_wind_speed,Barcelona_wind_deg,...,Barcelona_temp_max,Madrid_temp_max,Barcelona_temp,Bilbao_temp_min,Bilbao_temp,Barcelona_temp_min,Bilbao_temp_max,Seville_temp_min,Madrid_temp,Madrid_temp_min
0,0.0,0.666667,74.333333,64.000000,0.000000,1.000000,0.000000,223.333333,6.333333,42.666667,...,281.013000,265.938000,281.013000,269.338615,269.338615,281.013000,269.338615,274.254667,265.938000,265.938000
1,0.0,1.666667,78.333333,64.666667,0.000000,1.000000,0.000000,221.000000,4.000000,139.000000,...,280.561667,266.386667,280.561667,270.376000,270.376000,280.561667,270.376000,274.945000,266.386667,266.386667
2,0.0,1.000000,71.333333,64.333333,0.000000,1.000000,0.000000,214.333333,2.000000,326.000000,...,281.583667,272.708667,281.583667,275.027229,275.027229,281.583667,275.027229,278.792000,272.708667,272.708667
3,0.0,1.000000,65.333333,56.333333,0.000000,1.000000,0.000000,199.666667,2.333333,273.000000,...,283.434104,281.895219,283.434104,281.135063,281.135063,283.434104,281.135063,285.394000,281.895219,281.895219
4,0.0,1.000000,59.000000,57.000000,2.000000,0.333333,0.000000,185.000000,4.333333,260.000000,...,284.213167,280.678437,284.213167,282.252063,282.252063,284.213167,282.252063,285.513719,280.678437,280.678437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8758,0.0,2.666667,89.000000,95.666667,56.666667,4.333333,80.000000,226.666667,1.666667,83.666667,...,282.150000,280.816667,281.276667,285.150000,287.573333,280.483333,290.150000,284.816667,279.686667,278.483333
8759,0.0,2.000000,82.000000,85.000000,26.666667,8.000000,75.000000,220.000000,3.000000,213.333333,...,287.816667,283.483333,287.483333,286.483333,288.616667,287.150000,291.150000,287.150000,282.400000,280.150000
8760,0.4,7.333333,67.666667,71.000000,63.333333,8.333333,33.333333,283.333333,5.333333,256.666667,...,290.483333,285.150000,289.816667,283.816667,285.330000,289.150000,286.816667,289.150000,283.956667,281.150000
8761,0.2,7.333333,67.666667,79.000000,63.333333,2.666667,51.666667,220.000000,5.333333,250.000000,...,288.150000,283.483333,287.523333,278.816667,281.410000,286.816667,284.150000,289.150000,282.666667,280.816667


In [75]:
score1 = acc_score(df_train,target_df)
score1

SVR(gamma='auto')
LinearRegression()
Ridge()
Lasso()
RandomForestRegressor(n_estimators=200, random_state=0)
AdaBoostRegressor(random_state=0)
DecisionTreeRegressor(random_state=0)
GradientBoostingRegressor(random_state=0)


Unnamed: 0,Classifier,MSE
0,DecisionTree,6296.9388
1,SupportVectorMachine,5298.7105
2,AdaBoost,5111.8096
3,Lasso,4925.5384
4,Ridge,4923.3484
5,LinearRegression,4922.2711
6,GradientBoosting,4733.9448
7,RandomForest,4377.0043


In [83]:
#We pick the best model

logmodel = RandomForestRegressor(n_estimators=200, random_state=0)

X_train,X_test, Y_train, Y_test = split(df_train,target_df)


chromo_df_bc,score_bc=generations(df_train,target_df,size=100,n_feat=df_train.shape[1],n_parents=64,mutation_rate=0.20,n_gen=8,
                         X_train = X_train,X_test = X_test,Y_train = Y_train,Y_test = Y_test)

In [44]:
#Testin the reverse sort for the model


avgDists = np.random.rand(10)
n = 10


print(avgDists.argsort())
print((-avgDists).argsort()[:len(avgDists)])
print(avgDists.argsort()[::-1][:n])
print(avgDists.argsort()[-n:][::-1])

[3 9 4 1 7 2 5 6 0 8]
[8 0 6 5 2 7 1 4 9 3]
[8 0 6 5 2 7 1 4 9 3]
[8 0 6 5 2 7 1 4 9 3]
