# Thesis work--1

### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import randrange, uniform

### Global variables and functions for tuning

In [2]:
FEATURES = 4
ROWS = 10
LOW_RANGE = 0
HIGH_RANGE = 10
NUMBER_OF_REGIONS = 3
SELECTION_X_GENERATOR = ['random_normal_dist','random_normal_range','set_manual_x']
SELECTED_GENERATOR = SELECTION_X_GENERATOR[0]
APPLY_FEATURE_SCALING = False
formula = lambda X: X[:,2] - X[:,0] + 5*X[:,1]**2 + (abs(X[:,3])*100)**2

### Generating random numbers using normal distribution

In [3]:
def rnd_norm():
    X = []
    for itr in range(FEATURES):
        X.append(np.random.normal(size=ROWS))
    X = np.asarray(X)
    X = X.reshape(ROWS,FEATURES)
    return X
    

### GENERATE NUMBER USING RANGES 

In [4]:
def rnd_rng():
    X = []
    for feat_ in range(FEATURES):
        temp_ = []
        for rows_ in range(ROWS):
            temp_.append(uniform(LOW_RANGE, HIGH_RANGE))
        X.append(temp_)
        temp_ = []
    X = np.asarray(X)
    X = X.reshape(ROWS,FEATURES)
    return X

### MANUALLY SET X

In [5]:
def mnl_x():
    X = [[4.77483186, 3.39440415, 57.05801446, 6.71629158],
 [6.61347886, 7.2255493,  122.77241787, 7.31532667],
 [4.5746061,  6.94376097, 1.57398063, 7.61596309],
 [3.99442041, 7.74271857, -22.18103276, 8.69864701],
 [7.74195296, 8.66056046, 1000.71418768, 8.66512174],
 [0.70227095, 8.48112974, 12.18087887, 9.98354661],
 [3.30624855, 1.28647579, 29.10406468, 8.96630542],
 [5.7062847,  7.72417331, 56.45891841, 5.15614429],
 [3.70369596, 1.79230683, 157.60607582, 9.4736015 ],
 [5.36628365, 9.68201719, 888.00342486, 2.75472882]]
    X = np.asarray(X)
    return X

### SELECT GENERATION METHOD

In [6]:
if 'random_normal_dist' == SELECTED_GENERATOR:
    X = rnd_norm()
elif 'random_normal_range' == SELECTED_GENERATOR:
    X = rnd_rng()
elif 'set_manual_x' == SELECTED_GENERATOR:
    X = mnl_x()

### Generating random numbers Y

In [7]:
Y = formula(X)
print(np.shape(Y))
print(Y)

(10,)
[10506.81355281 29068.70270821  3724.22666756    35.29256846
  2461.95590431  2531.86120472 10142.38020866  1650.15917647
  7934.60839792  1132.28396458]


### Splitting the dataset into the Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

### Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler
if APPLY_FEATURE_SCALING:
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

### Applying PCA

In [10]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 1)
#print(X_train)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
print(X_train_pca)
#print(X_test_pca)
# print(np.shape(X_train),np.shape(X_test))
#print(explained_variance)

[[ 1.15313159]
 [-0.53952908]
 [-0.10726384]
 [ 0.17536468]
 [ 0.37976124]
 [-1.47607555]
 [ 3.01147748]
 [-2.59686653]]


### Applying LDA

In [11]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
# lda = LDA(n_components = None)
# X_train_lda = lda.fit_transform(X_train, y_train)
# X_test_lda = lda.transform(X_test)

### Applying Kernel PCA

In [12]:
from sklearn.decomposition import KernelPCA
# kpca = KernelPCA(n_components = 2, kernel = 'rbf')
# X_train_pca = kpca.fit_transform(X_train)
# X_test_pca = kpca.transform(X_test)

### Calculating DV

In [13]:
# sort X_train_pca and y_train by index
sorted_indexes = np.argsort(X_train_pca,axis=0)
sorted_x_train_pca = X_train_pca[sorted_indexes]
sorted_y_train = y_train[sorted_indexes]

# Finding Change of Slope
slope1 = []
slope2 = []
for itr in range(1,len(sorted_x_train_pca)):
    slope1.append((sorted_y_train[itr]-sorted_y_train[itr-1])/(sorted_x_train_pca[itr]-sorted_x_train_pca[itr-1]))
for itr in range(1,len(slope1)):
    slope2.append((slope1[itr]-slope1[itr-1])/(sorted_x_train_pca[itr]-sorted_x_train_pca[itr-1]))

# normalize slope2 
normalized_slope2 = (slope2-min(slope2))/(max(slope2)-min(slope2))

# Calculating Quantiles
normalized_slope2 = np.reshape(normalized_slope2,len(normalized_slope2))
quantile_ranges = pd.qcut(normalized_slope2,NUMBER_OF_REGIONS,labels=False,retbins=True)
quantile_ranges = quantile_ranges[1]

# Adding the difficult Vectors
difficult_points = {}
for q_ind in range(NUMBER_OF_REGIONS):
    low = quantile_ranges[q_ind]
    high = quantile_ranges[q_ind+1]
    difficult_points[q_ind] = []
    for n_ind in range(0,len(normalized_slope2)):
        if normalized_slope2[n_ind] >= low and normalized_slope2[n_ind] <= high:
            difficult_points[q_ind].append(n_ind)
print(difficult_points)

{0: [0, 2], 1: [1, 5], 2: [3, 4]}


### Training 

In [14]:
from gplearn.genetic import SymbolicRegressor


In [15]:
est_gp = SymbolicRegressor(population_size=200,
                           generations=20, stopping_criteria=0.01,
                           p_crossover=0.7, p_subtree_mutation=0.1,
                           p_hoist_mutation=0.05, p_point_mutation=0.1,
                           max_samples=0.9, verbose=1,
                           parsimony_coefficient=0.01, random_state=0)

In [36]:
final_train_X = np.reshape(sorted_x_train_pca,(len(sorted_x_train_pca),1))
final_train_Y = np.reshape(sorted_y_train,(len(sorted_y_train),1))
est_gp.fit(final_train_X,final_train_Y)

  y = column_or_1d(y, warn=True)


    |    Population Average   |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    46.36 8609.485774353489       63 4043.5247695469393 29093.147720438188      4.33s
   1    44.89 7093.435950247986       63 4043.4849761709074 29094.37667615314      6.16s
   2    36.01 7118.205869947984       21 4039.060054533296 29068.209231878474      6.36s
   3    25.22 7190.03705608787       51 4033.639354668491 29106.519163274883      6.17s
   4    26.12 7291.795924910594       31 4038.7165558180463 29065.896686445623      6.05s
   5    32.75 7188.169015668722       31 4038.7165558180463 29065.896686445623      5.89s
   6    36.96 7148.7560677936835       31 4041.269222759722 29065.91453424749      5.61s
   7    37.27 7147.318640490105       49 4038.8165793952053 29068.70528130359      5.31s
   8    44.42 7175.602126331979       51 

SymbolicRegressor(const_range=(-1.0, 1.0),
         function_set=('add', 'sub', 'mul', 'div'), generations=20,
         init_depth=(2, 6), init_method='half and half', max_samples=0.9,
         metric='mean absolute error', n_jobs=1, p_crossover=0.7,
         p_hoist_mutation=0.05, p_point_mutation=0.1, p_point_replace=0.05,
         p_subtree_mutation=0.1, parsimony_coefficient=0.01,
         population_size=200, random_state=0, stopping_criteria=0.01,
         tournament_size=20, verbose=1, warm_start=False)

In [37]:
print(est_gp._program)

mul(sub(div(X0, -0.494), mul(sub(sub(0.220, X0), mul(-0.443, -0.387)), div(mul(add(sub(mul(div(div(X0, X0), mul(0.380, mul(X0, -0.847))), add(div(X0, div(sub(X0, X0), mul(mul(sub(0.905, X0), sub(-0.692, 0.377)), X0))), mul(X0, X0))), add(mul(add(mul(-0.286, -0.670), 0.260), div(X0, X0)), mul(sub(div(X0, -0.494), mul(-0.218, div(mul(add(add(X0, X0), mul(X0, X0)), add(0.359, div(0.260, X0))), sub(sub(mul(-0.692, 0.377), add(-0.212, 0.065)), add(add(0.712, X0), sub(mul(-0.286, -0.670), X0)))))), sub(div(X0, -0.494), add(-0.680, 0.823))))), mul(X0, X0)), add(sub(div(X0, -0.494), mul(sub(div(0.961, -0.149), mul(-0.443, -0.387)), div(mul(add(add(X0, X0), mul(X0, X0)), add(mul(-0.286, -0.670), 0.260)), sub(sub(sub(-0.692, 0.377), add(-0.212, 0.065)), add(div(0.712, X0), div(0.773, 0.306)))))), 0.260)), sub(sub(sub(-0.692, 0.377), add(-0.212, 0.065)), add(div(0.712, X0), div(0.773, 0.306)))))), sub(div(X0, -0.494), mul(X0, X0)))
