In [3]:
# Onur Can 
# Project is done for Prof. Mehmet Gönen's DASC 521: Introduction to Machine Learning @ Koç University MSc Data Science Program
# Thanks Prof Mehmet for the dataset generation and instructions

import matplotlib.pyplot as plt
import numpy as np
import math as math

# 0. Data Preparation

In [4]:
# Read the raw CSV (comma separated; the first row is a header and is skipped)
data_set = np.genfromtxt("../input/volcan-dataset/volcano_data_set.csv", delimiter = ",", skip_header = 1)

# Rows 0-149 form the training set, every remaining row forms the test set
training_set, test_set = data_set[:150, :], data_set[150:, :]
print(training_set.shape)
print(test_set.shape)

# Column 0 is used as the input x; column 1 is the response and is cast to int
x_train, y_train = training_set[:, 0], training_set[:, 1].astype(int)
x_test, y_test = test_set[:, 0], test_set[:, 1].astype(int)
# Number of test rows, used later as the RMSE denominator
N_test = len(test_set)

print(x_train.shape)

# 1. Regressogram
### 1.1. Calculating G_hat for training data

In [5]:
bin_width = 0.37
origin = +1.5
end_point = +5.2  # stop value for np.arange below: left borders stay strictly under it

#--------------------------------------------------------------------
# Regressogram: cut the x axis into bins of equal width starting at origin and
# predict, for any x, the AVERAGE response of the training points in its bin:
#   g_hat(x) = sum_t b(x, x_t) * r_t / sum_t b(x, x_t)
# where b(x, x_t) is 1 if x_t lies in the same bin as x and 0 otherwise.
#--------------------------------------------------------------------

#Bin Settings
left_borders = np.arange(origin, end_point, bin_width)
# Derive the right borders from the left ones so that both arrays always have
# the same length. Two independent np.arange calls with a non-integer step can
# differ by one element because of floating point round-off.
right_borders = left_borders + bin_width
print(left_borders)
print(right_borders)
print("Number of Bins = ", len(left_borders))

# Membership matrix, computed once and reused for numerator and denominator:
# in_bin[b, t] is True when left_borders[b] < x_train[t] <= right_borders[b]
in_bin = (left_borders[:, None] < x_train[None, :]) & (
    x_train[None, :] <= right_borders[:, None])

#Calculating Denominator of G_hat (number of training points per bin)
Bin_Number_of_observations = in_bin.sum(axis=1)
print("\nNumber of elements in each Bin from left to right")
print(Bin_Number_of_observations)

#Calculating numerator of G_hat (sum of training responses per bin)
Bin_R_Values_Sum = (in_bin * y_train).sum(axis=1).astype(float)
print("\nSum of R values in each Bin from left to right")
print(Bin_R_Values_Sum)

#Calculating G_hat values
# An empty bin has no defined mean: it is left as NaN explicitly instead of
# evaluating 0 / 0 (which yields the same NaN but raises a RuntimeWarning).
g_hat = np.divide(
    Bin_R_Values_Sum,
    Bin_Number_of_observations,
    out=np.full(Bin_R_Values_Sum.shape, np.nan),
    where=Bin_Number_of_observations > 0,
)
print("\nG_hat values for each Bin from left to right")
print(g_hat)


### 1.2. Plotting Training & Test Data and Regressogram

In [6]:
# Scatter both data sets and overlay the regressogram as a black step function
plt.figure(figsize = (12,6))
for xs, ys, fmt, name in ((x_train, y_train, "b.", "training"),
                          (x_test, y_test, "r.", "test")):
    plt.plot(xs, ys, fmt, markersize = 10, label = name)

bin_count = len(left_borders)
# One horizontal segment per bin, drawn at that bin's g_hat level
for b in range(bin_count):
    level = g_hat[b]
    plt.plot([left_borders[b], right_borders[b]], [level, level], "k-")
# Vertical connectors joining each bin's level to the level of the next bin
for b in range(1, bin_count):
    edge = right_borders[b - 1]
    plt.plot([edge, edge], [g_hat[b - 1], g_hat[b]], "k-")

plt.xlabel("Eruption Time (min)")
plt.ylabel("Waiting time to next eruption (min)")
plt.legend(loc="upper left")
plt.show()

### 1.3. Regressogram Root Mean Squared Error (RMSE)


In [7]:
# Regressogram prediction for every test point (defaults to 0)
y_predicted = np.zeros(len(x_test))

# Walk through the bins in order and keep overwriting the prediction while the
# point lies above a bin's left border; the value that survives comes from the
# last such bin. A point that is not above any left border keeps the default 0.
for row, x_value in enumerate(x_test):
    for border, bin_mean in zip(left_borders, g_hat):
        if x_value > border:
            y_predicted[row] = bin_mean

# RMSE = sqrt( sum of squared errors / number of test points )
squared_errors = (y_predicted - y_test) ** 2
RMSE = np.sqrt(squared_errors.sum() / N_test)
print("----------------------------------------------------------")
print("Regressogram => RMSE is ", RMSE ," when h is ", bin_width)
print("----------------------------------------------------------")



# 2. Running Mean Smoother
### 2.1 Calculating G_hat for Training Set

In [8]:
end_point = +5.2
# 371 evenly spaced evaluation points from origin to end_point (both included);
# origin is the value set in the regressogram cell
data_interval = np.linspace(origin, end_point, 371 )
bin_width = 0.37

#--------------------------------------------------------------------
# Running mean smoother: instead of fixed bins anchored at an origin, centre a
# window of width h (= bin_width) on the query point x and AVERAGE the
# responses of the training points inside it:
#   g_hat(x) = sum_t w(u_t) * r_t / sum_t w(u_t),   u_t = (x - x_t) / h
# where w(u) is 1 if -1/2 <= u < 1/2 (i.e. x - h/2 < x_t <= x + h/2), else 0.
#--------------------------------------------------------------------

# Window membership, computed once and reused for numerator and denominator:
# in_window[i, t] is True when x_train[t] lies in the window around data_interval[i]
half_width = 0.5 * bin_width
in_window = (data_interval[:, None] - half_width < x_train[None, :]) & (
    x_train[None, :] <= data_interval[:, None] + half_width)

#Calculating Denominator of G_hat (number of training points per window)
Bin_Number_of_observations = in_window.sum(axis=1)

#Calculating numerator of G_hat (sum of training responses per window)
Bin_R_Values_Sum = (in_window * y_train).sum(axis=1).astype(float)

# A window without training points has no defined mean: it is left as NaN
# explicitly instead of evaluating 0 / 0 (same NaN, but with a RuntimeWarning).
g_hat = np.divide(
    Bin_R_Values_Sum,
    Bin_Number_of_observations,
    out=np.full(Bin_R_Values_Sum.shape, np.nan),
    where=Bin_Number_of_observations > 0,
)


### 2.2 Running Mean Smoother Root Mean Squared Error (RMSE)


In [9]:
# Running mean prediction for every test point: average the training responses
# that fall in the window of width bin_width centred on the test point.

# Window membership, computed once and reused for numerator and denominator:
# in_window[i, t] is True when x_test[i] - h/2 < x_train[t] <= x_test[i] + h/2
half_width = 0.5 * bin_width
in_window = (x_test[:, None] - half_width < x_train[None, :]) & (
    x_train[None, :] <= x_test[:, None] + half_width)

# Denominator: number of training points in each test point's window
Number_of_observations = in_window.sum(axis=1)

# Numerator: sum of the training responses in each test point's window
Bin_Values_Sum = (in_window * y_train).sum(axis=1)

# A test point with an empty window gives 0 / 0 = NaN here (NumPy emits a
# RuntimeWarning), and that NaN propagates into the RMSE below.
y_predicted = Bin_Values_Sum / Number_of_observations
#Root Mean Squared Error Formula
RMSE = np.sqrt(np.sum(((y_predicted - y_test)**2)) / N_test)

print("-----------------------------------------------------------------------")
print("Running Mean Smoother => RMSE is ", RMSE ," when h is ", bin_width)
print("-----------------------------------------------------------------------")

### 2.3 Plotting Training & Test Data and Running Mean Smoother Line

In [10]:
# Scatter both data sets and overlay the running mean smoother as a black line
plt.figure(figsize = (12,6))
for xs, ys, fmt, name in ((x_train, y_train, "b.", "training"),
                          (x_test, y_test, "r.", "test")):
    plt.plot(xs, ys, fmt, markersize = 10, label = name)
# Smoother values evaluated on the data_interval grid
plt.plot(data_interval, g_hat, "k-")
plt.xlabel("Eruption Time (min)")
plt.ylabel("Waiting time to next eruption (min)")
plt.legend(loc="upper left")
plt.show()

# 3. Kernel Smoother
### 3.1 Calculating G_hat for Training Set

In [11]:
#Parameters for G_Hat Calculation
h = +0.37          # kernel bandwidth
end_point = +5.2
origin = +1.5
# 371 evenly spaced evaluation points from origin to end_point (both included)
data_interval = np.linspace(origin, end_point, 371)

#--------------------------------------------------------------------
# Kernel smoother: every training point contributes to the estimate at x, with
# a weight that decreases smoothly as |x - x_t| grows. A Gaussian kernel is used:
#   K(u) = 1 / sqrt(2 * pi) * exp(-u^2 / 2),   u = (x - x_t) / h
#   g_hat(x) = sum_t K(u_t) * r_t / sum_t K(u_t)
#--------------------------------------------------------------------

# kernel_weights[i, t] = K((data_interval[i] - x_train[t]) / h), built with a
# single broadcast instead of a Python loop over grid points and training points
kernel_weights = 1 / np.sqrt(2 * math.pi) * np.exp(
    -0.5 * (data_interval[:, None] - x_train[None, :])**2 / h**2)

# Weighted average of the training responses at every grid point; the result
# has one value per element of data_interval
Kernel_Line = (kernel_weights * y_train).sum(axis=1) / kernel_weights.sum(axis=1)



### 3.2 Plotting Training & Test Data and Kernel Smoother Line

In [12]:
# Scatter both data sets and overlay the kernel smoother as a black line
plt.figure(figsize = (12,6))
for xs, ys, fmt, name in ((x_train, y_train, "b.", "training"),
                          (x_test, y_test, "r.", "test")):
    plt.plot(xs, ys, fmt, markersize = 10, label = name)
# Kernel smoother values evaluated on the data_interval grid
plt.plot(data_interval, Kernel_Line, "k-")
plt.xlabel("Eruption Time (min)")
plt.ylabel("Waiting time to next eruption (min)")
plt.legend(loc="upper left")
plt.show()

### 3.3 Kernel Smoother Root Mean Squared Error (RMSE)

In [13]:
# Kernel smoother prediction for every test point:
#   g_hat(x) = sum_t K(u_t) * r_t / sum_t K(u_t),   u_t = (x - x_t) / h
# with the Gaussian kernel K(u) = 1 / sqrt(2 * pi) * exp(-u^2 / 2).

# kernel_weights[i, t] = K((x_test[i] - x_train[t]) / h), built with a single
# broadcast instead of a Python loop over test points and training points
kernel_weights = 1 / np.sqrt(2 * math.pi) * np.exp(
    -0.5 * (x_test[:, None] - x_train[None, :])**2 / h**2)

# Weighted average of the training responses for each test point
y_predicted = (kernel_weights * y_train).sum(axis=1) / kernel_weights.sum(axis=1)

#RMSE Formulation
RMSE = np.sqrt(np.sum(((y_predicted - y_test)**2)) / N_test)

print("-----------------------------------------------------------------------")
# Report h, the bandwidth actually used above, rather than bin_width, which is
# a leftover from the running mean smoother cell.
print("Kernel Mean Smoother => RMSE is ", RMSE ," when h is ", h)
print("-----------------------------------------------------------------------")