## The SUM Data Set (without noise) - Linear Regression

In [1]:
import pandas as pd
import numpy as np
import os

# specify data set path
data_path = os.path.abspath('Data Sets/The SUM dataset/without noise/The SUM dataset, without noise.csv')

# read CSV file directly from path and save the results
data = pd.read_csv(data_path, sep=';', index_col = 0) # 'sep' specifies separator used in the CSV file

# display the first 5 rows
data.head()

Unnamed: 0_level_0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5 (meaningless),Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,Target,Target Class
Instance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,57326,68791,82549,99059,72624,142645,171174,205409,246491,295789,1073444,Very Large Number
2,87859,105431,126517,151820,19982,218621,262345,314814,377777,453332,1645184,Very Large Number
3,23721,28465,34158,40990,20054,59026,70831,84997,101996,122395,444184,Large Number
4,24771,29725,35670,42804,7775,61638,73966,88759,106511,127813,463844,Large Number
5,47862,57434,68921,82705,60872,119095,142914,171497,205796,246955,896224,Very Large Number


In [2]:
# check the shape of the DataFrame (rows, cols)
data.shape

(968135, 12)

In [3]:
# create a python list of feature names
feature_cols = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']



# use the list to create a subset of the original DataFrame (X)
X = data.loc[:100,feature_cols] # for 100 rows select the 'feature cols'

# alternative version
# X = data.loc[:100,['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']]


# select the Target column as the response (Y)  
y = data.Target[:100] # select first 100 elements from the Target



In [4]:
# check shape of X and y
print(X.shape)
print(y.shape)

(100, 9)
(100,)


In [16]:
# 10-fold cross validation with linear regression, using RMSE (root mean squared error) metric 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

linear_reg = LinearRegression() # instantiate model

scores = cross_val_score(linear_reg, X, y, cv=10, scoring='neg_mean_squared_error')

# note: cross_val_score takes care of splitting the data into folds, 
# so we don't need to split the data ourselves using train/test split

# fix the sign of MSE scores
mse_scores = -scores
print (mse_scores)

[  1.21972744e-20   8.16539761e-20   6.30192513e-20   7.42847895e-20
   4.92973175e-20   1.15535294e-19   1.52465931e-20   2.78303263e-20
   6.98934530e-21   6.44592073e-20]


In [17]:
# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)
print (rmse_scores)

[  1.10441271e-10   2.85751599e-10   2.51036354e-10   2.72552361e-10
   2.22029992e-10   3.39904831e-10   1.23477095e-10   1.66824238e-10
   8.36023044e-11   2.53888179e-10]


In [7]:
# calculate average RMSE
print (rmse_scores.mean())

2.1095082248e-10


In [8]:
# create a python list of feature names
feature_cols = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']



# use the list to create a subset of the original DataFrame (X)
X1 = data.loc[:500,feature_cols] # for 500 rows select the 'feature cols'

# select the Target column as the response (Y)  
y1 = data.Target[:500] # select first 100 elements from the Target

scores = cross_val_score(linear_reg, X1, y1, cv=10, scoring='neg_mean_squared_error')

# note: cross_val_score takes care of splitting the data into folds, 
# so we don't need to split the data ourselves using train/test split

# fix the sign of MSE scores
mse_scores = -scores

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)

# calculate average RMSE
print (rmse_scores.mean())

2.16977235647e-10


In [9]:
# create a python list of feature names
feature_cols = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']



# use the list to create a subset of the original DataFrame (X)
X2 = data.loc[:1000,feature_cols] # for 500 rows select the 'feature cols'

# select the Target column as the response (Y)  
y2 = data.Target[:1000] # select first 100 elements from the Target

scores = cross_val_score(linear_reg, X2, y2, cv=10, scoring='neg_mean_squared_error')

# note: cross_val_score takes care of splitting the data into folds, 
# so we don't need to split the data ourselves using train/test split

# fix the sign of MSE scores
mse_scores = -scores

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)

# calculate average RMSE
print (rmse_scores.mean())

1.85330994948e-10


In [10]:
# create a python list of feature names
feature_cols = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']


# use the list to create a subset of the original DataFrame (X)
X3 = data.loc[:5000,feature_cols] # for 500 rows select the 'feature cols'

# select the Target column as the response (Y)  
y3 = data.Target[:5000] # select first 100 elements from the Target

scores = cross_val_score(linear_reg, X3, y3, cv=10, scoring='neg_mean_squared_error')

# note: cross_val_score takes care of splitting the data into folds, 
# so we don't need to split the data ourselves using train/test split

# fix the sign of MSE scores
mse_scores = -scores

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)

# calculate average RMSE
print (rmse_scores.mean())

2.12080415152e-10


In [11]:
# create a python list of feature names
feature_cols = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']



# use the list to create a subset of the original DataFrame (X)
X4 = data.loc[:10000,feature_cols] # for 500 rows select the 'feature cols'

# select the Target column as the response (Y)  
y4 = data.Target[:10000] # select first 100 elements from the Target

scores = cross_val_score(linear_reg, X4, y4, cv=10, scoring='neg_mean_squared_error')

# note: cross_val_score takes care of splitting the data into folds, 
# so we don't need to split the data ourselves using train/test split

# fix the sign of MSE scores
mse_scores = -scores

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)

# calculate average RMSE
print (rmse_scores.mean())

3.4512005982e-10


In [12]:
# create a python list of feature names
feature_cols = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']



# use the list to create a subset of the original DataFrame (X)
X5 = data.loc[:50000,feature_cols] # for 500 rows select the 'feature cols'

# select the Target column as the response (Y)  
y5 = data.Target[:50000] # select first 100 elements from the Target

scores = cross_val_score(linear_reg, X2, y2, cv=10, scoring='neg_mean_squared_error')

# note: cross_val_score takes care of splitting the data into folds, 
# so we don't need to split the data ourselves using train/test split

# fix the sign of MSE scores
mse_scores = -scores

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)

# calculate average RMSE
print (rmse_scores.mean())

1.85330994948e-10


In [13]:
# create a python list of feature names
feature_cols = ['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Feature 6', 'Feature 7', 'Feature 8', 'Feature 9', 'Feature 10']



# use the list to create a subset of the original DataFrame (X)
X6 = data.loc[:100000,feature_cols] # for 500 rows select the 'feature cols'

# select the Target column as the response (Y)  
y6 = data.Target[:100000] # select first 100 elements from the Target

scores = cross_val_score(linear_reg, X6, y6, cv=10, scoring='neg_mean_squared_error')

# note: cross_val_score takes care of splitting the data into folds, 
# so we don't need to split the data ourselves using train/test split

# fix the sign of MSE scores
mse_scores = -scores

# convert from MSE to RMSE
rmse_scores = np.sqrt(mse_scores)

# calculate average RMSE
print (rmse_scores.mean())

6.2798077502e-10
