In [1]:
import pandas as pd
import numpy as np
import random
import time

In [2]:
#df = pd.read_csv (r'C:\Notebook scripts\Features_Variant_1.csv')
df = pd.read_csv ('Features_Variant_1.csv')

data_train=np.asarray(df.iloc[:,:])
#print(np.shape(data_train))

#shuffle before folding
np.random.shuffle(data_train)

In [3]:
#fold_1, fold_2, fold_3, fold_4, fold_5= np.array_split(data_train,5)
data_train_arr=np.array_split(data_train,5) #{[],[],[],[],[]}

y_train_arr=[]
x_train_arr=[]
for i in range(len(data_train_arr)):
    y_train_arr.append(data_train_arr[i][:,-1].reshape((-1, 1)) )
    x_train_arr.append(data_train_arr[i][:,:-1])


In [4]:
def normalization(X):
    mean=np.mean(X,axis=0)#(1,53)
    std=np.std(X,axis=0)
    np.place(std, std==0, 1)#if std==0 then change it to 1 for division #std_item[std_item==0]=1 
    x_normalized=(X-mean)/std
    return x_normalized,mean,std
def normalization_with_params(X,mean,std):
    return (X-mean)/std

In [5]:
def linear_regression_predict(X,omega,omega0):
    prod=X.dot(omega)#(8190,1)
    #print(np.shape(y+prod))
    return prod+omega0

def root_mean_square_error(X,omega,omega0,y_actual):
    y_predict=linear_regression_predict(X,omega,omega0)
    summands=np.power(y_actual-y_predict,2)
    #print(np.shape(summands))
    mse=np.sum(summands)/len(y_predict)
    return np.sqrt(mse)
   
def r_squared(X,omega,omega0,y_actual):
    y_predict=linear_regression_predict(X,omega,omega0)
    nom=np.sum(np.power(y_actual-y_predict,2))
    denom=np.sum(np.power(y_actual-np.mean(y_actual,axis=0),2))
    return 1-nom/denom

In [6]:
def batch_gradient_descent(X,y_train,learning_rate,num_iterations):
    #omega = np.matrix([0.0 for i in range(num_parameters)])
    omega=np.random.sample((X.shape[1],1)) #np.zeros((X.shape[1],1))
    omega0=0
    cost = [] 
    # init omega
    #cost = [0.0 for i in range(num_iterations)]
    it=0
    for it in range(num_iterations): 
        N=len(X)
        error=y_train-linear_regression_predict(X,omega,omega0)
        omega=omega+learning_rate*2*((X.T).dot(error))/N
        omega0=omega0+learning_rate*2*np.sum(error)/N
        cost[it] = r_squared(X,omega,omega0,y_train)
    return omega,omega0,cost

In [7]:
# function to create a list containing mini-batches 
def create_mini_batch(data,i, batch_size): 
    mini_batch = data[i * batch_size:(i + 1)*batch_size, :] 
    X_mini = mini_batch[:, :-1] 
    Y_mini = mini_batch[:, -1].reshape((-1, 1)) 
        
    if data.shape[0] % batch_size != 0: 
        mini_batch = data[i * batch_size:data.shape[0]] 
        X_mini = mini_batch[:, :-1] 
        Y_mini = mini_batch[:, -1].reshape((-1, 1)) 
       
    return X_mini,Y_mini

def mini_batch_gradient_descent(X,y_train,learning_rate,num_epoches,batch_size):
    #omega = np.matrix([0.0 for i in range(num_parameters)])
    omega=np.random.sample((X.shape[1],1)) #np.zeros((X.shape[1],1))
    omega0=0
    #cost = [] 
    r2_tr=[]
    rmse_tr=[]
   
    it=0
    for it in range(num_epoches): 
        data = np.hstack((X, y_train)) 
        np.random.shuffle(data) #shuffle at each epoch (not every batch creation)
        n_minibatches = X.shape[0] // batch_size 
        i = 0
        for i in range(n_minibatches + 1):
            X_mini,y_mini= create_mini_batch(data,i, batch_size) 
        
            N=len(X_mini)
            error=y_mini-linear_regression_predict(X_mini,omega,omega0)
            omega=omega+learning_rate*2*((X_mini.T).dot(error))/N
            omega0=omega0+learning_rate*2*np.sum(error)/N
            r2_tr.append(r_squared(X_mini,omega,omega0, y_mini))
            rmse_tr.append(root_mean_square_error(X_mini,omega,omega0, y_mini))
    return omega,omega0

In [8]:
r2=[]
rmse=[]
omega=[]
omega0=[]
r2_train=[]
rmse_train=[]
for item in range(len(x_train_arr)):
    x_test_rd=x_train_arr[item]
    y_test=y_train_arr[item]
    x_train_folds_arr=np.delete(x_train_arr,item)
    y_train_folds_arr=np.delete(y_train_arr,item)
    
    x_train_rd=np.empty((0,x_train_arr[item].shape[1]), int)
    y_train=np.empty((0,y_train_arr[item].shape[1]), int)
    #print(np.shape(y_train_folds_arr))
    for i in range (len(x_train_folds_arr)):
        #print(np.shape(y_train_folds_arr[i]))
        #x_train_all55 =(np.vstack((x_train_all55,x_train_folds_arr[i])))
        x_train_rd=np.concatenate((x_train_rd,x_train_arr[i]))
        y_train=np.concatenate((y_train,y_train_arr[i]))
    #print(np.shape(y_train))
    x_train,mean,std=normalization(x_train_rd)     #normalize all train dataset
    x_test=normalization_with_params(x_test_rd,mean,std)  #normalize test dataset with mean and std of train
    #print(x)
    #start = time.time()
    omega_item,omega0_item=mini_batch_gradient_descent(x_train,y_train,learning_rate = 0.001,num_epoches=18,
                                                            batch_size=800)
    #stop = time.time()
    #duration = stop-start
    #print(duration)
    r2.insert(item,r_squared(x_test,omega_item,omega0_item,y_test))
    rmse.insert(item,root_mean_square_error(x_test,omega_item,omega0_item,y_test))
    omega.insert(item,omega_item)
    omega0.insert(item,omega0_item)
    r2_train.insert(item,r_squared(x_train,omega_item,omega0_item,y_train))
    rmse_train.insert(item,root_mean_square_error(x_train,omega_item,omega0_item,y_train))
    #print(cost)
    print(r2[item],rmse[item],r2_train[item],rmse_train[item])

0.3578946347063209 30.063347577617833 0.32724459204128575 28.626120971692643
0.31763605273608964 28.28945222185808 0.32697022000653686 28.631957718842195
0.3257396204015335 28.48496833827866 0.3267305285689389 28.637055728165596
0.29469612635553044 27.70566456047693 0.32536382443724854 28.66610690449708
0.21841619880190533 33.3975048592421 0.3271905712274372 28.627270256627323


In [9]:

# initialise data of lists. 
data = {'1':[rmse[0],r2[0],rmse_train[0],r2_train[0],omega0[0]], 
        '2':[rmse[1],r2[1],rmse_train[0],r2_train[1],omega0[1]],
        '3':[rmse[2],r2[2],rmse_train[2],r2_train[2],omega0[2]],
        '4':[rmse[3],r2[3],rmse_train[3],r2_train[3],omega0[3]],
        '5':[rmse[4],r2[4],rmse_train[4],r2_train[4],omega0[4]],
        'E':[np.mean(rmse),np.mean(r2),np.mean(rmse_train),np.mean(r2_train),np.mean(omega0)],
        'SD':[np.std(rmse),np.std(r2),np.std(rmse_train),np.std(r2_train), np.std(omega0)]} 
 
# Creates pandas DataFrame. 
df1 = pd.DataFrame(data, index =['RMSE', 'R^2','RMSE train','R^2 train','omega0']) 
df2 = pd.DataFrame(np.concatenate((np.hstack(omega),np.mean(omega,axis=0),np.std(omega,axis=0)),axis=1),
                   columns=['1', '2', '3','4','5','E','SD'])
df =pd.concat([df1, df2], axis=0)
df

Unnamed: 0,1,2,3,4,5,E,SD
RMSE,30.063348,28.289452,28.484968,27.705665,33.397505,29.588188,2.058367
R^2,0.357895,0.317636,0.32574,0.294696,0.218416,0.302877,0.046836
RMSE train,28.626121,28.626121,28.637056,28.666107,28.62727,28.637702,0.014719
R^2 train,0.327245,0.32697,0.326731,0.325364,0.327191,0.3267,0.000692
omega0,5.66274,5.619282,5.678759,5.674576,5.651793,5.65743,0.02128
0,-0.698615,-0.428353,-0.437821,-0.421706,-0.359029,-0.469105,0.118049
1,-0.475147,-0.511947,-0.390036,-0.474535,-0.341458,-0.438625,0.062931
2,-0.051332,-0.061293,-0.255216,-0.336546,-0.010436,-0.142965,0.128612
3,0.18057,0.174859,0.205012,0.185556,0.289167,0.207033,0.042301
4,0.500037,-0.065962,-0.510427,0.461258,0.450389,0.167059,0.397741


In [17]:
df3=pd.DataFrame(np.vstack(x_train_arr))
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
df3
#print((np.vstack(x_train_arr)))
                   

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
0,1576517.0,0.0,33793.0,14.0,0.0,1053.0,159.034783,98.0,175.752194,0.0,782.0,64.921739,29.0,106.701465,0.0,704.0,63.791304,23.0,104.370619,0.0,782.0,138.026087,86.0,152.310350,-672.0,782.0,1.130435,-5.0,146.819767,285.0,8.0,120.0,260.0,-112.0,53.0,91.0,901.0,0.0,24.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1847524.0,0.0,186456.0,18.0,13.0,1152.0,348.583333,260.5,333.268775,1.0,271.0,78.000000,42.0,82.024387,0.0,627.0,171.666667,115.5,185.139287,13.0,1025.0,322.000000,243.5,299.331477,-514.0,213.0,-93.666667,-87.5,168.901516,46.0,1.0,16.0,43.0,-15.0,59.0,46.0,29.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,104037.0,0.0,30.0,14.0,0.0,113.0,6.748691,4.0,11.213291,0.0,26.0,2.104712,0.0,4.252444,0.0,57.0,2.565445,0.0,5.719871,0.0,106.0,6.445026,4.0,10.632651,-47.0,26.0,-0.460733,0.0,6.926486,9.0,0.0,2.0,9.0,-2.0,55.0,85.0,15.0,0.0,24.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,416948.0,0.0,57260.0,18.0,0.0,1122.0,90.872951,52.5,112.729348,0.0,1122.0,34.967213,8.5,87.066202,0.0,381.0,29.979508,11.5,51.041886,0.0,1122.0,86.422131,51.0,108.689238,-326.0,1122.0,4.987705,-4.0,102.201684,60.0,60.0,0.0,60.0,60.0,5.0,90.0,95.0,0.0,24.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5077.0,3149.0,449.0,27.0,0.0,50.0,5.290909,3.0,9.263168,0.0,19.0,1.836364,0.0,3.676775,0.0,41.0,2.290909,0.0,6.966603,0.0,44.0,4.854545,2.0,8.343573,-36.0,12.0,-0.454545,0.0,6.210894,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,24.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,57916.0,0.0,4150.0,80.0,0.0,64.0,16.352941,14.0,13.329642,0.0,33.0,5.843137,2.0,7.357463,0.0,60.0,8.274510,4.0,11.998494,0.0,60.0,15.176471,12.0,12.227750,-56.0,33.0,-2.431373,-2.0,14.625059,18.0,1.0,4.0,17.0,-3.0,55.0,502.0,55.0,0.0,24.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,2942636.0,1.0,854.0,1.0,0.0,535.0,48.730769,17.0,102.946796,0.0,535.0,27.615385,1.5,102.108945,0.0,95.0,13.615385,4.5,24.867459,0.0,535.0,46.423077,15.0,102.256906,-94.0,535.0,14.000000,-2.0,107.577879,10.0,1.0,1.0,9.0,0.0,54.0,177.0,27.0,0.0,24.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,1128745.0,0.0,52663.0,9.0,0.0,614.0,48.874016,20.0,94.548658,0.0,245.0,18.370079,3.0,38.411121,0.0,500.0,24.842520,5.0,70.107053,0.0,576.0,46.984252,20.0,88.623950,-439.0,245.0,-6.472441,-4.0,63.274284,19.0,1.0,5.0,18.0,-4.0,52.0,27.0,57.0,0.0,24.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
8,292911.0,0.0,25073.0,24.0,0.0,212.0,40.290000,25.5,44.615310,0.0,132.0,14.230000,4.5,24.646239,0.0,141.0,16.610000,4.5,28.208827,0.0,184.0,38.050000,24.5,41.154678,-97.0,132.0,-2.380000,-2.0,35.700919,0.0,0.0,0.0,0.0,0.0,11.0,185.0,3.0,0.0,24.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,3319.0,10.0,1.0,17.0,0.0,30.0,3.619048,2.0,4.906842,0.0,18.0,1.373016,0.0,2.753740,0.0,16.0,1.261905,0.0,2.876226,0.0,19.0,3.365079,2.0,4.258704,-14.0,18.0,0.111111,0.0,3.703757,11.0,1.0,4.0,10.0,-3.0,52.0,110.0,6.0,0.0,24.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
#np.shape(np.mean(x_train_arr,axis=0))
mean_arr=[]
std_arr=[]
for i in range (len(x_train_arr)):
        mean_arr.append(np.mean(x_train_arr[i],axis=0))
        std_arr.append(np.std(x_train_arr[i],axis=0))
        
df4=pd.DataFrame(np.vstack(mean_arr),index =['Mean 1 fold', 'Mean 2 fold','Mean 3 fold','Mean 4 fold','Mean 5 fold'])
df5=pd.DataFrame(np.vstack(std_arr),index =['Std 1 fold', 'Std 2 fold','Std 3 fold','Std 4 fold','Std 5 fold'])
df =pd.concat([df4, df5], axis=0)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
Mean 1 fold,1392565.0,4406.61746,45082.559219,24.567277,1.460562,436.128205,55.222867,35.592674,66.591279,0.203663,282.969719,22.143057,7.618498,40.192583,0.009035,266.328694,19.628214,5.047497,38.481981,1.396825,409.109402,52.187358,33.976679,62.373055,-217.044811,272.592918,2.514843,-2.074725,55.376042,55.262271,22.178632,20.956288,52.277411,1.222344,35.228694,163.935287,113.053114,0.0,23.76337,0.125763,0.142491,0.151038,0.164347,0.139683,0.141514,0.135165,0.138706,0.127839,0.137851,0.151038,0.150183,0.146154,0.14823
Mean 2 fold,1218914.0,4674.440904,44340.469597,24.144567,1.640049,446.604274,55.721711,35.426435,67.860515,0.364347,287.385592,22.28065,7.516422,40.630072,0.026007,271.948596,19.604436,4.823016,38.978202,1.556166,418.803053,52.679145,33.853114,63.59397,-223.647253,277.316361,2.676214,-1.768926,56.092333,56.255556,22.071917,19.677045,53.133822,2.394872,35.493651,160.589499,117.54188,0.0,23.764713,0.117216,0.139927,0.14884,0.158852,0.147131,0.151282,0.136752,0.149451,0.126496,0.130403,0.147863,0.152381,0.145055,0.148352
Mean 3 fold,1377329.0,4660.218926,45452.79243,24.029304,1.449206,445.306593,55.424534,35.467827,67.208323,0.162759,284.535775,22.018498,7.560073,40.15234,0.008181,267.562759,19.626701,4.949267,38.605278,1.380586,417.804396,52.332795,33.847314,62.904346,-220.039194,275.467766,2.391797,-2.092186,55.636143,55.433089,22.303785,18.446154,52.308791,3.857631,35.432723,161.949695,120.728571,0.0,23.757998,0.124664,0.146398,0.150427,0.1558,0.13956,0.146764,0.136386,0.13663,0.13895,0.136508,0.155189,0.144444,0.145177,0.143101
Mean 4 fold,1215110.0,4981.956283,44951.550006,24.297961,1.623031,446.515936,56.346832,36.031811,68.110777,0.191965,289.543656,22.469033,7.448773,41.197608,0.001221,269.049334,19.678585,4.918244,38.762026,1.540237,417.822201,53.209577,34.455184,63.700526,-221.0381,280.450727,2.790448,-2.065637,56.572583,55.597753,22.689828,19.039443,52.61778,3.650385,35.23153,167.345708,110.425449,0.0,23.762608,0.12297,0.14605,0.152644,0.152888,0.144828,0.143607,0.137013,0.138112,0.133716,0.143119,0.14605,0.157895,0.139333,0.141776
Mean 5 fold,1365227.0,4658.041275,44179.243497,24.237636,1.758579,442.070216,55.891497,35.713213,67.549583,0.174624,281.438881,22.023794,7.375748,40.194946,0.076078,266.925388,19.712588,4.870253,38.820458,1.612651,413.389791,52.75429,34.083649,63.201836,-218.48211,271.126145,2.311207,-2.103309,55.526219,56.060081,21.691782,20.131884,52.826597,1.559897,35.219929,164.442423,124.514226,0.0,23.790451,0.121138,0.141653,0.145805,0.155452,0.14898,0.147637,0.139333,0.145317,0.135548,0.14031,0.142875,0.149347,0.143729,0.142875
Std 1 fold,8677297.0,20045.682365,117923.157307,20.517316,11.926713,490.370839,86.78032,70.603501,81.603678,4.907748,373.04658,36.995867,22.029979,54.138016,0.449573,326.840387,31.95815,14.384008,51.391006,11.231618,466.971019,81.355004,66.907514,76.481245,280.765854,371.918347,16.134175,13.477244,74.062177,133.897546,75.574162,75.281246,125.657275,96.997484,20.880919,405.24233,513.902051,0.0,1.905567,0.331582,0.349553,0.358086,0.37059,0.346657,0.348551,0.3419,0.345639,0.33391,0.344744,0.358086,0.357251,0.35326,0.355327
Std 2 fold,2754130.0,20423.635071,93249.123534,19.857486,24.303205,500.179872,86.060526,68.700174,81.858985,21.32831,377.510858,40.945778,27.846192,54.912563,1.132571,330.287286,30.403062,12.184255,50.742865,23.792606,475.663916,81.168444,65.513431,76.879851,286.127524,376.455384,26.19142,24.028308,73.892747,136.410354,78.669183,69.063195,128.157722,95.387948,20.822223,339.695863,529.575794,0.0,1.947341,0.321678,0.346911,0.355931,0.365538,0.354236,0.358324,0.343585,0.356532,0.332407,0.336746,0.354964,0.35939,0.352156,0.355448
Std 3 fold,8262598.0,20824.259354,113337.657419,19.944407,12.840823,505.853729,85.387714,67.491018,81.912544,2.721193,378.195447,35.60383,19.894924,54.076484,0.687269,329.404197,31.186762,12.206456,51.694703,12.253034,481.079045,79.997778,64.182314,76.538355,282.744202,377.108324,13.457545,10.093721,74.471762,137.150472,77.913103,66.079378,127.991611,91.505102,21.130568,351.48854,981.659055,0.0,1.999705,0.330338,0.353505,0.35749,0.362665,0.34653,0.353871,0.343198,0.343456,0.345894,0.343327,0.362085,0.35154,0.352279,0.350176
Std 4 fold,3246940.0,21468.710652,98604.867462,19.773942,16.955983,495.574453,87.682119,70.733252,81.235913,3.027376,378.520143,35.913263,18.927172,55.101639,0.071605,324.583275,30.960597,13.823436,49.722031,15.185128,471.360921,81.788275,67.018315,75.998053,280.654989,377.697617,13.514504,9.497678,74.192698,139.598507,81.741246,71.660299,131.199089,97.303308,20.967486,382.463293,418.308255,0.0,1.944539,0.328403,0.353156,0.359644,0.35988,0.351928,0.350691,0.343861,0.345018,0.340347,0.350194,0.353156,0.364642,0.346294,0.34882
Std 5 fold,8276774.0,20167.293246,127909.096074,19.642542,31.115367,491.233157,88.711958,72.173684,81.218714,3.150793,364.64876,34.878711,18.887126,53.129066,4.202872,324.118669,30.940773,13.482801,50.655968,26.045451,466.57582,81.991887,67.09208,76.111381,278.671412,363.153435,15.129032,11.503444,72.41185,137.754903,71.328689,72.927227,127.829887,89.536537,20.777026,398.011883,1667.938274,0.0,1.795723,0.326288,0.348694,0.352911,0.362335,0.356069,0.35474,0.346294,0.35242,0.342308,0.347309,0.349945,0.35643,0.350815,0.349945
