In [91]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [473]:
df = pd.read_csv('Dataset.csv', sep=',')
df = df.drop(df.columns[0],axis=1)
X = df[df.columns[:53]]
Y = df[['Target']]
X = normalize(X,Y,X.shape[0])


In [485]:
def normalize(X,Y,nrows):
    BinaryFeatures = ['Post Promotion Status','published_weekday_0','published_weekday_1','published_weekday_2',
                      'published_weekday_3', 'published_weekday_4','published_weekday_5','published_weekday_6',
                     'base_weekday_0','base_weekday_1','base_weekday_2','base_weekday_3','base_weekday_4',
                      'base_weekday_5','base_weekday_6']
    BF = X[BinaryFeatures] 
    
    
    DecimalFeatures = ['Page Popularity','Page Checkins','Page talking about',
                       'extra_0','extra_1','extra_2','extra_3','extra_4','extra_5',
                       'extra_6','extra_7','extra_8','extra_9','extra_10',
                       'extra_11','extra_12','extra_13','extra_14','extra_15','extra_16',
                       'extra_17','extra_18','extra_19','extra_20','extra_21','extra_22','extra_23','extra_24',
                       'CC1','CC2','CC3','CC4','CC5','Base Time','Post Length','Post Share Count']
    DF = X[DecimalFeatures]
    E = DF.mean()
    D = DF.std()
    DF = (DF-E)/D
    
    # 'H Local' feature's modification
    HLocal = Y['Target']/X['H Local']#comments/an hour
      
    #CategoryFeature =['Page Category']
    #CF = X[CategoryFeature]
    frequency = X['Page Category'].value_counts(sort=False) #frequency of categories
    CF = 1/frequency[X['Page Category']]
    CF = CF.reset_index(drop=True)
    
    # x0 = [1,1,...1] fpr b koeficient
    x1 = pd.DataFrame({'feature 1': np.ones(nrows)})

    result = pd.concat([x1,DF,CF,BF,HLocal], axis=1) #,X['H Local']
    return result


In [179]:
def gradient(X,Y,W,lyambda, n, nfeat):
    Ypred = np.dot(X,W)
    dy = Y - Ypred
    cost = np.sum(dy**2)/n
    #print(dy[:10], "cost", cost)
    gr = np.dot(X.T,dy)/n
    Wnew = W + lyambda * gr.reshape(nfeat,1)
    return Wnew

def foo_RMSE(X, Y, W):
    rmse = math.sqrt((np.sum(((Y-np.dot(X,W))**2))))/len(Y)
    return rmse

def foo_R2(X,Y,W):
    Ey = Y.mean()
    r2 = 1 - np.sum((np.dot(X,W)-Y)**2)/np.sum((Y - Ey)**2)
    return r2

In [486]:
nsteps = 400
e = 10e-3

lyambda = 0.02
table = pd.DataFrame({})
nfeatures = X.shape[1]
kf = KFold(n_splits=5, shuffle = True)
chunk = 0
for train_index, test_index in kf.split(X):
    chunk+=1
    Xtrain = X.loc[train_index.tolist()]
    Ytrain = Y.loc[train_index.tolist()]
    Xtest = X.loc[test_index.tolist()]
    Ytest = Y.loc[test_index.tolist()]
    nrows = Xtrain.shape[0]
    k = 0
    W = np.ones(nfeatures).reshape(nfeatures,1)
    while True:
        k+=1
        Wnew = gradient(X,Y,W,lyambda/math.sqrt(k), nrows, nfeatures)
        estimation = np.max(abs(W - Wnew))
    
        if (estimation < e)or(k>nsteps):
            break
        W = Wnew
        
    print("CHUNK # ", chunk)    
    R2train = foo_R2(X,Y,W)
    print ("R2 for train: ",R2train[0] , "\n")

    RMSEtrain = foo_RMSE(X,Y,W)
    print ("RMSE for train: ",RMSEtrain , "\n") 

    
    #test
    R2test = foo_R2(Xtest,Ytest,W)
    print ("R2 for test: ",R2test[0] , "\n")

    RMSEtest = foo_RMSE(Xtest,Ytest,W)
    print ("RMSE for test: ",RMSEtest , "\n")
    
    a = [R2train[0],RMSEtrain,R2test[0],RMSEtest]
    for feature in W.tolist():
        a.append(feature[0])
    col = pd.DataFrame({'chunk %d'%chunk: a })
    table =  pd.concat([table, col],axis = 1)


table.index=['R2_train', 'RMSE_train', 'R2_test', 'RMSE_test','b','Page Popularity','Page Checkins',
             'Page talking about','extra_0','extra_1','extra_2','extra_3','extra_4','extra_5','extra_6',
             'extra_7','extra_8','extra_9','extra_10','extra_11','extra_12','extra_13','extra_14','extra_15',
             'extra_16','extra_17','extra_18','extra_19','extra_20','extra_21','extra_22','extra_23','extra_24',
             'CC1','CC2','CC3','CC4','CC5','Base Time','Post Length','Post Share Count','Page Category', 
             'Post Promotion Status','published_weekday_0','published_weekday_1','published_weekday_2',
             'published_weekday_3', 'published_weekday_4','published_weekday_5','published_weekday_6',
             'base_weekday_0','base_weekday_1','base_weekday_2','base_weekday_3','base_weekday_4',
             'base_weekday_5','base_weekday_6','Comments / H Local']
table.index.name = 'Features'
table =  pd.concat([table, pd.DataFrame({'E': table.mean(axis = 1) })],axis = 1)
table =  pd.concat([table, pd.DataFrame({'STD': table.std(axis = 1) })],axis = 1)
 
    

CHUNK #  1
R2 for train:  0.951159336731 

RMSE for train:  0.038763724230930635 

R2 for test:  0.953565730996 

RMSE for test:  0.09762083255330802 

CHUNK #  2
R2 for train:  0.951159336731 

RMSE for train:  0.038763724230930635 

R2 for test:  0.953299489461 

RMSE for test:  0.08076213142068418 

CHUNK #  3
R2 for train:  0.951159336731 

RMSE for train:  0.038763724230930635 

R2 for test:  0.941663503776 

RMSE for test:  0.0730633284230041 

CHUNK #  4
R2 for train:  0.951159336731 

RMSE for train:  0.038763724230930635 

R2 for test:  0.951390329967 

RMSE for test:  0.09856709544894535 

CHUNK #  5
R2 for train:  0.951156646047 

RMSE for train:  0.038764791983366506 

R2 for test:  0.951321637983 

RMSE for test:  0.08037209249064496 



In [484]:
table

Unnamed: 0_level_0,chunk 1,chunk 2,chunk 3,chunk 4,chunk 5,E,STD
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
R2_train,0.951159,0.951159,0.951159,0.951159,0.951157,0.951159,1.076273e-06
RMSE_train,0.038764,0.038764,0.038764,0.038764,0.038765,0.038764,4.27101e-07
R2_test,0.948199,0.948053,0.94943,0.95721,0.950257,0.95063,0.003388788
RMSE_test,0.084764,0.083158,0.081445,0.091942,0.09154,0.08657,0.004352867
b,1.10311,1.10311,1.10311,1.10311,1.103147,1.103117,1.489703e-05
Page Popularity,0.159358,0.159358,0.159358,0.159358,0.159366,0.15936,3.04735e-06
Page Checkins,0.16356,0.16356,0.16356,0.16356,0.163567,0.163561,2.867392e-06
Page talking about,-0.263646,-0.263646,-0.263646,-0.263646,-0.263635,-0.263644,4.498679e-06
extra_0,-0.190452,-0.190452,-0.190452,-0.190452,-0.190445,-0.190451,3.02461e-06
extra_1,0.152887,0.152887,0.152887,0.152887,0.152887,0.152887,2.171069e-08


In [None]:
table.to_csv('table.csv')

In [487]:
#checking Test_Case
dft = pd.read_csv('Test_Case_1.csv', sep=',')
Xt = df[df.columns[:53]]
Yt = df[['Target']]
Xt = normalize(Xt,Yt,Xt.shape[0])

In [488]:
R2t = foo_R2(Xt,Yt,W)
R2t

Target    0.951157
dtype: float64