In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [78]:
df = pd.read_csv('Dataset.csv', sep=',')
df = df.drop(df.columns[0],axis=1)
X = df[df.columns[:53]]
Y = df[['Target']]
X = normalize(X,Y,X.shape[0])


In [77]:
def normalize(X,Y,nrows):
    BinaryFeatures = ['Post Promotion Status','published_weekday_0','published_weekday_1','published_weekday_2',
                      'published_weekday_3', 'published_weekday_4','published_weekday_5','published_weekday_6',
                     'base_weekday_0','base_weekday_1','base_weekday_2','base_weekday_3','base_weekday_4',
                      'base_weekday_5','base_weekday_6']
    BF = X[BinaryFeatures] 
    
    DecimalFeatures = ['Page Popularity','Page Checkins','Page talking about',
                       'extra_0','extra_1','extra_2','extra_3','extra_4','extra_5',
                       'extra_6','extra_7','extra_8','extra_9','extra_10',
                       'extra_11','extra_12','extra_13','extra_14','extra_15','extra_16',
                       'extra_17','extra_18','extra_19','extra_20','extra_21','extra_22','extra_23','extra_24',
                       'CC1','CC2','CC3','CC4','CC5','Base Time','Post Length','Post Share Count','H Local']
    DF = X[DecimalFeatures]
    E = DF.mean()
    D = DF.std()
    DF = (DF-E)/D
    
    # 'H Local' feature's modification
    #HLocal = Y['Target']/X['H Local']#comments/an hour 
    #CategoryFeature =['Page Category']
    #CF = X[CategoryFeature]
    
    frequency = X['Page Category'].value_counts(sort=False) #frequency of categories
    CF = 1/frequency[X['Page Category']]
    CF = CF.reset_index(drop=True)
    
    # x0 = [1,1,...1] fpr b koeficient
    x1 = pd.DataFrame({'feature 1': np.ones(nrows)})

    result = pd.concat([x1,DF,CF, BF], axis=1) #,X['H Local']
    return result


In [90]:
def gradient(X,Y,W,lyambda, n, nfeat):
    Ypred = np.dot(X,W)
    dy = Y - Ypred
    cost = np.sum(dy**2)/n
    #print(dy[:10], "cost", cost)
    gr = np.dot(X.T,dy)/n
    Wnew = W + 2* lyambda * gr.reshape(nfeat,1)
    return Wnew

def foo_RMSE(X, Y, W):
    rmse = math.sqrt((np.sum(((Y-np.dot(X,W))**2))))/len(Y)
    return rmse

def foo_R2(X,Y,W):
    Ey = Y.mean()
    r2 = 1 - np.sum((np.dot(X,W)-Y)**2)/np.sum((Y - Ey)**2)
    return r2

In [None]:
nsteps = 300
e = 10e-3

lyambda = 0.05
table = pd.DataFrame({})
nfeatures = X.shape[1]
kf = KFold(n_splits=5, shuffle = True)
chunk = 0
for train_index, test_index in kf.split(X):
    chunk+=1
    Xtrain = X.loc[train_index.tolist()]
    Ytrain = Y.loc[train_index.tolist()]
    Xtest = X.loc[test_index.tolist()]
    Ytest = Y.loc[test_index.tolist()]
    nrows = Xtrain.shape[0]
    k = 0
    W = np.ones(nfeatures).reshape(nfeatures,1)
    while True:
        k+=1
        Wnew = gradient(X,Y,W,lyambda/math.sqrt(k), nrows, nfeatures)
        estimation = np.max(abs(W - Wnew))
    
        if (estimation < e)or(k>nsteps):
            break
        W = Wnew
        
    print("CHUNK # ", chunk)    
    R2train = foo_R2(X,Y,W)
    print ("R2 for train: ",R2train[0] , "\n")

    RMSEtrain = foo_RMSE(X,Y,W)
    print ("RMSE for train: ",RMSEtrain , "\n") 

    
    
    #test
    R2test = foo_R2(Xtest,Ytest,W)
    print ("R2 for test: ",R2test[0] , "\n")

    RMSEtest = foo_RMSE(Xtest,Ytest,W)
    print ("RMSE for test: ",RMSEtest , "\n")
    
    a = [R2train[0],RMSEtrain,R2test[0],RMSEtest]
    for feature in W.tolist():
        a.append(feature[0])
    col = pd.DataFrame({'chunk %d'%chunk: a })
    table =  pd.concat([table, col],axis = 1)


table.index=['R2_train', 'RMSE_train', 'R2_test', 'RMSE_test','b','Page Popularity','Page Checkins',
             'Page talking about','extra_0','extra_1','extra_2','extra_3','extra_4','extra_5','extra_6',
             'extra_7','extra_8','extra_9','extra_10','extra_11','extra_12','extra_13','extra_14','extra_15',
             'extra_16','extra_17','extra_18','extra_19','extra_20','extra_21','extra_22','extra_23','extra_24',
             'CC1','CC2','CC3','CC4','CC5','Base Time','Post Length','Post Share Count','H Local','Page Category', 
             'Post Promotion Status','published_weekday_0','published_weekday_1','published_weekday_2',
             'published_weekday_3', 'published_weekday_4','published_weekday_5','published_weekday_6',
             'base_weekday_0','base_weekday_1','base_weekday_2','base_weekday_3','base_weekday_4',
             'base_weekday_5','base_weekday_6']
table.index.name = 'Features'
table =  pd.concat([table, pd.DataFrame({'E': table.mean(axis = 1) })],axis = 1)
table =  pd.concat([table, pd.DataFrame({'STD': table.std(axis = 1) })],axis = 1)
 
    

In [85]:
table

Unnamed: 0_level_0,chunk 1,chunk 2,chunk 3,chunk 4,chunk 5,E,STD
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
R2_train,0.308952,0.308952,0.308952,0.308952,0.308951,0.308952,2.051461e-07
RMSE_train,0.145811,0.145811,0.145811,0.145811,0.145811,0.145811,2.164281e-08
R2_test,0.358777,0.295146,0.327305,0.273678,0.300886,0.311158,0.02930842
RMSE_test,0.306004,0.325983,0.302809,0.365699,0.325861,0.325271,0.02240999
b,3.644779,3.644779,3.644779,3.644779,3.644745,3.644772,1.36037e-05
Page Popularity,-0.004374,-0.004374,-0.004374,-0.004374,-0.004364,-0.004372,4.09104e-06
Page Checkins,-0.283603,-0.283603,-0.283603,-0.283603,-0.283587,-0.2836,6.634562e-06
Page talking about,-0.344754,-0.344754,-0.344754,-0.344754,-0.344737,-0.344751,6.860081e-06
extra_0,-0.121221,-0.121221,-0.121221,-0.121221,-0.121213,-0.121219,3.203703e-06
extra_1,0.193237,0.193237,0.193237,0.193237,0.193238,0.193237,3.228049e-07


In [86]:
table.to_csv('table.csv')

In [87]:
#checking Test_Case
dft = pd.read_csv('Test_Case_1.csv', sep=',')
Xt = df[df.columns[:53]]
Yt = df[['Target']]
Xt = normalize(Xt,Yt,Xt.shape[0])

In [88]:
R2t = foo_R2(Xt,Yt,W)
R2t

Target    0.308951
dtype: float64