In [1]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt

# Linear Regression

In [2]:
maxAcc = 0.0
maxIter = 0
C_Lambda = 0.03
TrainingPercent = 80 #80% of the dataset is the training dataset
ValidationPercent = 10 #10% of the dataset is the validation dataset
TestPercent = 10 #10% of the dataset is the testing dataset
M = 10 # number of basis functions. 
PHI = [] #basis function. Gaussian Radial Basis Function

In [3]:
#To generate training data
def GetTargetVector(filePath): 
    t = []
    with open(filePath, 'rU') as f:
        dataframe = pd.read_csv(f)
    for i,row in dataframe.iterrows():  
            t.append(row[1])
    return t

In [4]:
#To generate data matrix
def GenerateRawData(filePath):    
    dataMatrix = [] 
    with open(filePath, 'rU') as f:
        dataframe = pd.read_csv(f)
    data = dataframe.iloc[:,1:]
    result = np.asarray(data.values.T.tolist())
    return result

In [5]:
#To generate Training target
def GenerateTrainingTarget(rawTraining,TrainingPercent = 80):
    TrainingLen = int(math.ceil(len(rawTraining)*(TrainingPercent*0.01)))
    t           = rawTraining[:TrainingLen]
    return t

In [6]:
#To generate training data matrix which is 80 percent of the entire dataset
def GenerateTrainingDataMatrix(rawData, TrainingPercent = 80):
    T_len = int(math.ceil(len(rawData[0])*0.01*TrainingPercent))
    d2 = rawData[:,0:T_len]
    return d2

In [7]:
def GenerateValData(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData[0])*ValPercent*0.01))
    V_End = TrainingCount + valSize
    dataMatrix = rawData[:,TrainingCount+1:V_End]
    return dataMatrix

In [8]:
def GenerateValTargetVector(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData)*ValPercent*0.01))
    V_End = TrainingCount + valSize
    t =rawData[TrainingCount+1:V_End]
    return t

In [9]:
#To generate covariance matrix and retaining only the diagonal elements which is variance with itself. 
#All other elements other than diagonal elements are made zero.
def GenerateBigSigma(Data, MuMatrix,TrainingPercent):
    BigSigma    = np.zeros((len(Data),len(Data)))
    DataT       = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))        
    varVect     = []
    for i in range(0,len(DataT[0])):
        vct = []
        for j in range(0,int(TrainingLen)):
            vct.append(Data[i][j])    
        varVect.append(np.var(vct))
    
    for j in range(len(Data)):
        BigSigma[j][j] = varVect[j]
    
    BigSigma = np.dot(3,BigSigma)
    return BigSigma

In [10]:
def GetScalar(DataRow,MuRow, BigSigInv):  
    R = np.subtract(DataRow,MuRow)
    T = np.dot(BigSigInv,np.transpose(R))  
    L = np.dot(R,T)
    return L

In [11]:
#to obtain Basis function value for a given data point
def GetRadialBasisOut(DataRow,MuRow, BigSigInv):    
    phi_x = math.exp(-0.5*GetScalar(DataRow,MuRow,BigSigInv))
    return phi_x

In [12]:
#To construct the design matrix.
#the elements of a design matrix are the values of basis functions obtained on each data point
def GetPhiMatrix(Data, MuMatrix, BigSigma, TrainingPercent = 80):
    DataT = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))         
    PHI = np.zeros((int(TrainingLen),len(MuMatrix))) 
    BigSigInv = np.linalg.pinv(BigSigma)
    for  C in range(0,len(MuMatrix)):
        for R in range(0,int(TrainingLen)):
            PHI[R][C] = GetRadialBasisOut(DataT[R], MuMatrix[C], BigSigInv)
    #print ("PHI Generated..")
    return PHI

In [13]:
#To obtain the weights (which is the unknown in closed form solution)
def GetWeightsClosedForm(PHI, T, Lambda):
    Lambda_I = np.identity(len(PHI[0]))
    for i in range(0,len(PHI[0])):
        Lambda_I[i][i] = Lambda
    PHI_T       = np.transpose(PHI)
    PHI_SQR     = np.dot(PHI_T,PHI)
    PHI_SQR_LI  = np.add(Lambda_I,PHI_SQR)
    PHI_SQR_INV = np.linalg.pinv(PHI_SQR_LI)
    INTER       = np.dot(PHI_SQR_INV, PHI_T)
    W           = np.dot(INTER, T)
    ##print ("Training Weights Generated..")
    return W


In [14]:
def GetValTest(VAL_PHI,W):
    Y = np.dot(W,np.transpose(VAL_PHI))
    ##print ("Test Out Generated..")
    return Y

In [15]:
#To obtain the Evaluation metric, E_rms 
def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

# Generating raw data and raw target for all the datasets

In [16]:
#RawData   = GenerateRawData('human_dataset_features.csv')
#RawTarget = GetTargetVector('human_target_features.csv')

#RawData   = GenerateRawData('concatHuman.csv')
#RawTarget = GetTargetVector('target_Human.csv')


#RawData   = GenerateRawData('concatGSC.csv')
#RawTarget = GetTargetVector('target_GSC.csv')


#RawData   = GenerateRawData('subtractHuman.csv')
#RawTarget = GetTargetVector('subtract_target_Human.csv')

RawData   = GenerateRawData('subtractGSC.csv')
RawTarget = GetTargetVector('subtract_target_GSC.csv')

  after removing the cwd from sys.path.


In [17]:
TrainingTarget = np.array(GenerateTrainingTarget(RawTarget,TrainingPercent))
TrainingData   = GenerateTrainingDataMatrix(RawData,TrainingPercent)
print(TrainingTarget.shape)
print(TrainingData.shape)

(14400,)
(512, 14400)


In [18]:
ValDataAct = np.array(GenerateValTargetVector(RawTarget,ValidationPercent, (len(TrainingTarget))))
ValData    = GenerateValData(RawData,ValidationPercent, (len(TrainingTarget)))
print(ValDataAct.shape)
print(ValData.shape)

(1799,)
(512, 1799)


In [19]:
TestDataAct = np.array(GenerateValTargetVector(RawTarget,TestPercent, (len(TrainingTarget)+len(ValDataAct))))
TestData = GenerateValData(RawData,TestPercent, (len(TrainingTarget)+len(ValDataAct)))
print(TestDataAct.shape)
print(TestData.shape)

(1799,)
(512, 1799)


# Closed Form solution

In [20]:
ErmsArr = []
AccuracyArr = []

#k means clustering algorithm
kmeans = KMeans(n_clusters=M, random_state=0).fit(np.transpose(TrainingData))
Mu = kmeans.cluster_centers_

BigSigma     = GenerateBigSigma(RawData, Mu, TrainingPercent)
TRAINING_PHI = GetPhiMatrix(RawData, Mu, BigSigma, TrainingPercent)
W            = GetWeightsClosedForm(TRAINING_PHI,TrainingTarget,C_Lambda) 
TEST_PHI     = GetPhiMatrix(TestData, Mu, BigSigma, 100) 
VAL_PHI      = GetPhiMatrix(ValData, Mu, BigSigma, 100)

In [None]:
TR_TEST_OUT  = GetValTest(TRAINING_PHI,W)
VAL_TEST_OUT = GetValTest(VAL_PHI,W)
TEST_OUT     = GetValTest(TEST_PHI,W)

TrainingAccuracy   = str(GetErms(TR_TEST_OUT,TrainingTarget))
ValidationAccuracy = str(GetErms(VAL_TEST_OUT,ValDataAct))
TestAccuracy       = str(GetErms(TEST_OUT,TestDataAct))

In [None]:
print ('UBIT Name      = PRAVI')
print ('Person Number = 50291368')
print ("-------Closed Form with Radial Basis Function-------")
print ('----------------------------------------------------')
print ("M = ",M)
print ("Lambda = ",C_Lambda)
print ("E_rms Training   = " + str(float(TrainingAccuracy.split(',')[1])))
print ("E_rms Validation = " + str(float(ValidationAccuracy.split(',')[1])))
print ("E_rms Testing    = " + str(float(TestAccuracy.split(',')[1])))

# Stochastic Gradient Descent Solution

In [None]:
W_Now        = np.dot(220, W)
La           = 1
learningRate = 0.03
L_Erms_Val   = []
L_Erms_TR    = []
L_Erms_Test  = []
W_Mat        = []

#range values was changed to 400 and 1000. 
for i in range(0,400):
    
    #print ('---------Iteration: ' + str(i) + '--------------')
    Delta_E_D     = -np.dot((TrainingTarget[i] - np.dot(np.transpose(W_Now),TRAINING_PHI[i])),TRAINING_PHI[i])
    La_Delta_E_W  = np.dot(La,W_Now)
    Delta_E       = np.add(Delta_E_D,La_Delta_E_W)    
    Delta_W       = -np.dot(learningRate,Delta_E)
    W_T_Next      = W_Now + Delta_W
    W_Now         = W_T_Next
    
    #-----------------TrainingData Accuracy---------------------#
    TR_TEST_OUT   = GetValTest(TRAINING_PHI,W_T_Next) 
    Erms_TR       = GetErms(TR_TEST_OUT,TrainingTarget)
    L_Erms_TR.append(float(Erms_TR.split(',')[1]))
    
    #-----------------ValidationData Accuracy---------------------#
    VAL_TEST_OUT  = GetValTest(VAL_PHI,W_T_Next) 
    Erms_Val      = GetErms(VAL_TEST_OUT,ValDataAct)
    L_Erms_Val.append(float(Erms_Val.split(',')[1]))
    
    #-----------------TestingData Accuracy---------------------#
    TEST_OUT      = GetValTest(TEST_PHI,W_T_Next) 
    Erms_Test = GetErms(TEST_OUT,TestDataAct)
    L_Erms_Test.append(float(Erms_Test.split(',')[1]))

In [None]:
print ('----------Gradient Descent Solution--------------------')
print ("M = ",M)
print ("Lambda  = ",La)
print ("eta=",learningRate)
print ("E_rms Training   = " + str(np.around(min(L_Erms_TR),5)))
print ("E_rms Validation = " + str(np.around(min(L_Erms_Val),5)))
print ("E_rms Testing    = " + str(np.around(min(L_Erms_Test),5)))

In [None]:
plt.plot(range(400),L_Erms_TR,label="Training")
plt.plot(range(400),L_Erms_Val,label="Validation")
plt.plot(range(400),L_Erms_Test,label="Testing")
plt.xlabel("iterations")
plt.ylabel("E Rms")
plt.show()

# Logistic Regression

In [None]:
def sigmoid(weight,x):
    return (1 / (1 + np.exp(-np.dot(weight.T,x))))

In [None]:
def logistic_regression(x,y,eta):
    weight = np.ones(x.shape[0])
    L = []
    for i in range(5000):
        pred_target = sigmoid(weight,x)
        #print(pred_target.shape, y.shape )
        L_val = loss_function(pred_target,y)
        L.append(L_val)
        pred_error = pred_target - y
        delta_weight = np.dot(x,(pred_error))
        weight = weight - eta*delta_weight
    return weight,L

In [None]:
def accuracy(Data, TargetData,weight):
    pred_y = np.round(sigmoid(weight,Data))
    accuracy = (pred_y==TargetData).sum()/len(pred_y)
    return accuracy

In [None]:
def loss_function(h,y):
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

In [None]:
weight_training,L_Training = logistic_regression(TrainingData,TrainingTarget,0.00001)
#weight_training,L_Training = logistic_regression(TrainingData,TrainingTarget,0.0001)
#weight_training,L_Training = logistic_regression(TrainingData,TrainingTarget,0.001)
#weight_training,L_Training = logistic_regression(TrainingData,TrainingTarget,0.02)

accuracy(TrainingData,TrainingTarget,weight_training)

In [None]:
weight_validation,L_Validation = logistic_regression(ValData,ValDataAct,0.00001)
#weight_validation,L_Validation = logistic_regression(ValData,ValDataAct,0.0001)
#weight_validation,L_Validation = logistic_regression(ValData,ValDataAct,0.002)
#weight_validation,L_Validation = logistic_regression(ValData,ValDataAct,0.02)

accuracy(ValData,ValDataAct,weight_training)

In [None]:
plt.plot(range(5000),L_Training,label="Training")
plt.plot(range(5000),L_Validation,label="Validation")
plt.xlabel("iterations")
plt.ylabel("Loss function")
plt.show()