In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import copy, math
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler


In [58]:
# Load data from .data file 
data = pd.read_csv('./cervical+cancer+behavior+risk/sobar-72.csv')

In [59]:
#  Calculating the percentage of correctly classified examples
# 
def PercentCorrect(Inputs, targets, weights):
    N = len(targets)
    nCorrect = 0
    for n in range(N):
        OneInput = Inputs[n,:]
        if (targets[n]*np.dot(OneInput, weights) > 0):
            nCorrect += 1 
    return 100*nCorrect/N 

In [60]:
# Prepare data set as we want and devide to train and test sets

targets = []
targets = data.iloc[:, 19].values
print(targets.shape)

for i in range(len(targets)):
    if(targets[i] == 0):
        targets[i] = -1

X = data.iloc[:, 0:18+1]

NumDataPerClass = 36

rIndex = np.random.permutation(X.shape[0])  # Shuffle row indices
Xr = X.iloc[rIndex]  # Shuffle rows of X
yr = targets[rIndex]  # Corresponding labels after shuffling

X_train = Xr[0:NumDataPerClass]
y_train = yr[0:NumDataPerClass]
X_test = Xr[NumDataPerClass:2*NumDataPerClass]
y_test = yr[NumDataPerClass:2*NumDataPerClass]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Ntrain = NumDataPerClass
Ntest = NumDataPerClass

X_train = X_train.values
X_test = X_test.values


# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler to your training data (learn mean and standard deviation)
X_train_scaled = scaler.fit_transform(X_train)
# print(X_train_scaled)

# Apply the scaling to your test data (using the parameters learned from training data)
X_test_scaled = scaler.transform(X_test)
# print(X_test_scaled)


(72,)
(36, 19) (36,) (36, 19) (36,)


In [61]:
# Random initialization of weights
w = np.random.randn(19)

# What is the performance with the initial random weights?
print('Initial Percentage Correct: %6.2f' %(PercentCorrect(X_train_scaled, y_train, w)))


Initial Percentage Correct:  58.33


In [62]:
def sigmoid(z):

    g = 1/(1+np.exp(-z))
   
    return g

In [63]:
def compute_gradient_logistic(X, y, w, b, lambda_): 
    
    
    m,n = X.shape
    dj_dw = np.zeros((n,))                           #(n,)
    dj_db = 0.
    
    for i in range(m):
        z = np.dot(X[i],w) + b
        f_wb_i = sigmoid(z)
        err_i = f_wb_i - y[i]
        for j in range(n):
            dj_dw[j] = dj_dw[j] + err_i * X[i,j] 
        dj_db = dj_db + err_i
    dj_dw = dj_dw/m
    dj_db = dj_db/m
    
    for j in range(n):
        dj_dw[j] = dj_dw[j] + (lambda_/m)*w[j]
    
    return dj_db, dj_dw

In [64]:
def compute_cost_logistic(X, y, w, b, lambda_ = 1):
    m,n  = X.shape
    cost = 0.0
    for i in range(m):
        z_i = np.dot(X[i],w) + b
        f_wb_i = sigmoid(z_i)
        cost +=  -y[i]*np.log(f_wb_i) - (1-y[i])*np.log(1-f_wb_i)         
    cost = cost / m
    
    reg_cost = 0
    for j in range(n):
        reg_cost += (w[j]**2)                                          #scalar
    reg_cost = (lambda_/(2*m)) * reg_cost                              #scalar
    
    total_cost = cost + reg_cost                                       #scalar
    return total_cost

In [65]:
def gradient_descent(X, y, w_in, b_in, alpha, num_iters, lambda_):
    # An array to store cost J and w's at each iteration primarily for graphing later
    J_history = []
    w = copy.deepcopy(w_in)  #avoid modifying global w within function
    b = b_in 
    
    for i in range(num_iters):
        
        dj_db, dj_dw = compute_gradient_logistic(X, y, w, b, lambda_)
        
        w = w - alpha * dj_dw
        b = b - alpha * dj_db 
        
        # Save cost J at each iteration
        J_history.append( compute_cost_logistic(X, y, w, b) )
        
        if i% math.ceil(num_iters / 10) == 0:
            print(f"Iteration {i:4d}: Cost {J_history[-1]}   ")
    
    return w, b, J_history  


In [66]:
w_tmp  = np.zeros_like(X_train[0])
b_tmp  = 0.
alph = 0.0001
iters = 100000
lambda_tmp = 2 

w_out, b_out, _ = gradient_descent(X_train_scaled, y_train, w_tmp, b_tmp, alph, iters, lambda_tmp) 
print(f"\nupdated parameters: w:{w_out}, b:{b_out}")

print('Percentage Correct After Training: %6.2f %6.2f'%(PercentCorrect(X_train_scaled, y_train, w_out), PercentCorrect(X_test_scaled, y_test, w_out)))

Iteration    0: Cost 0.6927632027527962   


Iteration 10000: Cost -1.266635566065823   
Iteration 20000: Cost -2.4387382442539365   
Iteration 30000: Cost -3.465889735561127   
Iteration 40000: Cost -4.414938552748203   
Iteration 50000: Cost -5.306953637234787   
Iteration 60000: Cost -6.1517417084935815   
Iteration 70000: Cost -6.955162747215859   
Iteration 80000: Cost -7.7214059864698585   
Iteration 90000: Cost -8.453822838821582   

updated parameters: w:[-1.16058656  1.01613169 -1.86779869 -1.59904868 -0.19782673  0.76216778
 -0.2392961  -1.73302794 -1.86510161 -2.00003781 -2.41594646 -1.79996446
 -1.49335052 -2.02514755 -0.97216479 -0.55983824 -2.10244003 -2.71946909
 -1.91638397], b:-5.93299213143852
Percentage Correct After Training:  94.44  80.56


In [67]:
desc = pd.DataFrame()
desc['sample'] = data.count()
desc['nunique'] = data.nunique()
desc['unique'] = desc['nunique'] / data.shape[0] * 100
desc['null'] = data.isnull().sum()
desc = desc.join(data.describe().T.drop(columns='count'))
desc

Unnamed: 0,sample,nunique,unique,null,mean,std,min,25%,50%,75%,max
behavior_sexualRisk,72,6,8.333333,0,9.666667,1.186782,2.0,10.0,10.0,10.0,10.0
behavior_eating,72,9,12.5,0,12.791667,2.361293,3.0,11.0,13.0,15.0,15.0
behavior_personalHygine,72,12,16.666667,0,11.083333,3.033847,3.0,9.0,11.0,14.0,15.0
intention_aggregation,72,9,12.5,0,7.902778,2.738148,2.0,6.0,10.0,10.0,10.0
intention_commitment,72,9,12.5,0,13.347222,2.374511,6.0,11.0,15.0,15.0,15.0
attitude_consistency,72,8,11.111111,0,7.180556,1.522844,2.0,6.0,7.0,8.0,10.0
attitude_spontaneity,72,7,9.722222,0,8.611111,1.515698,4.0,8.0,9.0,10.0,10.0
norm_significantPerson,72,5,6.944444,0,3.125,1.845722,1.0,1.0,3.0,5.0,5.0
norm_fulfillment,72,12,16.666667,0,8.486111,4.907577,3.0,3.0,7.0,14.0,15.0
perception_vulnerability,72,13,18.055556,0,8.513889,4.275686,3.0,5.0,8.0,13.0,15.0
