In [14]:
import pandas as pd
import numpy as np
import math


In [15]:
data = pd.read_csv("training_set_features.csv")
new_data = data.drop(['education', 'race', 'income_poverty', 'health_insurance', 'sex', 'age_group'], axis=1) #removing unnecesary features which is not required
ans = pd.read_csv('training_set_labels.csv') 
df = pd.merge(ans, new_data, on='respondent_id')


In [16]:
#filling empty data
df.fillna(method='ffill', inplace=True)


In [17]:
feature_columns = df.loc[:, 'xyz_concern':'opinion_xyz_sick_from_vacc'].columns
target_column = 'xyz_vaccine'
X_train = df[feature_columns].values
y_train = df[target_column].values

In [18]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def predict(X, w, b):
    z = np.dot(X, w) + b
    probabilities = sigmoid(z)
    return probabilities
    
    return (sigmoid(z) >= 0.5).astype(int)  

In [19]:
def compute_cost(X, y, w, b, lambda_=0):
    m = len(y)
    h = sigmoid(np.dot(X, w) + b)
    cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) + (lambda_ / (2 * m)) * np.sum(w**2)
    return cost

def compute_gradient(X, y, w, b, lambda_=0):
    m = len(y)
    h = sigmoid(np.dot(X, w) + b)
    grad_w = (1 / m) * np.dot(X.T, (h - y)) + (lambda_ / m) * w
    grad_b = (1 / m) * np.sum(h - y)
    return grad_w, grad_b


In [20]:
def gradient_descent(X, y, w, b, cost_function, gradient_function, alpha, iterations, lambda_=0):
    m = len(y)
    J_history = []
    for i in range(iterations):
        grad_w, grad_b = gradient_function(X, y, w, b, lambda_)
        w = w - alpha * grad_w
        b = b - alpha * grad_b
        cost = cost_function(X, y, w, b, lambda_)
        J_history.append(cost)
        if i % (iterations // 10) == 0:
            print(f"Iteration {i}: Cost {cost}")
    return w, b, J_history

In [21]:
np.random.seed(1)
initial_w = 0.01 * np.random.randn(X_train.shape[1])
initial_b = 0
iterations = 40000
alpha = 0.02
lambda_ = 0
w, b, J_history = gradient_descent(X_train, y_train, initial_w, initial_b, 
                                       compute_cost, compute_gradient, alpha, iterations, lambda_)
predictions = predict(X_train, w, b)
train_accuracy = np.mean(predictions == y_train) * 100
print(f"Train Accuracy: {train_accuracy}%")


Iteration 0: Cost 0.6558326087474673
Iteration 4000: Cost 0.428063450766539
Iteration 8000: Cost 0.4115136130940572
Iteration 12000: Cost 0.4048523969682825
Iteration 16000: Cost 0.40165198749281733
Iteration 20000: Cost 0.3999771391178044
Iteration 24000: Cost 0.39905580318027256
Iteration 28000: Cost 0.3985322327652762
Iteration 32000: Cost 0.3982279034786218
Iteration 36000: Cost 0.3980480950664137
Train Accuracy: 0.0%


In [22]:
print(w,b)

[-0.07001288  0.18918463  0.06780784 -0.03086725  0.17178852 -0.00858609
 -0.18902521 -0.03278924  0.04119461  1.76750706 -0.3381563   0.19332028
  0.17614056  0.7291429   0.61690728  0.41372415 -0.05613618] -5.393633781860206


In [23]:
w = w.reshape(-1,1)

In [24]:
test_df = pd.read_csv("test_set_features.csv")
new_test_df = data.drop(['education', 'race', 'income_poverty', 'health_insurance', 'sex', 'age_group'], axis=1)

new_test_df.fillna(method='ffill', inplace=True)

In [25]:
feature_values = df.loc[:, 'xyz_concern':'opinion_xyz_sick_from_vacc'].columns
X_test = new_test_df[feature_columns].values

In [28]:
Y_prediction_xyz = predict(X_test, w, b)
print(Y_prediction_xyz)

[[0.0354661 ]
 [0.32268434]
 [0.051971  ]
 ...
 [0.23857057]
 [0.04389389]
 [0.12106571]]


In [29]:
final_xyz = pd.DataFrame(Y_prediction_xyz)
final_xyz.columns = ['xyz_vaccine']

In [31]:
final_xyz
final_xyz.to_excel('output1.xlsx', index=False)  
