In [51]:
import pandas as pd
import numpy as np
import math 

In [52]:
data = pd.read_csv("training_set_features.csv")
new_data = data.drop(['education', 'race', 'income_poverty', 'health_insurance', 'sex', 'age_group'], axis=1) #removing unnecesary features which is not required
ans = pd.read_csv('training_set_labels.csv') 
df = pd.merge(ans, new_data, on='respondent_id')

In [53]:
df.fillna(method='ffill', inplace=True)

In [54]:
feature_columns = df.loc[:, 'behavioral_antiviral_meds':'health_worker'].columns.union(
                   df.loc[:, 'opinion_seas_vacc_effective':'opinion_seas_sick_from_vacc'].columns).difference(['doctor_recc_xyz'])


target_column = 'seasonal_vaccine'
X_train = df[feature_columns].values
y_train = df[target_column].values
    

In [55]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def predict(X, w, b):
    z = np.dot(X, w) + b
    probabilities = sigmoid(z)
    return probabilities
    #return (sigmoid(z) >= 0.5).astype(int) comment out this to get training accuracy


In [56]:
def compute_cost(X, y, w, b, lambda_=0):
    m = len(y)
    h = sigmoid(np.dot(X, w) + b)
    cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) + (lambda_ / (2 * m)) * np.sum(w**2)
    return cost

In [57]:
def compute_gradient(X, y, w, b, lambda_=0):
    m = len(y)
    h = sigmoid(np.dot(X, w) + b)
    grad_w = (1 / m) * np.dot(X.T, (h - y)) + (lambda_ / m) * w
    grad_b = (1 / m) * np.sum(h - y)
    return grad_w, grad_b

In [58]:
def gradient_descent(X, y, w, b, cost_function, gradient_function, alpha, iterations, lambda_=0):
    m = len(y)
    J_history = []
    for i in range(iterations):
        grad_w, grad_b = gradient_function(X, y, w, b, lambda_)
        w = w - alpha * grad_w
        b = b - alpha * grad_b
        cost = cost_function(X, y, w, b, lambda_)
        J_history.append(cost)
        if i % (iterations // 10) == 0:
            print(f"Iteration {i}: Cost {cost}")
    return w, b, J_history

In [59]:
np.random.seed(1)
initial_w = 0.001 * np.random.randn(X_train.shape[1])
initial_b = 0
iterations = 30000
alpha = 0.1
lambda_ = 0
w, b, J_history = gradient_descent(X_train, y_train, initial_w, initial_b, 
                                       compute_cost, compute_gradient, alpha, iterations, lambda_)
predictions = predict(X_train, w, b)
train_accuracy = np.mean(predictions == y_train) * 100
print(f"Train Accuracy: {train_accuracy}%")


Iteration 0: Cost 0.6884963022816077
Iteration 3000: Cost 0.5252873854916779
Iteration 6000: Cost 0.5243029345813581
Iteration 9000: Cost 0.5242690286902633
Iteration 12000: Cost 0.524267719823063
Iteration 15000: Cost 0.5242676682024887
Iteration 18000: Cost 0.5242676661580054
Iteration 21000: Cost 0.5242676660769636
Iteration 24000: Cost 0.5242676660737507
Iteration 27000: Cost 0.5242676660736233
Train Accuracy: 0.0%


In [60]:
print(w,b)

[-0.22764277 -0.0361184  -0.00870388 -0.01020322 -0.03750122  0.27437577
  0.08753046 -0.11060446  0.38444611  1.26431237  0.68531144  0.48402677
 -0.25942025  0.60977821] -4.212337043823675


In [61]:
test_df = pd.read_csv("test_set_features.csv")
new_test_df = data.drop(['education', 'race', 'income_poverty', 'health_insurance', 'sex', 'age_group'], axis=1)

In [62]:
test_df.describe()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,health_worker,health_insurance,opinion_xyz_vacc_effective,opinion_xyz_risk,opinion_xyz_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
count,26708.0,26623.0,26586.0,26629.0,26495.0,26689.0,26668.0,26636.0,26626.0,26580.0,...,25919.0,14480.0,26310.0,26328.0,26333.0,26256.0,26209.0,26187.0,26483.0,26483.0
mean,40060.5,1.623145,1.266042,0.049645,0.729798,0.069279,0.826084,0.351517,0.337227,0.683747,...,0.111501,0.887914,3.844622,2.326838,2.360612,4.024832,2.708688,2.143392,0.89431,0.543745
std,7710.079831,0.902755,0.615617,0.217215,0.444072,0.253934,0.379045,0.477453,0.472772,0.465022,...,0.314758,0.315483,1.00757,1.275636,1.359413,1.083204,1.376045,1.339102,0.754244,0.935057
min,26707.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,33383.75,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0
50%,40060.5,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0
75%,46737.25,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0
max,53414.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0


In [63]:
new_test_df.fillna(method='ffill', inplace=True)

feature_values = df.loc[:, 'behavioral_antiviral_meds':'health_worker'].columns.union(
                   df.loc[:, 'opinion_seas_vacc_effective':'opinion_seas_sick_from_vacc'].columns).difference(['doctor_recc_xyz'])


In [64]:
X_test = new_test_df[feature_values].values
w = w.reshape(-1, 1)

In [65]:
Y_prediction_seasonal = predict(X_test, w, b)
print(Y_prediction_seasonal)

[[0.05782773]
 [0.17442791]
 [0.18851165]
 ...
 [0.6366148 ]
 [0.05990513]
 [0.2738934 ]]


In [66]:
final_seasonal = pd.DataFrame(Y_prediction_seasonal)
final_seasonal.columns = ['seasonal_vaccine']

In [67]:
final_seasonal
final_seasonal.to_excel('output2.xlsx', index = True)  
