In [16]:
import pandas as pd
import numpy as np
import math


In [17]:
data = pd.read_csv("training_set_features.csv")
new_data = data.drop(['education', 'race', 'income_poverty', 'health_insurance', 'sex', 'age_group'], axis=1) #removing unnecesary features which is not required
ans = pd.read_csv('training_set_labels.csv') 
df = pd.merge(ans, new_data, on='respondent_id')


In [18]:
#filling empty data
df.fillna(method='ffill', inplace=True)


In [19]:
feature_columns = df.loc[:, 'xyz_concern':'opinion_xyz_sick_from_vacc'].columns.difference(['doctor_recc_seasonal'])
target_column = 'xyz_vaccine'
X_train = df[feature_columns].values
y_train = df[target_column].values

In [20]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def predict(X, w, b):
    z = np.dot(X, w) + b
    probabilities = sigmoid(z)
    return probabilities
    
    #return (sigmoid(z) >= 0.5).astype(int)  comment out this to get training accuracy

In [21]:
def compute_cost(X, y, w, b, lambda_=0):
    m = len(y)
    h = sigmoid(np.dot(X, w) + b)
    cost = (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h)) + (lambda_ / (2 * m)) * np.sum(w**2)
    return cost

def compute_gradient(X, y, w, b, lambda_=0):
    m = len(y)
    h = sigmoid(np.dot(X, w) + b)
    grad_w = (1 / m) * np.dot(X.T, (h - y)) + (lambda_ / m) * w
    grad_b = (1 / m) * np.sum(h - y)
    return grad_w, grad_b


In [22]:
def gradient_descent(X, y, w, b, cost_function, gradient_function, alpha, iterations, lambda_=0):
    m = len(y)
    J_history = []
    for i in range(iterations):
        grad_w, grad_b = gradient_function(X, y, w, b, lambda_)
        w = w - alpha * grad_w
        b = b - alpha * grad_b
        cost = cost_function(X, y, w, b, lambda_)
        J_history.append(cost)
        if i % (iterations // 10) == 0:
            print(f"Iteration {i}: Cost {cost}")
    return w, b, J_history

In [23]:
np.random.seed(1)
initial_w = 0.01 * np.random.randn(X_train.shape[1])
initial_b = 0
iterations = 40000
alpha = 0.02
lambda_ = 0
w, b, J_history = gradient_descent(X_train, y_train, initial_w, initial_b, 
                                       compute_cost, compute_gradient, alpha, iterations, lambda_)
predictions = predict(X_train, w, b)
train_accuracy = np.mean(predictions == y_train) * 100
print(f"Train Accuracy: {train_accuracy}%")


Iteration 0: Cost 0.643206085456737
Iteration 4000: Cost 0.42772104785207526
Iteration 8000: Cost 0.41237949446324257
Iteration 12000: Cost 0.4058303560316817
Iteration 16000: Cost 0.40261570928495344
Iteration 20000: Cost 0.4009242469676245
Iteration 24000: Cost 0.39999329372550335
Iteration 28000: Cost 0.39946472537684047
Iteration 32000: Cost 0.3991578608234766
Iteration 36000: Cost 0.3989767757677262
Train Accuracy: 0.0%


In [24]:
print(w,b)

[ 0.06828622 -0.03026339  0.17137531 -0.19096359 -0.03720872  0.03430393
 -0.0171833   0.17950374  0.16094119  1.54140411  0.72965089  0.41493688
 -0.0541599   0.61424145 -0.07302411  0.18250441] -5.411713744029924


In [25]:
w = w.reshape(-1,1)

In [26]:
test_df = pd.read_csv("test_set_features.csv")
new_test_df = data.drop(['education', 'race', 'income_poverty', 'health_insurance', 'sex', 'age_group'], axis=1)

new_test_df.fillna(method='ffill', inplace=True)

In [27]:
feature_values = df.loc[:, 'xyz_concern':'opinion_xyz_sick_from_vacc'].columns.difference(['doctor_recc_seasonal'])
X_test = new_test_df[feature_columns].values

In [28]:
Y_prediction_xyz = predict(X_test, w, b)
print(Y_prediction_xyz)

[[0.03428035]
 [0.30963772]
 [0.04889012]
 ...
 [0.22844885]
 [0.04234699]
 [0.11815772]]


In [29]:
final_xyz = pd.DataFrame(Y_prediction_xyz)
final_xyz.columns = ['xyz_vaccine']

In [30]:
final_xyz
final_xyz.to_excel('output1.xlsx', index=False)  
