In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so every later np.random call is reproducible.
SEED = 42
np.random.seed(SEED)

In [3]:
# Load the dataset; the path is resolved relative to the working directory.
DATA_PATH = "mental_health_digital_behavior_data.csv"
df = pd.read_csv(DATA_PATH)

In [4]:
# Separate the regression target from the remaining columns, which become the
# feature matrix. Both are pulled out of pandas as plain NumPy arrays.
TARGET_COLUMN = "digital_wellbeing_score"
y = df[TARGET_COLUMN].values
X = df.drop(columns=[TARGET_COLUMN]).values

In [5]:
# Min-max scale each feature column to [0, 1] independently.
# axis=0 is essential: without it min()/max() reduce over the whole matrix and
# every column is scaled by the same global range, so columns with a small
# numeric range stay squashed near a single value.
x_min = X.min(axis=0)
x_max = X.max(axis=0)

# A constant column has zero range; divide by 1 there so it maps to 0 instead
# of producing NaN/inf through a 0/0 division.
x_range = np.where(x_max > x_min, x_max - x_min, 1.0)

# NOTE(review): the statistics are computed on the full dataset, before the
# train/validation/test split, so held-out rows influence the scaling.
X = (X - x_min) / x_range

def split_data(X, y, train_ratio, val_ratio, test_ratio):
    """Shuffle X and y together and cut them into train/validation/test parts.

    Parameters
    ----------
    X, y : array-like supporting integer-array indexing (e.g. np.ndarray)
        Samples and targets; must have the same length.
    train_ratio, val_ratio, test_ratio : float
        Non-negative fractions that must sum to 1 (e.g. 0.6, 0.2, 0.2).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises
    ------
    ValueError
        If a ratio is negative, the ratios do not sum to 1 (for instance when
        percentages such as 60/20/20 are passed), or X and y differ in length.

    Notes
    -----
    The shuffle uses NumPy's global RNG, so every call produces a different
    partition unless the seed is reset beforehand. Train and validation sizes
    are truncated with ``int``; the test part receives all remaining rows.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(r < 0 for r in ratios) or not np.isclose(sum(ratios), 1.0):
        # Without this check, out-of-range ratios silently yield empty
        # validation/test sets because the slices below just run off the end.
        raise ValueError(
            f"ratios must be non-negative fractions summing to 1, got {ratios}"
        )

    n = len(X)
    if len(y) != n:
        raise ValueError(f"X and y must have the same length, got {n} and {len(y)}")

    # One shared permutation keeps every row of X aligned with its target.
    idx = np.random.permutation(n)
    X, y = X[idx], y[idx]

    train_size, val_size = int(train_ratio * n), int(val_ratio * n)
    val_end = train_size + val_size

    X_train, y_train = X[:train_size], y[:train_size]
    X_val, y_val = X[train_size:val_end], y[train_size:val_end]
    X_test, y_test = X[val_end:], y[val_end:]
    return X_train, y_train, X_val, y_val, X_test, y_test

def rss(y_true, y_pred):
    """Return the mean of the squared residuals between targets and predictions.

    Despite the name, the squared residuals are averaged, not summed, so the
    value does not grow with the number of samples.
    """
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))



In [6]:
# data = []
# data.append(split_data(X,y,60,20,20))
# data.append(split_data(X,y,70,10,20))
# data.append(split_data(X,y,80,10,10))

In [7]:
# print(data)

In [8]:
def batch_gd(X, y, alpha, rho, max_epochs=1000):
    """Fit linear-model weights with full-batch gradient descent.

    Parameters
    ----------
    X : np.ndarray of shape (m, n)
        Feature matrix.
    y : np.ndarray of shape (m,)
        Targets.
    alpha : float
        Learning rate.
    rho : float
        Stop once the loss changes by less than this between two epochs.
    max_epochs : int
        Upper bound on the number of updates.

    Returns
    -------
    (theta, rss_history)
        ``theta`` is the weight vector after the last update. Each history
        entry is the loss measured with the weights *before* that epoch's
        update, so the last entry lags ``theta`` by one step.
    """
    n_samples, n_features = X.shape
    theta = np.zeros(n_features)
    rss_history = []
    last_loss = float("inf")

    for _ in range(max_epochs):
        predictions = X.dot(theta)

        # Record the loss of the current weights before moving them.
        current_loss = rss(y, predictions)
        rss_history.append(current_loss)

        residuals = predictions - y
        gradient = (1 / n_samples) * X.T.dot(residuals)
        theta -= alpha * gradient

        if abs(last_loss - current_loss) < rho:
            break
        last_loss = current_loss

    return theta, rss_history

def predict(X, theta):
    """Return the linear-model predictions ``X @ theta``.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix (a single sample of shape (n,) also works).
    theta : array-like of shape (n,)
        Weight vector.

    Returns
    -------
    np.ndarray of shape (m,), or a scalar for a single sample.
    """
    # The matrix product is required here; ``X.theta`` is an attribute lookup
    # that raises AttributeError on a NumPy array.
    return np.asarray(X).dot(theta)

In [9]:
# alpha = [0.0001,0.1]
# rho = [0.001,0.01]

# optimal = []
# optimal_rss = 9999999999999
# for i in data:
#     X_train, y_train, X_val, y_val, X_test, y_test = i[0]
#     for a in alpha:
#         for b in rho:
#             theta, rss_history = batch_gd(X_test,y_test,a,b,10000)
#             rss_curr = rss(y_val, predict(X_val,theta))
#             if(rss_curr < optimal_rss):
#                 optimal_rss = rss_curr
#                 optimal = [a,b]


# hypothesis = []
# for i in data:
#     X_train, y_train, X_val, y_val, X_test, y_test = i[0] 
#     theta, rss_history = batch_gd(X_test,y_test,a,b,10000)
#     hypothesis.append(theta)

# rows = []
# i = 0
# for theta in hypothesis:
#     rows.append([i,X_train,y_train,predict(X_train,theta)])
#     rows.append([i,X_test,y_test,predict(X_test,theta)])
#     i += 1

# dataFrame = pd.DataFrame(rows , columns= ["split_number" , "X_train", "y actual" , "y predicted"])
# # dataFrame.to_excel("result/output.xlsx")


In [10]:
# print(dataFrame)

In [11]:
# Grid search for batch gradient descent.
# For each train/val/test split, every (alpha, rho) pair is trained on the
# training part and ranked by validation loss. The winner is then scored on the
# train and test parts and its training curve is saved.

# Make sure the output folders exist so a fresh checkout survives "Run All".
os.makedirs("plots", exist_ok=True)
os.makedirs("result", exist_ok=True)

splits = [(0.6, 0.2, 0.2), (0.7, 0.1, 0.2), (0.8, 0.1, 0.1)]
alphas = [0.0001, 0.1]
rhos = [0.001, 0.01]

results = []
best_params_final = None
best_val_rss_final = float("inf")  # lowest validation loss seen over all splits
for split in splits:
    X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, *split)
    best_val_rss = float("inf")
    best_params = None
    best_theta = None
    best_history = None

    for alpha in alphas:
        for rho in rhos:
            theta, history = batch_gd(X_train, y_train, alpha, rho)
            y_val_pred = X_val.dot(theta)
            val_rss = rss(y_val, y_val_pred)
            if val_rss < best_val_rss:
                best_val_rss = val_rss
                best_params = (alpha, rho)
                best_theta = theta
                best_history = history

    if best_theta is None:
        # Every comparison against a NaN loss is False, so a fully diverged
        # grid would otherwise fail later with an opaque error on None.
        raise RuntimeError(f"no (alpha, rho) pair gave a finite validation loss for split {split}")

    # Keep the overall winner across splits. Assigning inside the inner loop
    # would leave whatever the last split happened to pick.
    if best_val_rss < best_val_rss_final:
        best_val_rss_final = best_val_rss
        best_params_final = best_params

    y_test_pred = X_test.dot(best_theta)
    test_rss = rss(y_test, y_test_pred)
    y_train_pred = X_train.dot(best_theta)
    train_rss = rss(y_train, y_train_pred)

    results.append([split, best_params, train_rss, test_rss])

    # Explicit figure/axes so the curve never lands on a figure left open elsewhere.
    fig, ax = plt.subplots()
    ax.plot(best_history)
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Mean RSS")
    ax.set_title(f"Batch GD Training Curve Split={split}")
    fig.savefig(f"plots/batchgd_{split}.png")
    plt.close(fig)


df_results = pd.DataFrame(results, columns=["Split", "Best Params (alpha,rho)", "Train RSS", "Test RSS"])
df_results.to_csv("result/final_results_gd.csv", index=False)


In [12]:
# Retrain batch GD on each split with the selected hyper-parameters and export
# the train/test predictions next to the true targets.
# NOTE(review): split_data reshuffles on every call, so these partitions are
# not the ones the hyper-parameters were selected on.
results = []

alpha, rho = best_params_final
for split in splits:
    X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, *split)
    theta, history = batch_gd(X_train, y_train, alpha, rho)

    # Predict with the weights fitted on this split's training rows.
    # `best_theta` is a leftover from the grid search and was trained on a
    # different partition, so it must not be used here.
    y_test_pred = X_test.dot(theta)
    y_train_pred = X_train.dot(theta)

    results.append([split, y_test_pred, y_test, y_train_pred, y_train])


# NOTE(review): each cell holds a whole NumPy array, so the CSV stores its
# string repr, which NumPy abbreviates with "..." for long arrays. Consider
# writing one row per sample instead.
df_results = pd.DataFrame(results, columns=["Split", "y_test_pred", "y_test", "y_train_pred", "y_train"])
df_results.to_csv("result/train_test_output_gd.csv", index=False)


In [13]:
def minibatch_gd(X, y, alpha, rho, batch_size=16, max_epochs=1000):
    """Fit linear-model weights with mini-batch gradient descent.

    Parameters
    ----------
    X : np.ndarray of shape (m, n)
        Feature matrix.
    y : np.ndarray of shape (m,)
        Targets.
    alpha : float
        Learning rate.
    rho : float
        Stop once the full-data loss changes by less than this between epochs.
    batch_size : int
        Number of samples per update; must be at least 1. The last batch of an
        epoch may be smaller.
    max_epochs : int
        Upper bound on the number of passes over the data.

    Returns
    -------
    (theta, rss_history)
        Final weights and the loss on the full (X, y) after each epoch.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step makes range() below empty: no update would ever run
        # and an all-zero theta would be returned as if training had converged.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    m, n = X.shape
    theta = np.zeros(n)
    prev_error = float("inf")
    rss_history = []

    for epoch in range(max_epochs):
        # Reshuffle each epoch (global NumPy RNG) so batches differ between passes.
        indices = np.random.permutation(m)
        X_shuffled, y_shuffled = X[indices], y[indices]

        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i+batch_size]
            y_batch = y_shuffled[i:i+batch_size]

            y_pred = X_batch.dot(theta)
            error = y_pred - y_batch
            # Average over the actual batch length; the tail batch can be short.
            grad = (1/len(y_batch)) * X_batch.T.dot(error)
            theta -= alpha * grad

        rss_val = rss(y, X.dot(theta))
        rss_history.append(rss_val)

        if abs(prev_error - rss_val) < rho:
            break
        prev_error = rss_val

    return theta, rss_history

In [14]:
# Grid search for mini-batch gradient descent.
# For each train/val/test split, every (alpha, rho) pair is trained on the
# training part and ranked by validation loss. The winner is then scored on the
# train and test parts and its training curve is saved.

# Make sure the output folders exist so a fresh checkout survives "Run All".
os.makedirs("plots", exist_ok=True)
os.makedirs("result", exist_ok=True)

splits = [(0.6, 0.2, 0.2), (0.7, 0.1, 0.2), (0.8, 0.1, 0.1)]
alphas = [0.0001, 0.1]
rhos = [0.001, 0.01]

results = []
best_params_final = None
best_val_rss_final = float("inf")  # lowest validation loss seen over all splits

# NOTE(review): reading the batch size interactively blocks unattended runs;
# consider a constant in a configuration cell instead.
b_size = int(input("Mini-batch size: "))
if b_size < 1:
    raise ValueError(f"mini-batch size must be a positive integer, got {b_size}")

for split in splits:
    X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, *split)
    best_val_rss = float("inf")
    best_params = None
    best_theta = None
    best_history = None

    for alpha in alphas:
        for rho in rhos:
            theta, history = minibatch_gd(X_train, y_train, alpha, rho, batch_size=b_size)
            y_val_pred = X_val.dot(theta)
            val_rss = rss(y_val, y_val_pred)
            if val_rss < best_val_rss:
                best_val_rss = val_rss
                best_params = (alpha, rho)
                best_theta = theta
                best_history = history

    if best_theta is None:
        # Every comparison against a NaN loss is False, so a fully diverged
        # grid would otherwise fail later with an opaque error on None.
        raise RuntimeError(f"no (alpha, rho) pair gave a finite validation loss for split {split}")

    # Keep the overall winner across splits. Assigning inside the inner loop
    # would leave whatever the last split happened to pick.
    if best_val_rss < best_val_rss_final:
        best_val_rss_final = best_val_rss
        best_params_final = best_params

    y_test_pred = X_test.dot(best_theta)
    test_rss = rss(y_test, y_test_pred)
    y_train_pred = X_train.dot(best_theta)
    train_rss = rss(y_train, y_train_pred)

    results.append([split, best_params, train_rss, test_rss])

    # Explicit figure/axes so the curve never lands on a figure left open elsewhere.
    fig, ax = plt.subplots()
    ax.plot(best_history)
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Mean RSS")
    ax.set_title(f"Mini_Batch GD Training Curve Split={split}")
    fig.savefig(f"plots/batch_mini_batch_gd_{split}.png")
    plt.close(fig)


df_results = pd.DataFrame(results, columns=["Split", "Best Params (alpha,rho)", "Train RSS", "Test RSS"])
df_results.to_csv("result/final_results_mini_batch_gd.csv", index=False)


In [15]:
# Retrain mini-batch GD on each split with the selected hyper-parameters and
# export the train/test predictions next to the true targets.
# NOTE(review): split_data reshuffles on every call, so these partitions are
# not the ones the hyper-parameters were selected on.
results = []

alpha, rho = best_params_final
for split in splits:
    X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, *split)
    theta, history = minibatch_gd(X_train, y_train, alpha, rho, batch_size=b_size)

    # Predict with the weights fitted on this split's training rows.
    # `best_theta` is a leftover from the grid search and was trained on a
    # different partition, so it must not be used here.
    y_test_pred = X_test.dot(theta)
    y_train_pred = X_train.dot(theta)

    results.append([split, y_test_pred, y_test, y_train_pred, y_train])


# NOTE(review): each cell holds a whole NumPy array, so the CSV stores its
# string repr, which NumPy abbreviates with "..." for long arrays. Consider
# writing one row per sample instead.
df_results = pd.DataFrame(results, columns=["Split", "y_test_pred", "y_test", "y_train_pred", "y_train"])
df_results.to_csv("result/train_test_output_mini_batch_gd.csv", index=False)


In [16]:

# Shape check: reshape(1, -1) turns one sample from a flat vector into a
# single-row matrix, the form used inside the per-sample update.
xi = X[0].reshape(1, -1)
print(X[0].shape, xi.shape, sep="\n")

(8,)
(1, 8)


In [17]:
def stochastic_gd(X, y, alpha, rho, max_epochs=1000):
    """Fit linear-model weights with per-sample (stochastic) gradient descent.

    Parameters
    ----------
    X : np.ndarray of shape (m, n)
        Feature matrix.
    y : np.ndarray of shape (m,)
        Targets.
    alpha : float
        Learning rate.
    rho : float
        Stop once the full-data loss changes by less than this between epochs.
    max_epochs : int
        Upper bound on the number of passes over the data.

    Returns
    -------
    (theta, rss_history)
        Final weights and the loss on the full (X, y) after each epoch.

    Notes
    -----
    Samples are visited in their stored order on every epoch; no shuffling is
    performed here.
    """
    n_samples, n_features = X.shape
    theta = np.zeros(n_features)
    rss_history = []
    last_loss = float("inf")

    for _ in range(max_epochs):
        # One weight update per sample.
        for idx in range(n_samples):
            sample = X[idx].reshape(1, -1)
            residual = sample.dot(theta) - y[idx]
            theta -= alpha * (residual * sample).ravel()

        epoch_loss = rss(y, X.dot(theta))
        rss_history.append(epoch_loss)

        if abs(last_loss - epoch_loss) < rho:
            break
        last_loss = epoch_loss

    return theta, rss_history

In [18]:
# Grid search for stochastic gradient descent.
# For each train/val/test split, every (alpha, rho) pair is trained on the
# training part and ranked by validation loss. The winner is then scored on the
# train and test parts and its training curve is saved.

# Make sure the output folders exist so a fresh checkout survives "Run All".
os.makedirs("plots", exist_ok=True)
os.makedirs("result", exist_ok=True)

splits = [(0.6, 0.2, 0.2), (0.7, 0.1, 0.2), (0.8, 0.1, 0.1)]
alphas = [0.0001, 0.1]
rhos = [0.001, 0.01]

results = []
best_params_final = None
best_val_rss_final = float("inf")  # lowest validation loss seen over all splits
for split in splits:
    X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, *split)
    best_val_rss = float("inf")
    best_params = None
    best_theta = None
    best_history = None

    for alpha in alphas:
        for rho in rhos:
            theta, history = stochastic_gd(X_train, y_train, alpha, rho)
            y_val_pred = X_val.dot(theta)
            val_rss = rss(y_val, y_val_pred)
            if val_rss < best_val_rss:
                best_val_rss = val_rss
                best_params = (alpha, rho)
                best_theta = theta
                best_history = history

    if best_theta is None:
        # Every comparison against a NaN loss is False, so a fully diverged
        # grid would otherwise fail later with an opaque error on None.
        raise RuntimeError(f"no (alpha, rho) pair gave a finite validation loss for split {split}")

    # Keep the overall winner across splits. Assigning inside the inner loop
    # would leave whatever the last split happened to pick.
    if best_val_rss < best_val_rss_final:
        best_val_rss_final = best_val_rss
        best_params_final = best_params

    y_test_pred = X_test.dot(best_theta)
    test_rss = rss(y_test, y_test_pred)
    y_train_pred = X_train.dot(best_theta)
    train_rss = rss(y_train, y_train_pred)

    results.append([split, best_params, train_rss, test_rss])

    # Explicit figure/axes so the curve never lands on a figure left open elsewhere.
    fig, ax = plt.subplots()
    ax.plot(best_history)
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Mean RSS")
    # The title names the optimiser actually plotted (it used to say "Mini_Batch GD").
    ax.set_title(f"Stochastic GD Training Curve Split={split}")
    fig.savefig(f"plots/stochastic_gd_{split}.png")
    plt.close(fig)


df_results = pd.DataFrame(results, columns=["Split", "Best Params (alpha,rho)", "Train RSS", "Test RSS"])
# NOTE(review): the "mini_" in this file name looks like a copy-paste leftover;
# the path is kept unchanged in case something downstream reads it.
df_results.to_csv("result/final_results_mini_stochastic.csv", index=False)


In [19]:
# Retrain stochastic GD on each split with the selected hyper-parameters and
# export the train/test predictions next to the true targets.
# NOTE(review): split_data reshuffles on every call, so these partitions are
# not the ones the hyper-parameters were selected on.
results = []
alpha, rho = best_params_final
for split in splits:
    X_train, y_train, X_val, y_val, X_test, y_test = split_data(X, y, *split)
    theta, history = stochastic_gd(X_train, y_train, alpha, rho)

    # Predict with the weights fitted on this split's training rows.
    # `best_theta` is a leftover from the grid search and was trained on a
    # different partition, so it must not be used here.
    y_test_pred = X_test.dot(theta)
    y_train_pred = X_train.dot(theta)

    results.append([split, y_test_pred, y_test, y_train_pred, y_train])


# NOTE(review): each cell holds a whole NumPy array, so the CSV stores its
# string repr, which NumPy abbreviates with "..." for long arrays. Consider
# writing one row per sample instead.
df_results = pd.DataFrame(results, columns=["Split", "y_test_pred", "y_test", "y_train_pred", "y_train"])
df_results.to_csv("result/train_test_output_stochastic_gd.csv", index=False)
