Build a linear regression model to predict life expectancy using the dataset provided in the link below. You will have to load the data using the library of your choice and drop rows in which some cells are empty. You are expected to use NumPy to create the linear regression model.
https://docs.google.com/spreadsheets/d/1PZyB7ce2W79PC-AgTuXwd8xW0zWwtk6Jqd8Qia2nxPE/edit?usp=sharing

In [None]:
import pandas as pd
import numpy as np

# Fetch the spreadsheet through its CSV export endpoint
url = "https://docs.google.com/spreadsheets/d/1PZyB7ce2W79PC-AgTuXwd8xW0zWwtk6Jqd8Qia2nxPE/export?format=csv"
data = pd.read_csv(url)

# Clean in one pass: discard every row that has at least one empty cell,
# then keep only the numeric columns so the frame can be used as a matrix
data_cleaned = (
    data
    .dropna(axis=0, how="any")
    .select_dtypes(include=[np.number])
)

# Define the linear regression function
def linear_regression(X, y):
    """Fit an ordinary least-squares linear model and return its coefficients.

    A column of ones is prepended to ``X`` so that the first entry of the
    result is the intercept; the remaining entries are the feature
    coefficients in the column order of ``X``.

    Args:
        X: Feature values of shape ``(n_samples, n_features)``. A 1-D array
            is treated as a single feature column.
        y: Target values with ``n_samples`` entries.

    Returns:
        ``theta``, the least-squares solution of ``[1, X] @ theta ~= y``,
        with ``n_features + 1`` entries (intercept first).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Add a column of ones to X for the intercept term
    X = np.column_stack((np.ones(X.shape[0]), X))

    # Solve the least-squares problem directly instead of evaluating
    # inv(X.T @ X) @ X.T @ y. Explicit inversion fails (or returns garbage)
    # when features are collinear and squares the condition number of the
    # problem; lstsq is SVD-based and returns the minimum-norm solution even
    # when X.T @ X is singular.
    theta, *_ = np.linalg.lstsq(X, y, rcond=None)
    return theta

# Separate the target from the features.
# 'Life expectancy' is taken to be the target; every other column is a feature.
y = data_cleaned['Life expectancy'].to_numpy()
X = data_cleaned.drop(columns=['Life expectancy']).to_numpy()

# Fit the model and show the fitted coefficients (intercept first)
theta = linear_regression(X, y)
print("Coefficients (theta):", theta)

# Predict on the same rows the model was fitted on; the design matrix needs
# the same leading column of ones that linear_regression adds internally
X_with_intercept = np.hstack((np.ones((X.shape[0], 1)), X))
y_pred = X_with_intercept.dot(theta)

# Root-mean-squared error of the predictions against the true targets
rmse = np.sqrt(np.square(y - y_pred).mean())
print("RMSE:", rmse)


Coefficients (theta): [ 3.13352794e+02 -1.29874789e-01 -1.64364041e-02  8.88086801e-02
 -9.83150370e-02  3.10650869e-04 -2.32723002e-03 -1.10707305e-05
  3.15528061e-02 -6.66491382e-02  5.66159110e-03  9.61044099e-02
  1.35422582e-02 -4.49509516e-01  2.95015607e-05 -6.52688013e-10
 -2.28139560e-03 -5.31044786e-02  1.04700581e+01  9.06296651e-01]
RMSE: 3.5408604162649677


In [None]:
# Step 1: Upload the CSV file
import io

from google.colab import files
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; expected the Framingham CSV.")

# Step 2: Load the data
import pandas as pd

# Parse the bytes returned by the upload instead of re-reading a hard-coded
# path from disk: when a file of the same name already exists, Colab saves
# the new upload under a different name (e.g. 'framingham (1).csv'), so
# reading 'framingham.csv' from disk could silently load a stale copy.
uploaded_name = 'framingham.csv' if 'framingham.csv' in uploaded else next(iter(uploaded))
data = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

# Display the first few rows of the dataset
# (a bare `data.head()` in the middle of a cell produces no output)
print(data.head())

# Step 3: Preprocess the data
# Drop rows with any missing values
data_cleaned = data.dropna()

# Define the features (X) and the target (y)
X = data_cleaned.drop(columns=['TenYearCHD'])  # Assuming 'TenYearCHD' is the target column
y = data_cleaned['TenYearCHD']

# Split BEFORE scaling so that no statistics of the test rows leak into the
# preprocessing that the model is trained with
from sklearn.model_selection import train_test_split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: fit the scaler on the training rows only, then
# apply the same transformation to the test rows
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training statistics, kept so that code
# relying on the name `X_scaled` continues to work
X_scaled = scaler.transform(X)

# Step 4: Build and Train the Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 5: Evaluate the Model
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model; ROC-AUC is computed from the predicted probability of
# the positive class rather than from the hard 0/1 predictions
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("ROC-AUC:", roc_auc)


Saving framingham.csv to framingham (1).csv
Accuracy: 0.8360655737704918
Precision: 0.5555555555555556
Recall: 0.08196721311475409
ROC-AUC: 0.6994759473259876
