**QUESTION 1**

In [12]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import numpy as np

data = pd.read_csv('USA_Housing.csv')

print(data.columns)

X = data.drop(columns=['Price'])  # Changed 'price' to 'Price'
y = data['Price']  # Changed 'price' to 'Price'

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price'],
      dtype='object')


In [13]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [14]:
kf = KFold(n_splits=5)


In [15]:
beta_matrices = []
r2_scores = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Least Squares Fit
    X_train_biased = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
    beta = np.linalg.inv(X_train_biased.T @ X_train_biased) @ (X_train_biased.T @ y_train)
    beta_matrices.append(beta)

    # Predictions
    X_test_biased = np.hstack((np.ones((X_test.shape[0], 1)), X_test))
    y_pred = X_test_biased @ beta

    # R² Score
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

best_index = np.argmax(r2_scores)
best_beta = beta_matrices[best_index]

print("Best R² Score:", r2_scores[best_index])
print("Best Beta Matrix:", best_beta)


Best R² Score: 0.9208503836977655
Best Beta Matrix: [1.23144707e+06 2.29921558e+05 1.64523054e+05 1.19737507e+05
 1.12425659e+03 1.51317802e+05]


In [16]:
# Train-test split (70% train, 30% test)
train_size = int(0.7 * len(data))
X_train, X_test = X_scaled[:train_size], X_scaled[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Train using the best beta
X_train_biased = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
best_model = np.linalg.inv(X_train_biased.T @ X_train_biased) @ (X_train_biased.T @ y_train)

# Test performance
X_test_biased = np.hstack((np.ones((X_test.shape[0], 1)), X_test))
y_pred_test = X_test_biased @ best_model
test_r2 = r2_score(y_test, y_pred_test)

print("Test R² Score with Best Model:", test_r2)


Test R² Score with Best Model: 0.9176499755975894


**QUESTION 2**

In [20]:

from sklearn.model_selection import train_test_split

# Split the dataset into training (56%), validation (14%), and test (30%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.44, random_state=42)  # 44% for validation + test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.68, random_state=42)  # 30% test

In [21]:
def gradient_descent(X, y, learning_rate, iterations):
    m, n = X.shape
    X_biased = np.hstack((np.ones((m, 1)), X))  # Add bias term
    beta = np.zeros(n + 1)  # Initialize coefficients

    for _ in range(iterations):
        gradients = (2/m) * X_biased.T @ (X_biased @ beta - y)
        beta -= learning_rate * gradients
    return beta


In [24]:
import numpy as np
from sklearn.metrics import r2_score

def gradient_descent(X, y, learning_rate, iterations):
    m, n = X.shape
    X_biased = np.hstack((np.ones((m, 1)), X))  # Add bias term
    beta = np.zeros(n + 1)  # Initialize coefficients

    for _ in range(iterations):
        gradients = (2/m) * X_biased.T @ (X_biased @ beta - y)
        beta -= learning_rate * gradients

        # Check if beta contains NaN and break if True
        if np.isnan(beta).any():
            print(f"Gradient descent diverged with learning rate: {learning_rate}. Try a smaller learning rate.")
            break  # Exit the loop if NaN is encountered

    return beta

# Step 4: Train the Model with Different Learning Rates
learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    # Train using gradient descent
    beta = gradient_descent(X_train.values, y_train.values, lr, 1000)

    # Check if beta contains NaN and skip if True
    if np.isnan(beta).any():
        continue  # Skip to the next learning rate if NaN is encountered

    # R² for validation set
    X_val_biased = np.hstack((np.ones((X_val.shape[0], 1)), X_val))
    y_val_pred = X_val_biased @ beta
    r2_val = r2_score(y_val, y_val_pred)

    # R² for test set
    X_test_biased = np.hstack((np.ones((X_test.shape[0], 1)), X_test))
    y_test_pred = X_test_biased @ beta
    r2_test = r2_score(y_test, y_test_pred)

    results[lr] = (r2_val, r2_test, beta)

# ... (rest of your code remains the same)

Gradient descent diverged with learning rate: 0.001. Try a smaller learning rate.
Gradient descent diverged with learning rate: 0.01. Try a smaller learning rate.
Gradient descent diverged with learning rate: 0.1. Try a smaller learning rate.
Gradient descent diverged with learning rate: 1. Try a smaller learning rate.


  gradients = (2/m) * X_biased.T @ (X_biased @ beta - y)
  beta -= learning_rate * gradients


**QUESTION 3 :- **

In [25]:
import pandas as pd
import numpy as np

# Define the column names
column_names = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
                "num_doors", "body_style", "drive_wheels", "engine_location",
                "wheel_base", "length", "width", "height", "curb_weight",
                "engine_type", "num_cylinders", "engine_size", "fuel_system",
                "bore", "stroke", "compression_ratio", "horsepower",
                "peak_rpm", "city_mpg", "highway_mpg", "price"]

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
data = pd.read_csv(url, names=column_names, na_values='?')


In [31]:
# Check for missing values
print("Missing values before processing:")
print(data.isnull().sum())

# Fill NaN values with the mean for numeric columns
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

# Fill NaN values for non-numeric columns with the mode (most frequent value)
categorical_cols = data.select_dtypes(exclude=[np.number]).columns
for col in categorical_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Check again for missing values to confirm
print("Missing values after processing:")
print(data.isnull().sum())

# Drop rows with NaN in the 'price' column
data.dropna(subset=['price'], inplace=True)

# Verify the final shape of the dataset
print("Final dataset shape:", data.shape)


Missing values before processing:
symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            2
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64
Missing values after processing:
symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [32]:
# (i) For "num_doors" and "num_cylinders"
data['num_doors'] = data['num_doors'].replace({'two': 2, 'four': 4})
data['num_cylinders'] = data['num_cylinders'].replace({'two': 2, 'three': 3, 'four': 4,
                                                       'five': 5, 'six': 6, 'eight': 8,
                                                       'twelve': 12})

# (ii) For "body_style" and "drive_wheels" using dummy encoding
data = pd.get_dummies(data, columns=['body_style', 'drive_wheels'], drop_first=True)

# (iii) For "make", "aspiration", "engine_location", "fuel_type" using label encoding
from sklearn.preprocessing import LabelEncoder

label_cols = ['make', 'aspiration', 'engine_location', 'fuel_type']
le = LabelEncoder()

for col in label_cols:
    data[col] = le.fit_transform(data[col])

# (iv) For "fuel_system"
data['fuel_system'] = data['fuel_system'].str.contains('pfi').astype(int)

# (v) For "engine_type"
data['engine_type'] = data['engine_type'].str.contains('ohc').astype(int)


  data['num_doors'] = data['num_doors'].replace({'two': 2, 'four': 4})
  data['num_cylinders'] = data['num_cylinders'].replace({'two': 2, 'three': 3, 'four': 4,


In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate input features and output variable
X = data.drop('price', axis=1)
y = data['price']

# Scale the input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train the linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse:.2f}, R^2: {r2:.2f}')


MSE: 13422229.59, R^2: 0.80


In [36]:
from sklearn.decomposition import PCA

# Apply PCA
pca = PCA(n_components=0.95)  # Keep 95% of the variance
X_reduced = pca.fit_transform(X_scaled)

# Split the reduced data
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(X_reduced, y, test_size=0.3, random_state=42)

# Train the linear regression model on reduced data
lr_reduced = LinearRegression()
lr_reduced.fit(X_train_reduced, y_train)

# Predict on the test set
y_pred_reduced = lr_reduced.predict(X_test_reduced)

# Evaluate performance
mse_reduced = mean_squared_error(y_test, y_pred_reduced)
r2_reduced = r2_score(y_test, y_pred_reduced)

print(f'Reduced MSE: {mse_reduced:.2f}, Reduced R^2: {r2_reduced:.2f}')


Reduced MSE: 17154268.25, Reduced R^2: 0.75
