In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

#### Task 1:
##### Having imported the data, write code to study the following data characteristics:
        a) number of rows and columns for the independent variables
        b) labels of the columns for the independent variables and their meaning
        c) target variable values and their meaning

#### Solution to Task 1

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset
housing = fetch_california_housing()

# Task 1: Data Characteristics
# a) Number of rows and columns for the independent variables
num_rows, num_cols = housing.data.shape
print("Number of rows for independent variables:", num_rows)
print("Number of columns for independent variables:", num_cols)

# b) Labels of the columns for the independent variables and their meaning
column_labels = housing.feature_names
column_meaning = housing.DESCR.split("Attribute Information:")[-1].split(":Missing Attribute Values:")[0].strip().split("\n")[1:]
columns_info = dict(zip(column_labels, column_meaning))
print("Labels and meanings of columns for independent variables:")
for label, meaning in columns_info.items():
    print(f"{label}: {meaning}")

# c) Target variable values and their meaning
target_values = housing.target
target_name = housing.target_names[0]
target_meaning = housing.DESCR.split("The target variable is ")[-1].split("\n")[0]
print("\nTarget variable values:")
print(target_values[:5])  # Display first 5 values
print("\nTarget variable name:", target_name)
print("Meaning of target variable:", target_meaning)


Number of rows for independent variables: 20640
Number of columns for independent variables: 8
Labels and meanings of columns for independent variables:
MedInc:         - HouseAge      median house age in block group
HouseAge:         - AveRooms      average number of rooms per household
AveRooms:         - AveBedrms     average number of bedrooms per household
AveBedrms:         - Population    block group population
Population:         - AveOccup      average number of household members
AveOccup:         - Latitude      block group latitude
Latitude:         - Longitude     block group longitude

Target variable values:
[4.526 3.585 3.521 3.413 3.422]

Target variable name: MedHouseVal
Meaning of target variable: the median house value for California districts,


In [2]:
housing['feature_names']

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [3]:
housing['target']

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

#### Task 2:
##### Write the code to train prediction models with a data split ratio 80/20 between training and test data. Your code should also consider reshuffling of the rows.

#### Solution to Task 2

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the data from the CSV file
file_path = r'D:\CLIENT Asignment\california_housing_data.csv'
data = pd.read_csv(file_path)

# Splitting the data into features (X) and target (y)
X = data.drop(columns='target')
y = data['target']

# Splitting the data into training and test sets with shuffling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# Standardizing the features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the model (Example: Linear Regression)
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.5078085968729842


#### Task 2:
##### Having performed scaling for all values, you should develop the following regression models:
        a) a Linear Regression model by displaying its intercept, trained coefficients, RMSE score as fitness metric.
        b) a Stochastic Gradient Descent with Warm Restarts model, which is a variant of the stochastic gradient descent (SGD) optimisation algorithm commonly used in machine learning for training linear models, including linear regression models. You should display its intercept, trained coefficients, RMSE score as fitness metric. 
        b.1) For the model above, you should use 10 iterations as maximum and set both tol and eta, which are essential hyperparameters that need to be tuned carefully to achieve the desired balance between convergence speed and solution quality, as you think appropriate.
        c) Prepare the data and develop a model of a higher degree polynomial, for instance, degree = 2. You should display its intercept, trained coefficients, RMSE score as fitness metric.

#### Solution to Task 2

####  Linear Regression model by displaying its intercept, trained coefficients, RMSE score as fitness metric.

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the data from the CSV file
file_path = r'D:\CLIENT Asignment\california_housing_data.csv'
data = pd.read_csv(file_path)

# Splitting the data into features (X) and target (y)
X = data.drop(columns='target')
y = data['target']

# Splitting the data into training and test sets with shuffling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# Standardizing the features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Getting intercept and coefficients
intercept = model.intercept_
coefficients = model.coef_

# Making predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Displaying results
print("Intercept:", intercept)
print("Coefficients:", coefficients)
print("RMSE Score:", rmse)


Intercept: 2.0682855935077566
Coefficients: [ 0.82502434  0.1184549  -0.25900377  0.3030255  -0.00631966 -0.03895182
 -0.91015401 -0.88707079]
RMSE Score: 0.7431262670963512


####  a Stochastic Gradient Descent with Warm Restarts model, which is a variant of the stochastic gradient descent (SGD) optimisation algorithm commonly used in machine learning for training linear models, including linear regression models. You should display its intercept, trained coefficients, RMSE score as fitness metric. 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the data from the CSV file
file_path = r'D:\CLIENT Asignment\california_housing_data.csv'
data = pd.read_csv(file_path)

# Splitting the data into features (X) and target (y)
X = data.drop(columns='target')
y = data['target']

# Splitting the data into training and test sets with shuffling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# Standardizing the features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the Stochastic Gradient Descent with Warm Restarts model
model = SGDRegressor(max_iter=1000, warm_start=True)
model.fit(X_train_scaled, y_train)

# Getting intercept and coefficients
intercept = model.intercept_
coefficients = model.coef_

# Making predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Displaying results
print("Intercept:", intercept)
print("Coefficients:", coefficients)
print("RMSE Score:", rmse)


####  For the model above, you should use 10 iterations as maximum and set both tol and eta, which are essential hyperparameters that need to be tuned carefully to achieve the desired balance between convergence speed and solution quality, as you think appropriate.

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the data from the CSV file
file_path = r'D:\CLIENT Asignment\california_housing_data.csv'
data = pd.read_csv(file_path)

# Splitting the data into features (X) and target (y)
X = data.drop(columns='target')
y = data['target']

# Splitting the data into training and test sets with shuffling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

# Standardizing the features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training the Stochastic Gradient Descent with Warm Restarts model
model = SGDRegressor(max_iter=10, warm_start=True, tol=1e-4, eta0=0.01)
model.fit(X_train_scaled, y_train)

# Getting intercept and coefficients
intercept = model.intercept_
coefficients = model.coef_

# Making predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculating RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Displaying results
print("Intercept:", intercept)
print("Coefficients:", coefficients)
print("RMSE Score:", rmse)


Intercept: [-8.83165267]
Coefficients: [-1.94931212e+01  5.63726955e+00  3.33676506e+01 -1.34036831e+01
  1.59871111e+01 -1.09750788e+03 -1.55752235e+01 -9.23622965e-01]
RMSE Score: 1054.3222563721017


In [None]:
####