#### Week 6 Exercise 6.2

- **Author:** Rex Gayas
- **Course & Section:** DSC320-T301 Math for Data Science (2243-1)
- **Date:** 20 January 2024

##### Solution for Exercise 1 on Data Normalization

In [31]:
import numpy as np

def normalize_vector(vector):
    """
    Min-max normalizes a vector so all values are scaled to the range [0, 1].

    Each element x is mapped to (x - min) / (max - min).

    Args:
    vector (array-like): A vector of numerical data. Objects that already
        carry a dtype (NumPy arrays, pandas Series) are used as given; plain
        sequences such as lists or tuples are converted to a float64 array.

    Returns:
    numpy.array: Normalized vector with values in the range [0, 1]. If all
        elements are identical, an array of zeros with the same shape is
        returned instead of dividing by zero.
    """
    if not hasattr(vector, "dtype"):
        # Plain Python sequences have no shape and do not support
        # element-wise subtraction, so promote them to an array first.
        vector = np.asarray(vector, dtype=np.float64)
    elif isinstance(vector.dtype, np.dtype) and vector.dtype.kind in "iub":
        # Fixed-width integers wrap around silently: for int8 data such as
        # [-100, 0, 100], both `max - min` and `vector - min` overflow.
        # Doing the arithmetic in float64 avoids that.
        vector = vector.astype(np.float64)

    min_val = np.min(vector)
    max_val = np.max(vector)
    value_range = max_val - min_val

    # Avoid division by zero if all vector elements are the same
    if value_range == 0:
        return np.zeros(vector.shape)

    return (vector - min_val) / value_range

# Run the normalizer on a small sample and show the input next to the result
vector_example = np.array((1, 3, 7, 9))
normalized_vector = normalize_vector(vector=vector_example)
print("Original Vector:", vector_example, sep=" ")
print("Normalized Vector:", normalized_vector, sep=" ")


Original Vector: [1 3 7 9]
Normalized Vector: [0.   0.25 0.75 1.  ]


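Check using the min-max normalization formula $x' = \dfrac{x - \min}{\max - \min}$, with $\min = 1$ and $\max = 9$:

- $x = 1$: $\dfrac{1 - 1}{9 - 1} = \dfrac{0}{8} = 0$
- $x = 3$: $\dfrac{3 - 1}{9 - 1} = \dfrac{2}{8} = 0.25$
- $x = 7$: $\dfrac{7 - 1}{9 - 1} = \dfrac{6}{8} = 0.75$
- $x = 9$: $\dfrac{9 - 1}{9 - 1} = \dfrac{8}{8} = 1$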

##### Solution for Exercise 2 on Data Standardization

In [32]:
def standardize_vector(vector):
    """
    Standardizes a vector by calculating the z-score for each element.

    Each element x is mapped to (x - mean) / std_dev, where std_dev is the
    value returned by np.std with its default settings.

    Args:
    vector (array-like): A vector of numerical data. Objects that already
        carry a dtype (NumPy arrays, pandas Series) are used as given; plain
        sequences such as lists or tuples are converted to a float64 array.

    Returns:
    numpy.array: Standardized vector. If all elements are identical (or the
        standard deviation is exactly zero), an array of zeros with the same
        shape is returned instead of dividing by zero.
    """
    if not hasattr(vector, "dtype"):
        # Plain Python sequences have no shape and do not support
        # element-wise subtraction, so promote them to an array first.
        vector = np.asarray(vector, dtype=np.float64)
    elif isinstance(vector.dtype, np.dtype) and vector.dtype.kind in "iub":
        # Do the arithmetic in float64 regardless of the integer width.
        vector = vector.astype(np.float64)

    mean = np.mean(vector)
    std_dev = np.std(vector)

    # Detect a constant vector by comparing min and max rather than relying
    # on std_dev == 0 alone: rounding in the mean can leave a tiny non-zero
    # standard deviation (e.g. for [0.1, 0.1, 0.1]), and dividing by it
    # would yield meaningless z-scores instead of zeros.
    # The size guard keeps empty input away from np.min / np.max.
    is_constant = vector.size > 0 and np.min(vector) == np.max(vector)

    # Avoid division by zero in case of a constant vector
    if is_constant or std_dev == 0:
        return np.zeros(vector.shape)

    return (vector - mean) / std_dev

# Run the standardizer on the same small sample and show input next to result
vector_example = np.array((1, 3, 7, 9))
standardized_vector = standardize_vector(vector=vector_example)
print("Original Vector:", vector_example, sep=" ")
print("Standardized Vector:", standardized_vector, sep=" ")


Original Vector: [1 3 7 9]
Standardized Vector: [-1.26491106 -0.63245553  0.63245553  1.26491106]


In [33]:
# Hand-check of the z-scores for the vector [1, 3, 7, 9]:
#   mean     = (1 + 3 + 7 + 9) / 4 = 5
#   variance = (16 + 4 + 4 + 16) / 4 = 10, so std_dev = sqrt(10)

mean = 5
std_dev = np.sqrt(10)

# z = (x - mean) / std_dev for each element, unpacked in vector order
manual_1, manual_3, manual_7, manual_9 = (
    (value - mean) / std_dev for value in (1, 3, 7, 9)
)

# Bare tuple as the last expression so the notebook displays all four values
manual_1, manual_3, manual_7, manual_9


(-1.2649110640673518,
 -0.6324555320336759,
 0.6324555320336759,
 1.2649110640673518)

#### Working with a Dataframe

##### Solution for Exercise 3(a)

In [34]:
import pandas as pd

# Read the California housing CSV into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine; the cell
# will fail elsewhere unless the file is placed at the same location.
file_path = "D:/ALPHA/Dynamic Folder/Bellevue/Winter 2023/Math for Data Science/Week 6/calif_housing_data.csv"
df = pd.read_csv(file_path)

# The number of rows equals the length of the frame
row_count = len(df)

print("Number of rows in the dataset:", row_count)


Number of rows in the dataset: 20640


##### Solution for Exercise 3(b)

In [35]:
# The target vector is the median house value column; preview its first 5 entries
target_vector = df["median_house_value"]
print("Target Vector:", target_vector.head(n=5))

Target Vector: 0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
Name: median_house_value, dtype: float64


##### Solution for Exercise 3(c)

In [36]:
# Derive a new column: total bedrooms divided by number of households, row by row
df['bedrooms_per_household'] = df['total_bedrooms'].div(df['households'])

This feature is the ratio of total bedrooms to households in a given block, i.e. the average number of bedrooms per household. It serves as a proxy for typical home size: a higher value suggests larger homes with more bedrooms, while a lower value suggests smaller homes.

##### Solution for Exercise 3(d)

In [37]:
# Build a new DataFrame holding only the three requested feature columns
features_df = df[[
    'housing_median_age',
    'median_income',
    'bedrooms_per_household',
]]

# Preview the first 5 rows to confirm the selection
print("New DataFrame with selected features :")
print(features_df.head(5))


New DataFrame with selected features :
   housing_median_age  median_income  bedrooms_per_household
0                  41         8.3252                1.023810
1                  21         8.3014                0.971880
2                  52         7.2574                1.073446
3                  52         5.6431                1.073059
4                  52         3.8462                1.081081


##### Solution for Exercise 3(e)

In [38]:
# Compute z-scores column by column; apply() hands each column to
# standardize_vector (axis=0 is the default, so it is left implicit)
standardized_features = features_df.apply(standardize_vector)

# Preview the first 5 standardized rows
print("Standardized Features :")
print(standardized_features.head(5))

Standardized Features :
   housing_median_age  median_income  bedrooms_per_household
0            0.982143       2.344766               -0.153863
1           -0.607019       2.332238               -0.262936
2            1.856182       1.782699               -0.049604
3            1.856182       0.932968               -0.050417
4            1.856182      -0.012881               -0.033568
