In [87]:
import numpy as np

In [88]:
# Load the CSV file
data = np.genfromtxt('Data/train.csv', delimiter=',', dtype=None, encoding='utf-8', names=True)

# Get column names
print("Columns:", data.dtype.names)

# Display the first few rows
print(data[:5])


Columns: ('Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',

In [89]:
print("Shape:", data.shape)
print("Data Types:")
for name in data.dtype.names:
    print(f"{name}: {data[name].dtype}")


Shape: (1460,)
Data Types:
Id: int32
MSSubClass: int32
MSZoning: <U7
LotFrontage: <U3
LotArea: int32
Street: <U4
Alley: <U4
LotShape: <U3
LandContour: <U3
Utilities: <U6
LotConfig: <U7
LandSlope: <U3
Neighborhood: <U7
Condition1: <U6
Condition2: <U6
BldgType: <U6
HouseStyle: <U6
OverallQual: int32
OverallCond: int32
YearBuilt: int32
YearRemodAdd: int32
RoofStyle: <U7
RoofMatl: <U7
Exterior1st: <U7
Exterior2nd: <U7
MasVnrType: <U7
MasVnrArea: <U4
ExterQual: <U2
ExterCond: <U2
Foundation: <U6
BsmtQual: <U2
BsmtCond: <U2
BsmtExposure: <U2
BsmtFinType1: <U3
BsmtFinSF1: int32
BsmtFinType2: <U3
BsmtFinSF2: int32
BsmtUnfSF: int32
TotalBsmtSF: int32
Heating: <U5
HeatingQC: <U2
CentralAir: <U1
Electrical: <U5
1stFlrSF: int32
2ndFlrSF: int32
LowQualFinSF: int32
GrLivArea: int32
BsmtFullBath: int32
BsmtHalfBath: int32
FullBath: int32
HalfBath: int32
BedroomAbvGr: int32
KitchenAbvGr: int32
KitchenQual: <U2
TotRmsAbvGrd: int32
Functional: <U4
Fireplaces: int32
FireplaceQu: <U2
GarageType: <U7
Garag

In [90]:
for col in data.dtype.names:
    column_data = data[col]

    if np.issubdtype(column_data.dtype, np.number):
        num_missing = np.sum(np.isnan(column_data))
    else:
        # Check for empty strings OR 'NA' strings
        num_missing = np.sum((column_data == '') | (column_data == 'NA'))
    
    print(f"{col}: {num_missing} missing values")

Id: 0 missing values
MSSubClass: 0 missing values
MSZoning: 0 missing values
LotFrontage: 259 missing values
LotArea: 0 missing values
Street: 0 missing values
Alley: 1369 missing values
LotShape: 0 missing values
LandContour: 0 missing values
Utilities: 0 missing values
LotConfig: 0 missing values
LandSlope: 0 missing values
Neighborhood: 0 missing values
Condition1: 0 missing values
Condition2: 0 missing values
BldgType: 0 missing values
HouseStyle: 0 missing values
OverallQual: 0 missing values
OverallCond: 0 missing values
YearBuilt: 0 missing values
YearRemodAdd: 0 missing values
RoofStyle: 0 missing values
RoofMatl: 0 missing values
Exterior1st: 0 missing values
Exterior2nd: 0 missing values
MasVnrType: 8 missing values
MasVnrArea: 8 missing values
ExterQual: 0 missing values
ExterCond: 0 missing values
Foundation: 0 missing values
BsmtQual: 37 missing values
BsmtCond: 37 missing values
BsmtExposure: 38 missing values
BsmtFinType1: 37 missing values
BsmtFinSF1: 0 missing values
B

In [91]:
def impute_with_mode(column):
    values, counts = np.unique(column[column != 'NA'], return_counts=True)
    mode_val = values[np.argmax(counts)]
    return np.where(column == 'NA', mode_val, column)

# Example for categorical columns
for col in data.dtype.names:
    if data[col].dtype.kind in {'U', 'S'}:
        data[col] = impute_with_mode(data[col])


In [92]:
def impute_with_mean(column):
    mean_val = np.nanmean(column)
    return np.where(column == 'NA', mean_val, column)

# Example for numeric columns
for col in data.dtype.names:
    if np.issubdtype(data[col].dtype, np.number):
        data[col] = impute_with_mean(data[col])


In [93]:
def one_hot_encode(column):
    # Get unique classes
    unique_vals = np.unique(column)
    one_hot = np.zeros((column.shape[0], len(unique_vals)))

    for i, val in enumerate(column):
        col_index = np.where(unique_vals == val)[0][0]
        one_hot[i, col_index] = 1

    return one_hot, unique_vals


In [94]:
categorical_cols = [col for col in data.dtype.names if data[col].dtype.kind in {'U', 'S'}]

one_hot_features = []
for col in categorical_cols:
    if len(np.unique(data[col])) <3:
        onehot, labels = one_hot_encode(data[col])
        one_hot_features.append(onehot)

# Combine all encoded features into one matrix
X_categorical = np.hstack(one_hot_features)


In [95]:
def correlation(x, y):
    x_mean = np.mean(x)
    y_mean = np.mean(y)
    
    numerator = np.sum((x - x_mean) * (y - y_mean))
    denominator = np.sqrt(np.sum((x - x_mean)**2)) * np.sqrt(np.sum((y - y_mean)**2))
    
    return numerator / denominator if denominator != 0 else 0
    
# Select numeric features only
numeric_cols = [col for col in data.dtype.names 
                if np.issubdtype(data[col].dtype, np.number) and col != 'SalePrice']

def normalize(col):
    return (col - np.mean(col)) / np.std(col)

data = {col: normalize(data[col]) for col in numeric_cols + ['SalePrice'] }

target = data['SalePrice']

correlations = {}
for col in numeric_cols:
    corr = correlation(data[col], target)
    correlations[col] = corr

# Display
for col, corr in correlations.items():
    print(f"{col}: correlation with Survived = {corr:.4f}")

threshold = 0.5
selected_features = [col for col, corr in correlations.items() if abs(corr) > threshold]

print("Selected features:", selected_features)


Id: correlation with Survived = -0.0219
MSSubClass: correlation with Survived = -0.0843
LotArea: correlation with Survived = 0.2638
OverallQual: correlation with Survived = 0.7910
OverallCond: correlation with Survived = -0.0779
YearBuilt: correlation with Survived = 0.5229
YearRemodAdd: correlation with Survived = 0.5071
BsmtFinSF1: correlation with Survived = 0.3864
BsmtFinSF2: correlation with Survived = -0.0114
BsmtUnfSF: correlation with Survived = 0.2145
TotalBsmtSF: correlation with Survived = 0.6136
1stFlrSF: correlation with Survived = 0.6059
2ndFlrSF: correlation with Survived = 0.3193
LowQualFinSF: correlation with Survived = -0.0256
GrLivArea: correlation with Survived = 0.7086
BsmtFullBath: correlation with Survived = 0.2271
BsmtHalfBath: correlation with Survived = -0.0168
FullBath: correlation with Survived = 0.5607
HalfBath: correlation with Survived = 0.2841
BedroomAbvGr: correlation with Survived = 0.1682
KitchenAbvGr: correlation with Survived = -0.1359
TotRmsAbvGrd:

In [96]:
len(selected_features)

10

In [97]:
'''# Numeric columns (excluding target)
numeric_cols = [col for col in data.dtype.names 
                if np.issubdtype(data[col].dtype, np.number) and col != 'SalePrice']'''

X_numeric = np.column_stack([data[col] for col in selected_features])

# Final feature matrix
X = np.hstack([X_numeric, X_categorical])

# Target variable
y = data['SalePrice'].reshape(-1, 1)


In [98]:
X.shape

(1460, 18)

In [99]:
# Add intercept
X = np.hstack((np.ones((X.shape[0], 1)), X))

# Theta = (X^T X)^-1 X^T y
theta = np.linalg.inv(X.T @ X) @ X.T @ y

# Predict
y_pred = X @ theta


# Accuracy
accuracy = np.mean(y_pred == y)
print("Accuracy:", accuracy)


Accuracy: 0.0


In [100]:
y_pred, y

(array([[-1.09375 ],
        [-1.640625],
        [-0.953125],
        ...,
        [-0.953125],
        [-2.1875  ],
        [-2.09375 ]]),
 array([[ 0.34727322],
        [ 0.00728832],
        [ 0.53615372],
        ...,
        [ 1.07761115],
        [-0.48852299],
        [-0.42084081]]))

In [101]:
def MAPE(y, y_pred):
    mape = 0
    for i in range(len(y)):
        mape= mape + abs((y[i]-y_pred[i])/y[i])
    mape= mape *100/len(data)
    return mape

In [102]:
MAPE(y, y_pred)

array([58261.93558088])

In [103]:
def R_Square(data):
    rss=0 # residual sum of square
    tss=0 # total sum of square
    avg= np.mean(data[:][0])
    for i in range(len(data)):
        rss= rss + np.square(data[i][0] - data[i][1])
        tss= tss + np.square(data[i][0] - avg)
    r_square= 1- (rss/tss)
    return r_square

In [104]:
R_Square([y, y_pred])

array([0.68511971])

In [107]:
def MSE(data):
    mse=0
    for i in range(len(data)):
        mse=mse+ np.square(data[i][0]-data[i][1])
    mse= mse/len(data)
    return mse
def RMSE(data):
    mse= MSE(data)
    rmse= np.sqrt(mse)
    return rmse

In [108]:
RMSE([y, y_pred])

array([0.45533614])