<a href="https://colab.research.google.com/github/Suren1206/HPP_Ver2/blob/main/001_HPP_Final_Version2_Clean_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## Load the training data. In Colab the CSV first has to be uploaded into the runtime.
import io

from google.colab import files
import pandas as pd

# files.upload() returns a dict mapping each saved file name to its contents (bytes).
uploaded = files.upload()

# Read the bytes that were just uploaded rather than a hard-coded path: if a
# train.csv already exists in the runtime, Colab can save the new upload under
# a different name (e.g. "train (1).csv"), and reading 'train.csv' from disk
# would then silently load the older copy.
if uploaded:
    uploaded_name = 'train.csv' if 'train.csv' in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    # Nothing was uploaded (e.g. the dialog was cancelled): fall back to a copy on disk.
    df = pd.read_csv('train.csv')

## Features initially picked for the model: 'TotalBsmtSF', 'TotRmsAbvGrd', 'YrSold', 'Foundation', 'GarageArea'
## Derived feature: age of the house at sale time (year sold minus year built)
df['HouseAge'] = df['YrSold'].sub(df['YearBuilt'])

## 'Foundation' is categorical, so expand it into one-hot indicator columns
df_encoded = pd.get_dummies(data=df, columns=['Foundation'])

## Neighborhood (location can drive price a lot) is categorical, so each neighborhood is first banded by its average sale price and the band is then one-hot encoded
def classify_neighborhood(neigh, data=None, high_threshold=225000, mid_threshold=150000):
    """Band a neighborhood as 'high', 'mid' or 'low' by its mean SalePrice.

    Parameters
    ----------
    neigh
        Value compared against the 'Neighborhood' column.
    data : DataFrame, optional
        Frame with 'Neighborhood' and 'SalePrice' columns. Defaults to the
        notebook-level ``df`` so existing one-argument calls keep working.
    high_threshold : number, optional
        A mean price strictly above this value is banded 'high'.
    mid_threshold : number, optional
        A mean price at or above this value (and not 'high') is banded 'mid'.

    Returns
    -------
    str
        'high', 'mid' or 'low'. A neighborhood with no matching rows has a
        NaN mean, fails both comparisons and is therefore banded 'low'.
    """
    frame = df if data is None else data
    # Select rows and column in a single .loc call instead of chained indexing.
    avg_price = frame.loc[frame['Neighborhood'] == neigh, 'SalePrice'].mean()
    if avg_price > high_threshold:
        return 'high'
    if avg_price >= mid_threshold:
        return 'mid'
    return 'low'
## classify_neighborhood filters the whole frame on every call, so classify each
## distinct neighborhood once and map the result onto the rows instead of
## calling it once per row.
band_by_neighborhood = {
    neigh: classify_neighborhood(neigh) for neigh in df['Neighborhood'].unique()
}
df['NeighborhoodBand'] = df['Neighborhood'].map(band_by_neighborhood)
df_encoded['NeighborhoodBand'] = df['NeighborhoodBand']

## NOTE(review): the bands are derived from SalePrice averaged over every row of
## df, and the train/test split happens later in this notebook, so test-set
## prices influence this feature - confirm whether the bands should be fitted
## on the training rows only.

## One-hot encode the band into NeighborhoodBand_* indicator columns
df_encoded = pd.get_dummies(df_encoded, columns=['NeighborhoodBand'])

## Flag sales from Q1 2008: the quarterly-average analysis showed an abnormal
## prediction (as high as 11%) for that quarter.
sold_in_2008 = df_encoded['YrSold'].eq(2008)
sold_in_first_quarter = df_encoded['MoSold'].isin([1, 2, 3])
df_encoded['Flag_2008Q1'] = (sold_in_2008 & sold_in_first_quarter).astype(int)

## Flag every sale whose SaleCondition is neither 'Normal' nor 'Partial'
regular_sale = df_encoded['SaleCondition'].isin(['Normal', 'Partial'])
df_encoded['Flag_AbnormalSale'] = (~regular_sale).astype(int)

## A few other categorical fields are added by mapping each input value to an ordinal rating

import numpy as np
# Ordinal score lookups for the basement columns. In every map both the
# literal string 'NA' and a real NaN score 0.
bsmt_qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0, np.nan: 0}
bsmt_cond_map = dict(bsmt_qual_map)  # BsmtCond uses the same scale as BsmtQual
bsmt_exposure_map = {'Gd': 5, 'Av': 3, 'Mn': 1, 'No': 0, 'NA': 0, np.nan: 0}
bsmt_fin_type1_map = {
    'GLQ': 5, 'ALQ': 4, 'Rec': 3, 'BLQ': 2, 'LwQ': 1, 'Unf': 0, 'NA': 0, np.nan: 0,
}
bsmt_fin_type2_map = dict(bsmt_fin_type1_map)  # same scale as BsmtFinType1

# (source column, score column, lookup) - applied in this order so the new
# columns are appended to df_encoded in the same sequence as before.
basement_score_specs = (
    ('BsmtQual', 'BsmtQualScore', bsmt_qual_map),
    ('BsmtCond', 'BsmtCondScore', bsmt_cond_map),
    ('BsmtExposure', 'BsmtExposureScore', bsmt_exposure_map),
    ('BsmtFinType1', 'BsmtFinType1Score', bsmt_fin_type1_map),
    ('BsmtFinType2', 'BsmtFinType2Score', bsmt_fin_type2_map),
)
for source_col, score_col, score_map in basement_score_specs:
    df_encoded[score_col] = df_encoded[source_col].map(score_map)

# Heating quality score; a NaN scores 0.
# NOTE(review): 'Po' scores -1 here, whereas the other quality maps in this
# notebook give 'Po' a 1 - confirm this is intentional.
heating_qc_map = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': -1, np.nan: 0}
df_encoded['HeatingQCScore'] = df_encoded['HeatingQC'].map(heating_qc_map)

# Central air as a 0/1 indicator; a NaN counts as 0.
central_air_map = {'Y': 1, 'N': 0, np.nan: 0}
df_encoded['CentralAirScore'] = df_encoded['CentralAir'].map(central_air_map)

# Kitchen quality score; a NaN scores 0.
kitchen_qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, np.nan: 0}
df_encoded['KitchenQualScore'] = df_encoded['KitchenQual'].map(kitchen_qual_map)

# Label -> score lookup for a ten-step quality/condition scale. It is no longer
# applied below; the name stays defined in case other cells refer to it.
quality_cond_map = {
    'Very Poor': 1,
    'Poor': 2,
    'Fair': 3,
    'Below Average': 4,
    'Average': 5,
    'Above Average': 6,
    'Good': 7,
    'Very Good': 8,
    'Excellent': 9,
    'Very Excellent': 10,
    np.nan: 0
}

# OverallQual / OverallCond are copied as-is. Mapping them through
# quality_cond_map first was a dead store: the mapped columns were overwritten
# by these two assignments before anything read them.
# NOTE(review): presumably both columns already hold numeric codes (they are
# summed into CompositeQualityScore further down) - confirm against the data.
df_encoded['OverallQualNum'] = df_encoded['OverallQual']
df_encoded['OverallCondNum'] = df_encoded['OverallCond']


# Home functionality score: 'Typ' is the top score (7), 'Sal' and NaN score 0.
functional_map = {
    'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4,
    'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0,
    np.nan: 0,
}
df_encoded['FunctionalScore'] = df_encoded['Functional'].map(functional_map)

# Composite quality = overall quality + overall condition + functionality.
# .add() behaves like "+", so a missing component still yields NaN.
overall_qual_plus_cond = df_encoded['OverallQualNum'].add(df_encoded['OverallCondNum'])
df_encoded['CompositeQualityScore'] = overall_qual_plus_cond.add(df_encoded['FunctionalScore'])

## Changes relative to the original feature list:
## (1) GarageArea was removed and GarageCars added
## (2) OverallQual was removed and CompositeQualityScore added instead
## (3) Additional fields were added - KitchenQual, the basement-related scores, central air, etc.

size_and_age_features = ['TotalBsmtSF', 'TotRmsAbvGrd', 'GarageCars', 'HouseAge']
foundation_features = [
    'Foundation_BrkTil',
    'Foundation_CBlock',
    'Foundation_PConc',
    'Foundation_Slab',
    'Foundation_Stone',
    'Foundation_Wood',
]
neighborhood_features = [
    'NeighborhoodBand_high',
    'NeighborhoodBand_mid',
    'NeighborhoodBand_low',
]
quality_features = [
    'GrLivArea',
    'CompositeQualityScore',
    'KitchenQualScore',
    'CentralAirScore',
    'HeatingQCScore',
]
basement_features = [
    'BsmtFinType2Score',
    'BsmtFinType1Score',
    'BsmtExposureScore',
    'BsmtCondScore',
    'BsmtQualScore',
]
flag_features = ['Flag_AbnormalSale']

# Concatenated in this order so the model's coefficient order is unchanged.
all_features_revised = [
    *size_and_age_features,
    *foundation_features,
    *neighborhood_features,
    *quality_features,
    *basement_features,
    *flag_features,
]

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Feature matrix (X) and target (y)
X = df_encoded[all_features_revised]
y = df_encoded['SalePrice']

# Fail early and name the offending columns if any feature has missing values,
# e.g. when a category is absent from one of the score maps and .map() left NaN.
columns_with_nan = X.columns[X.isna().any()].tolist()
if columns_with_nan:
    raise ValueError(f"Features contain missing values: {columns_with_nan}")

## Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

print("Model trained.")
print("Intercept:", model.intercept_)  # printed once; the duplicate print was dropped
print("Coefficients:", model.coef_)

# Pair each feature name with its fitted coefficient
for name, coef in zip(X_train.columns, model.coef_):
    print(f"{name}: {coef}")

# Fit quality on the rows the model was trained on
y_train_pred = model.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
print("Training MSE:", train_mse)
print("Training R²:", train_r2)

# Fit quality on the held-out rows
y_test_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print("Test MSE:", test_mse)
print("Test R²:", test_r2)



Saving train.csv to train.csv
Model trained.
Intercept: -153627.34659223518
Coefficients: [ 2.08424452e+01  2.26516830e+03  1.08539500e+04  1.83156672e+00
 -5.71661989e+03  2.59583665e+03  2.25741374e+03  3.50257310e+04
 -6.83097605e+03 -2.73313854e+04  2.54438373e+04 -8.50962267e+03
 -1.69342146e+04  4.87701107e+01  6.84589611e+03  1.25130849e+04
  5.18186475e+03  1.18293111e+03  6.55656653e+02  2.63148308e+03
  4.42511200e+03 -7.10025860e+03  1.11699549e+04 -5.84419502e+03]
Intercept: -153627.34659223518
TotalBsmtSF: 20.842445200566164
TotRmsAbvGrd: 2265.1682952908986
GarageCars: 10853.949957718047
HouseAge: 1.831566723551864
Foundation_BrkTil: -5716.619885968546
Foundation_CBlock: 2595.8366488845572
Foundation_PConc: 2257.413741173231
Foundation_Slab: 35025.73097521299
Foundation_Stone: -6830.976047496168
Foundation_Wood: -27331.38543180589
NeighborhoodBand_high: 25443.837292320484
NeighborhoodBand_mid: -8509.622667809315
NeighborhoodBand_low: -16934.214624511176
GrLivArea: 48.77011

NameError: name 'train_df' is not defined