# House Price Predictor

In [10]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from torch.utils.data import TensorDataset, DataLoader

In [11]:
# -----------------------------
# 1. Load the dataset
# -----------------------------
# Replace 'house_prices.csv' with the path to your dataset.
data = pd.read_csv('train.csv')
print(data)

       Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0       1          60       RL         65.0     8450   Pave   NaN      Reg   
1       2          20       RL         80.0     9600   Pave   NaN      Reg   
2       3          60       RL         68.0    11250   Pave   NaN      IR1   
3       4          70       RL         60.0     9550   Pave   NaN      IR1   
4       5          60       RL         84.0    14260   Pave   NaN      IR1   
..    ...         ...      ...          ...      ...    ...   ...      ...   
995   996          50       RL         51.0     4712   Pave   NaN      IR1   
996   997          20       RL          NaN    10659   Pave   NaN      IR1   
997   998          20       RL          NaN    11717   Pave   NaN      IR1   
998   999          30       RM         60.0     9786   Pave   NaN      Reg   
999  1000          20       RL         64.0     6762   Pave   NaN      Reg   

    LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeatu

In [12]:
# -----------------------------
# 2. Data Cleaning
# -----------------------------

target = 'SalePrice'

data = pd.get_dummies(data, drop_first=True)

# Convert any boolean columns to integer (0 or 1)
data = data.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)

# Select only numeric columns that have no missing data.
#numeric_cols = data.select_dtypes(include=[np.number]).columns
clean_numeric_cols = [col for col in data if data[col].isna().sum() == 0]
data_clean = data[clean_numeric_cols]


# Ensure that the target column 'price' is present.
if 'SalePrice' not in data_clean.columns:
    raise ValueError("The target column 'price' is not present in the complete numeric data.")


print(data_clean)


       Id  MSSubClass  LotArea  OverallQual  OverallCond  YearBuilt  \
0       1          60     8450            7            5       2003   
1       2          20     9600            6            8       1976   
2       3          60    11250            7            5       2001   
3       4          70     9550            7            5       1915   
4       5          60    14260            8            5       2000   
..    ...         ...      ...          ...          ...        ...   
995   996          50     4712            4            7       1946   
996   997          20    10659            5            6       1961   
997   998          20    11717            6            6       1970   
998   999          30     9786            3            4       1922   
999  1000          20     6762            7            5       2006   

     YearRemodAdd  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF  ...  SaleType_ConLI  \
0            2003         706           0        150  ...             

In [13]:
# -----------------------------
# 3. Feature Selection
# -----------------------------
# Compute the correlation matrix using only the cleaned numeric data.
corr_matrix = data_clean.corr()

# Compute absolute correlations of features with the target and drop the target itself.
target_corr = corr_matrix['SalePrice'].drop('SalePrice').abs().sort_values(ascending=False)

# Print all features with their correlation values
print("All features sorted by correlation with SalePrice:")
print(target_corr)
target_corr.to_csv('target_corr.csv')

# Select only the top 4 features with the highest correlation with 'SalesPrice'
top15_features = target_corr.head(20).index
print("Selected top 15 features:", list(top15_features))

train_columns = data_clean.drop(columns=[target]).columns

# Define input features (X) and target variable (y).
X = data_clean[top15_features].values
y = data_clean['SalePrice'].values.reshape(-1, 1)


All features sorted by correlation with SalePrice:
OverallQual        0.797666
GrLivArea          0.734997
GarageCars         0.658204
GarageArea         0.647953
TotalBsmtSF        0.642127
                     ...   
Condition1_RRNe    0.004722
Heating_GasW       0.000977
RoofMatl_Metal     0.000901
PoolQC_Fa          0.000506
BsmtFinSF2         0.000359
Name: SalePrice, Length: 228, dtype: float64
Selected top 15 features: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'ExterQual_TA', 'TotRmsAbvGrd', 'FullBath', 'KitchenQual_TA', 'YearBuilt', 'YearRemodAdd', 'Foundation_PConc', 'Fireplaces', 'BsmtQual_TA', 'Neighborhood_NridgHt', 'ExterQual_Gd', 'BsmtFinType1_GLQ', 'GarageFinish_Unf', 'BsmtFinSF1']


In [14]:
# -----------------------------
# 4. Data Preprocessing
# -----------------------------
# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features to improve training stability.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# Convert the numpy arrays to PyTorch tensors.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create a TensorDataset and DataLoader for batch processing.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [16]:
#-----------------------------
# 5. Define the Neural Network Model
# -----------------------------
class HousePriceModel(nn.Module):
    def __init__(self, input_dim):
        super(HousePriceModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)  # Output layer for regression
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HousePriceModel(input_dim=X_train.shape[1]).to(device)



In [17]:
# -----------------------------
# 6. Set Up Loss Function and Optimizer
# -----------------------------
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# -----------------------------
# 7. Train the Model
# -----------------------------
num_epochs = 1000
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * batch_X.size(0)
    
    epoch_loss = running_loss / len(train_dataset)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
        

Epoch [10/1000], Loss: 35640258396.1600
Epoch [20/1000], Loss: 13182327050.2400
Epoch [30/1000], Loss: 6063930979.8400
Epoch [40/1000], Loss: 4980816424.9600
Epoch [50/1000], Loss: 4209443287.0400
Epoch [60/1000], Loss: 3588340894.7200
Epoch [70/1000], Loss: 3113222735.3600
Epoch [80/1000], Loss: 2774982913.2800
Epoch [90/1000], Loss: 2526361007.3600
Epoch [100/1000], Loss: 2344245689.6000
Epoch [110/1000], Loss: 2193129515.5200
Epoch [120/1000], Loss: 2064111947.5200
Epoch [130/1000], Loss: 1954554814.7200
Epoch [140/1000], Loss: 1857810864.6400
Epoch [150/1000], Loss: 1769540798.7200
Epoch [160/1000], Loss: 1682837222.4000
Epoch [170/1000], Loss: 1606826398.0800
Epoch [180/1000], Loss: 1536684347.8400
Epoch [190/1000], Loss: 1475659985.9200
Epoch [200/1000], Loss: 1413742528.0000
Epoch [210/1000], Loss: 1359493647.3600
Epoch [220/1000], Loss: 1309110744.3200
Epoch [230/1000], Loss: 1271732599.0400
Epoch [240/1000], Loss: 1224923363.2000
Epoch [250/1000], Loss: 1187760486.4000
Epoch [

In [18]:
# 8. Evaluate the Model
# -----------------------------
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor.to(device))
    test_loss = criterion(predictions, y_test_tensor.to(device)).item()
    print("Test Mean Squared Error:", test_loss)

    # Convert predictions to NumPy array and print
    predictions_np = predictions.cpu().numpy()
    print("Predictions (NumPy):", predictions_np)

# Optionally, to evaluate using scikit-learn's MSE:
mse = mean_squared_error(y_test, predictions_np)
print("Test MSE (scikit-learn):", mse)
#Test Mean Squared Error: 935741376.0

Test Mean Squared Error: 697186304.0
Predictions (NumPy): [[157803.78 ]
 [251705.22 ]
 [114728.57 ]
 [191357.72 ]
 [138121.34 ]
 [294526.5  ]
 [130679.984]
 [141683.94 ]
 [225542.16 ]
 [137784.5  ]
 [160974.56 ]
 [104454.74 ]
 [ 65842.42 ]
 [190606.75 ]
 [259168.58 ]
 [132805.62 ]
 [213462.4  ]
 [117348.14 ]
 [115642.28 ]
 [207971.25 ]
 [230617.75 ]
 [229648.12 ]
 [125788.95 ]
 [430898.66 ]
 [ 94881.234]
 [118616.266]
 [169684.28 ]
 [131577.97 ]
 [188999.03 ]
 [306831.62 ]
 [132027.42 ]
 [198798.34 ]
 [260635.06 ]
 [114438.67 ]
 [184652.66 ]
 [146466.92 ]
 [157878.58 ]
 [120098.48 ]
 [150393.3  ]
 [119956.61 ]
 [173235.1  ]
 [227721.3  ]
 [119589.15 ]
 [172981.38 ]
 [139843.94 ]
 [100746.484]
 [193384.38 ]
 [121923.88 ]
 [130362.84 ]
 [119441.97 ]
 [151142.44 ]
 [111245.516]
 [319371.25 ]
 [163547.53 ]
 [139055.34 ]
 [ 91849.43 ]
 [111924.875]
 [127766.375]
 [214560.02 ]
 [186551.5  ]
 [121724.06 ]
 [126412.734]
 [225347.28 ]
 [149606.8  ]
 [233284.   ]
 [295916.97 ]
 [178140.16 ]
 [26

In [None]:
# Load the test.csv file (which lacks SalePrice but includes an 'Id' column)
test_df = pd.read_csv('test.csv')
ids = test_df['Id']

# One-hot encode the test data using the same categorical columns as before
#test_encoded = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

data = pd.get_dummies(test_df, drop_first=True)
# Convert any boolean columns to integer (0 or 1)
data = data.apply(lambda col: col.astype(int) if col.dtype == 'bool' else col)

# Reindex to match the training features
data = data.reindex(columns=train_columns, fill_value=0)
data = data[top15_features]  # Select the same top features
data = data.astype(float)
X_test_new = scaler.transform(data)

X_test_tensor_new = torch.tensor(X_test_new, dtype=torch.float32)
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor_new.to(device))
    test_predictions_np = test_predictions.cpu().numpy()

# Inverse transform predictions to get SalePrice in the original scale.
#test_predictions_unscaled = scaler_y.inverse_transform(test_predictions_np)

submission_df = pd.DataFrame({
    'ID': ids.astype(int),
    'SALEPRICE': test_predictions_np.flatten().astype(float)
})


print("\nSubmission Preview:")
print(submission_df.head())

# Optionally, export to CSV:
submission_df.to_csv('predictions.csv', index=False)


Submission Preview:
     ID      SALEPRICE
0  1001   87043.984375
1  1002   81143.265625
2  1003  257956.968750
3  1004  150092.390625
4  1005  204407.328125


