<a href="https://colab.research.google.com/github/NishWasHere/Celebal-Assignment/blob/main/Week5_House_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Save and drop IDs
train_ID = train['Id']
test_ID = test['Id']
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

# Save target and drop from train
y = train["SalePrice"]
train.drop("SalePrice", axis=1, inplace=True)

# Combine train and test for uniform preprocessing
all_data = pd.concat([train, test], axis=0, ignore_index=True)

# Handle missing values
for col in ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
            'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
            'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']:
    all_data[col] = all_data[col].fillna("None")

for col in ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
            'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea',
            'LotFrontage']:
    all_data[col] = all_data[col].fillna(all_data[col].median())

# Convert to string (categorical)
for col in ['MSSubClass', 'YrSold', 'MoSold']:
    all_data[col] = all_data[col].astype(str)

# Label Encoding for ordinal variables
label_cols = ['ExterQual', 'BsmtQual', 'KitchenQual', 'HeatingQC', 'FireplaceQu',
              'GarageQual', 'GarageCond', 'PoolQC', 'BsmtCond', 'BsmtExposure',
              'BsmtFinType1', 'BsmtFinType2', 'Functional', 'Fence']

for col in label_cols:
    lbl = LabelEncoder()
    all_data[col] = lbl.fit_transform(all_data[col].astype(str))

# Add new features
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalBath'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] +
                         all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])

# One-hot encoding
all_data = pd.get_dummies(all_data)

# Separate back
X = all_data[:len(y)]
X_test = all_data[len(y):]

# Log transform the target
y_log = np.log1p(y)

# Split for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y_log, test_size=0.2, random_state=42)

# Train Ridge Regression model
model = Ridge(alpha=10)
model.fit(X_train, y_train)

# Validate
y_pred = model.predict(X_valid)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

# Predict on test and inverse log transform
final_preds = model.predict(X_test)
final_preds = np.expm1(final_preds)

# Create submission file
submission = pd.DataFrame({
    "Id": test_ID,
    "SalePrice": final_preds
})
submission.to_csv("submission.csv", index=False)
print("✅ Submission file created: submission.csv")


Validation RMSE: 0.1396
✅ Submission file created: submission.csv
