In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Data preview: display the first rows of the training frame.
train.head()

In [None]:
# Dataset overview: print the frame summary (columns, non-null counts, dtypes).
train.info()

In [None]:
# Statistical summary: descriptive statistics of the training frame.
train.describe()

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
train.shape

In [None]:
# Column labels of the training frame.
train.columns

In [None]:
# Target and features: X is every column except the target, y is the target.
X = train.drop(columns="SalePrice")
y = train["SalePrice"]

In [None]:
# Missing-value handling, step 1: group the feature columns by dtype so that
# numeric and object (string) columns can each get their own fill strategy.
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(include=[np.number]).columns

In [None]:
# Missing-value handling, step 2 (training features):
# numeric gaps get the column mean, categorical gaps get the column's
# most frequent value (first row of mode()).
numeric_fill = X[num_cols].mean()
X[num_cols] = X[num_cols].fillna(numeric_fill)

categorical_fill = X[cat_cols].mode().iloc[0]
X[cat_cols] = X[cat_cols].fillna(categorical_fill)

In [None]:
# Missing-value handling (test features).
# Fill the test set with statistics learned from the TRAINING features (X),
# not from the test set itself. Using the test set's own mean/mode would give
# the two frames different fill values and would make the preprocessing depend
# on the data being predicted. Filling X with its own mean/mode beforehand does
# not change those statistics, so these are the same values used for training.
# A test column that is entirely missing also gets a sensible value this way.
test[num_cols] = test[num_cols].fillna(X[num_cols].mean())
test[cat_cols] = test[cat_cols].fillna(X[cat_cols].mode().iloc[0])


In [None]:
# Categorical encoding: one-hot encode both frames (dropping the first level
# of each category), then force the test frame onto the training columns.
X, test = (pd.get_dummies(frame, drop_first=True) for frame in (X, test))

# join="left" keeps exactly X's columns: test-only dummies are discarded and
# dummies missing from test appear as NaN columns, which are zeroed below.
X, test = X.align(test, join="left", axis=1)
test = test.fillna(0)

In [None]:
# SalePrice distribution: 50-bin histogram of the target.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.hist(y, bins=50)
ax.set_title("SalePrice Distribution")
plt.show()


In [None]:
# Outlier detection (visual only): scatter of living area against sale price.
# This cell plots the points; it does not remove any rows.
fig, ax = plt.subplots()
ax.scatter(train["GrLivArea"], train["SalePrice"])
ax.set_xlabel("Living Area")
ax.set_ylabel("Sale Price")
plt.show()


In [None]:
# Correlation analysis: rank the numeric columns by their correlation with
# SalePrice and show the 15 strongest (SalePrice itself is included in the list).
corr = train.corr(numeric_only=True)
saleprice_corr = corr["SalePrice"]
top_corr = saleprice_corr.sort_values(ascending=False).iloc[:15]
print(top_corr)


In [None]:
# Feature engineering: combined area and bathroom count (a half bath counts 0.5).
# NOTE(review): these columns are added to `train` only. X was built from
# `train` in an earlier cell, so as written they are not part of X.
train["TotalArea"] = train["GrLivArea"].add(train["TotalBsmtSF"])
train["TotalBath"] = train["FullBath"].add(train["HalfBath"].mul(0.5))


In [None]:
# Feature scaling: standardize every column of X.
# NOTE(review): the split cell that follows passes X, not X_scaled, so the
# scaled matrix is not consumed by the cells visible in this notebook.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [None]:
# Train/validation split: hold out 20% of the rows, with a fixed seed so the
# split is reproducible.
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [None]:
# Fit an ordinary least-squares model on the training split.
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator
model

In [None]:
# Validation: predict on the held-out split and report the root mean squared error.
from sklearn.metrics import mean_squared_error

pred_val = model.predict(X_val)
val_mse = mean_squared_error(y_val, pred_val)
rmse = np.sqrt(val_mse)
print("RMSE:", rmse)

In [None]:
# Test data prediction: run the fitted model on the test features, which were
# aligned to X's columns in the encoding cell.
test_preds = model.predict(test)

In [None]:
# Submission file: take the sample submission as a template, overwrite its
# SalePrice column with the predictions, and write it out without the index.
# NOTE(review): assumes the template rows are in the same order as `test` — confirm.
sub = pd.read_csv("sample_submission.csv").assign(SalePrice=test_preds)
sub.to_csv("final_submission.csv", index=False)