In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
# NOTE: the path is an absolute location on one particular Windows machine.
data = pd.read_csv(r"C:\Users\SWAROOP\Downloads\area_price_prediction.csv")

# Step 1: Data Validation
# Stop immediately if any cell anywhere in the frame is null/NaN.
has_missing = data.isnull().any().any()
if has_missing:
    raise ValueError("Dataset contains missing values. Please handle them before proceeding.")

# Columns in which a value below zero is rejected. Columns are checked in
# list order, so the error names the first offending column.
numerical_cols = [
    'Price',
    'LivingArea',
    'LotSize',
    'Bedrooms',
    'Bathrooms',
    'GarageSize',
    'DistanceToCityCenter',
]
for col in numerical_cols:
    if data[col].lt(0).any():
        raise ValueError(f"Column {col} contains negative values, which is invalid.")

# Every PropertyType entry must be one of the labels listed here.
valid_types = ['SingleFamily', 'Condo', 'Townhouse']
unrecognised = ~data['PropertyType'].isin(valid_types)
if unrecognised.any():
    raise ValueError("Invalid PropertyType categories found.")

# Step 2: Preprocessing
# 'Price' is the regression target; every other column stays in the feature frame.
y = data['Price']
X = data.drop(columns=['Price'])

# Feature groups, named by the transformation each one receives.
categorical_cols = ['PropertyType']
numerical_cols = [
    'LivingArea',
    'Bedrooms',
    'Bathrooms',
    'LotSize',
    'YearBuilt',
    'NeighborhoodQuality',
    'DistanceToCityCenter',
    'GarageSize',
]

# Standardise the numeric features and one-hot encode the property type.
# drop='first' leaves out the indicator column of the first category.
# Columns of X that appear in neither list are not passed to the model
# (ColumnTransformer's default is to drop the remainder).
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', sparse_output=False),
    categorical_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Step 3: Hold out 20% of the rows for testing; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Chain preprocessing and ordinary least squares into one estimator.
# The preprocessing sits inside the pipeline, so it is fitted on the
# training rows only.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)
pipeline.fit(X_train, y_train)

# Step 5: Make predictions and evaluate
# Score the fitted pipeline on the held-out rows only.
y_pred = pipeline.predict(X_test)

# Calculate standard metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the units of Price
r2 = r2_score(y_test, y_pred)

# Calculate custom accuracy: predictions within 10% of actual price
tolerance = 0.1  # 10% tolerance
# Compare |error| against tolerance * |actual| instead of dividing by the
# actual price. The validation step only rejects negative prices, so a price
# of exactly 0 can reach this point; dividing by it gives inf/NaN and would
# count even a perfect prediction as a miss. The abs() on the actual value
# also stops a negative actual from trivially satisfying the inequality.
within_tolerance = np.abs(y_pred - y_test) <= tolerance * np.abs(y_test)
accuracy = np.mean(within_tolerance) * 100  # Convert to percentage

# Print results
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")
print(f"Accuracy (within 10% of actual price): {accuracy:.2f}%")

# Optional: Check feature correlations to detect multicollinearity
# Pairwise correlation (pandas' default method) over the full data set,
# restricted to the columns currently listed in numerical_cols.
numeric_view = data.loc[:, numerical_cols]
correlation_matrix = numeric_view.corr()
print("\nCorrelation Matrix for Numerical Features:", correlation_matrix, sep="\n")

Mean Squared Error: 92716252.12
Root Mean Squared Error: 9628.93
R-squared: 1.00
Accuracy (within 10% of actual price): 97.25%

Correlation Matrix for Numerical Features:
                      LivingArea  Bedrooms  Bathrooms   LotSize  YearBuilt  \
LivingArea              1.000000  0.978082   0.971032  0.987356   0.946157   
Bedrooms                0.978082  1.000000   0.954579  0.964736   0.942861   
Bathrooms               0.971032  0.954579   1.000000  0.964196   0.934275   
LotSize                 0.987356  0.964736   0.964196  1.000000   0.978655   
YearBuilt               0.946157  0.942861   0.934275  0.978655   1.000000   
NeighborhoodQuality     0.950492  0.940685   0.941761  0.978387   0.991399   
DistanceToCityCenter   -0.959174 -0.953496  -0.943700 -0.984723  -0.995030   
GarageSize              0.840781  0.849502   0.821263  0.889019   0.939878   

                      NeighborhoodQuality  DistanceToCityCenter  GarageSize  
LivingArea                       0.950492       