In [27]:
# =========================================================
#                 GURGAON HOUSE PRICE PREDICTION
# =========================================================

# **Step 1: Import Libraries**
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
# NOTE: called without a category or module filter, this silences *every*
# warning raised anywhere in the process for the rest of the run (pandas and
# scikit-learn warnings included), not only the harmless ones.
warnings.filterwarnings("ignore")  # blanket suppression of all warnings

# **Step 2: Load Dataset**
# Read the listings CSV into a DataFrame, then report its size and columns.
file_path = "data of gurugram real Estate 2024.csv"  # change if needed
data = pd.read_csv(file_path)

dataset_shape = data.shape
column_names = list(data.columns)

print("=== Dataset Loaded Successfully ===")
print("Shape of dataset:", dataset_shape)
# One call, newline-separated: heading first, then the column list.
print("\nColumns in dataset:", column_names, sep="\n")

# **Step 3: Preview Data**
# Show the first rows of the dataset. When IPython is available its rich
# `display` is used; otherwise the preview falls back to plain-text `print`.
print("\n=== Dataset Preview ===")
try:
    # Only the import is guarded: a missing IPython is the one expected
    # failure. A bare `except:` would also swallow KeyboardInterrupt,
    # SystemExit and genuine bugs raised while rendering the preview.
    from IPython.display import display
except ImportError:
    print(data.head())
else:
    display(data.head())

# **Step 4: Handle Missing Values**
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how="any")
print(f"\nMissing values removed. Current shape: {data.shape}")

# **Step 5: Clean and Convert Numeric Columns**
# Numeric fields may arrive as text (e.g. with thousands separators or a
# rupee sign). For each object-dtype column: strip commas, the rupee sign and
# surrounding whitespace, then try to parse the whole column as numbers.
# The column is replaced ONLY when parsing succeeds, so genuine text columns
# keep their original content (commas inside names or addresses are not
# destroyed by the numeric clean-up).
for col in data.select_dtypes(include="object").columns:
    cleaned = (
        data[col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('₹', '', regex=False)
        .str.strip()
    )
    try:
        converted = pd.to_numeric(cleaned)
    except (ValueError, TypeError):
        # At least one value is not a number: this is a real text column.
        # Skipping it is expected, so only the parse errors are caught here;
        # anything else is a bug and should surface.
        continue
    data[col] = converted

print("\n=== Data Types After Cleaning ===")
print(data.dtypes)

# **Step 6: Detect Numeric Columns**
# Gather the names of all numeric-dtype columns; abort early when none exist
# because nothing could be used as a model input.
numeric_frame = data.select_dtypes(include="number")
numeric_cols = list(numeric_frame.columns)
if len(numeric_cols) == 0:
    raise ValueError("No numeric columns found. Please verify your dataset.")
print("\nNumeric columns detected:", numeric_cols)

# **Step 7: Select Feature (X) and Target (y) Columns**
# The target is auto-detected as a column whose name contains "price"; all
# remaining numeric columns become features. If no such column exists, the
# manual fallback below is used and must be edited to match the dataset.
print("\nAvailable Columns:")
print(list(data.columns))

# Numeric candidates are tried first so that a text column that merely
# mentions "price" cannot shadow the real numeric price column. `sorted` is
# stable, so the original column order is kept within each group.
possible_targets = sorted(
    (col for col in data.columns if 'price' in str(col).lower()),
    key=lambda col: col not in numeric_cols,
)

if possible_targets:
    y_col = possible_targets[0]
    X_cols = [col for col in numeric_cols if col != y_col]
    print(f"\nAuto-selected target column: {y_col}")
else:
    print("\nNo 'price'-like column found automatically.")
    print("Please specify manually below:")
    X_cols = ['Area', 'BHK', 'Bathroom']  # modify according to your dataset
    y_col = 'Price'

# Fail here with a readable message instead of a bare KeyError (or an opaque
# scikit-learn error) several steps later.
missing_cols = [col for col in [*X_cols, y_col] if col not in data.columns]
if missing_cols:
    raise ValueError(
        f"Selected columns not found in dataset: {missing_cols}. "
        f"Available columns: {list(data.columns)}"
    )
if not X_cols:
    raise ValueError(
        "No numeric feature columns remain after excluding the target column."
    )

# NOTE(review): every numeric column other than the target is used as a
# feature. If one of them is derived from the target (for example a per-area
# rate computed from the price), the evaluation scores will be optimistic --
# verify against the dataset before trusting the metrics.
print(f"\nUsing '{y_col}' as target variable.")
print(f"Using {X_cols} as feature columns.")

# **Step 8: Convert Target Column to Numeric**
# Force the target to numbers (unparseable entries become NaN), keep only the
# rows where a target value survived, then separate features from target.
data[y_col] = pd.to_numeric(data[y_col], errors='coerce')
has_target = data[y_col].notna()
data = data[has_target]
X, y = data[X_cols], data[y_col]

# **Step 9: Split Dataset**
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("\n=== Data Split Completed ===")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# **Step 10: Train Linear Regression Model**
# Fit a linear regression on the training split; `fit` returns the fitted
# estimator, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)
print("\nModel trained successfully.")

# **Step 11: Evaluate Model**
# Predict on the held-out rows and score the predictions with R², MAE and MSE.
y_pred = model.predict(X_test)
r2, mae, mse = (
    metric(y_test, y_pred)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

print(
    "\n=== Model Evaluation Metrics ===",
    f"R² Score: {r2:.4f}",
    f"Mean Absolute Error: {mae:.2f}",
    f"Mean Squared Error: {mse:.2f}",
    sep="\n",
)

# **Step 12: Save the Trained Model**
# Serialize the fitted estimator to disk using pickle's default protocol.
pickle_filename = "gurgaon_property_predictor.pkl"
with open(pickle_filename, mode="wb") as file:
    pickle.dump(model, file, protocol=pickle.DEFAULT_PROTOCOL)
print(f"\nModel saved as '{pickle_filename}'")

# **Step 13: Load Model Back**
# Round-trip check: read the file written in the previous step and rebuild
# the estimator from its bytes.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# you created yourself or otherwise trust.
with open(pickle_filename, mode="rb") as file:
    loaded_model = pickle.loads(file.read())
print("Model loaded successfully.")

# **Step 14: Test Example Prediction**
# Build a one-row input from the per-column means of X and run it through the
# reloaded model as a smoke test.
print("\nFeature columns:", X_cols)
feature_means = X.mean().to_dict()
sample_input = pd.DataFrame([feature_means])  # same columns as training data
example_predictions = loaded_model.predict(sample_input)
predicted_price = example_predictions[0]

print("\nPredicted House Price (Example Input):", predicted_price)

print("\n=== Project Completed Successfully ===")


=== Dataset Loaded Successfully ===
Shape of dataset: (19515, 12)

Columns in dataset:
['Price', 'Status', 'Area', 'Rate per sqft', 'Property Type', 'Locality', 'Builder Name', 'RERA Approval', 'BHK_Count', 'Socity', 'Company Name', 'Flat Type']

=== Dataset Preview ===


Unnamed: 0,Price,Status,Area,Rate per sqft,Property Type,Locality,Builder Name,RERA Approval,BHK_Count,Socity,Company Name,Flat Type
0,10700000.0,Under Construction,1138,9450,2 BHK Apartment in M3M Antalya Hills Phase I,Sector 79,home,Approved by RERA,2.0,M3M Antalya Hills Phase I,M3M,Apartment
1,14400000.0,Under Construction,1528,9450,3 BHK Apartment in M3M Antalya Hills Phase I,Sector 79,Property In Gurgaon,Approved by RERA,3.0,M3M Antalya Hills Phase I,M3M,Apartment
2,10700000.0,Under Construction,1138,9450,2 BHK Apartment in M3M Antalya Hills Phase I,Sector 79,properties for sale in Gurgaon,Approved by RERA,2.0,M3M Antalya Hills Phase I,M3M,Apartment
3,40000000.0,Ready to move,4500,8888,4 BHK Independent Floor,Sector 57,MM India Pvt Ltd,Not approved by RERA,4.0,Outside Socity,Outside,Plot
4,24000000.0,Under Construction,1800,13333,3 BHK Independent Floor in Anant Raj Estate Plots,Sector 63,MM India Pvt Ltd,Approved by RERA,3.0,Anant Raj Estate Plots,Anant,Floor



Missing values removed. Current shape: (19515, 12)

=== Data Types After Cleaning ===
Price            float64
Status            object
Area               int64
Rate per sqft      int64
Property Type     object
Locality          object
Builder Name      object
RERA Approval     object
BHK_Count        float64
Socity            object
Company Name      object
Flat Type         object
dtype: object

Numeric columns detected: ['Price', 'Area', 'Rate per sqft', 'BHK_Count']

Available Columns:
['Price', 'Status', 'Area', 'Rate per sqft', 'Property Type', 'Locality', 'Builder Name', 'RERA Approval', 'BHK_Count', 'Socity', 'Company Name', 'Flat Type']

Auto-selected target column: Price

Using 'Price' as target variable.
Using ['Area', 'Rate per sqft', 'BHK_Count'] as feature columns.

=== Data Split Completed ===
Training samples: 15612
Testing samples: 3903

Model trained successfully.

=== Model Evaluation Metrics ===
R² Score: 0.4753
Mean Absolute Error: 18034272.38
Mean Squared Error: 