In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("C:\\Users\\Utkarsh\\Downloads\\properties_cleaned.csv", low_memory=False)
print("Phase 2 start shape:", df.shape)

Phase 2 start shape: (11338, 150)


In [2]:
high_missing = df.isna().mean()
drop_cols = high_missing[high_missing > 0.5].index.tolist()

# Keep key features even if sparse
exceptions = ["balconies"]
drop_cols = [c for c in drop_cols if c not in exceptions]

df.drop(columns=drop_cols, inplace=True)
print("After dropping >50% missing columns:", df.shape)

After dropping >50% missing columns: (11338, 145)


In [3]:
# Create single numeric parking flag
if "Parking" in df.columns:
    df["has_parking"] = df["Parking"].astype(str).str.contains("Covered|Open|1", case=False).astype(int)
    df.drop(columns=["Parking"], inplace=True)

In [4]:
location_noise = [
    "Location",
    "Landmark",
    "Area Name",
    "City",
    "Project Name",
    "Society"
]

df.drop(columns=[c for c in location_noise if c in df.columns], inplace=True)

In [5]:
# Keep only aggregated/important features
binary_cols = [c for c in df.columns if df[c].dropna().isin([0,1]).all()]

keep_binary = ["amenity_count", "luxury_score", "Luxury Flat_Y", "isPrimeLocationProperty_Y"]

binary_drop = [c for c in binary_cols if c not in keep_binary]
df.drop(columns=binary_drop, inplace=True)

In [6]:
final_features = [
    # Target
    "Price",

    # Area & configuration
    "usable_area_sqft",
    "bedroom",
    "Bathroom",
    "bathroom_ratio",
    "balconies",

    # Floor & building
    "Floor No",
    "floors",
    "floor_ratio",

    # Location intelligence
    "area_price_mean",
    "city_price_mean",

    # Time
    "years_to_possession",

    # Amenities
    "amenity_count",
    "luxury_score",

    # Property characteristics
    "has_parking",
    "Facing",
    "Ownership Type_Freehold",
    "furnished Type_Unfurnished",
    "Transaction Type_New Property",

    # Quality signals
    "Maintenance Charges",
    "Approved Authority Name",

    # Flags
    "Luxury Flat_Y",
    "isPrimeLocationProperty_Y"
]

# Keep only columns that exist in dataframe
final_features = [c for c in final_features if c in df.columns]

df_final = df[final_features].copy()
print("Phase 2 final shape (rows, features):", df_final.shape)

Phase 2 final shape (rows, features): (11338, 16)


In [7]:
for col in df_final.columns:
    if df_final[col].dtype in ["float64", "int64"]:
        df_final[col] = df_final[col].fillna(df_final[col].median())
    else:
        df_final[col] = df_final[col].fillna(df_final[col].mode()[0])


In [8]:
X = df_final.drop("Price", axis=1)
y = np.log1p(df_final["Price"])  # Log-transform target

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (11338, 15)
y shape: (11338,)


In [9]:
df_final.to_csv("properties_phase2_final.csv", index=False)

print(df_final.shape)

(11338, 16)
