In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')

from src.data_preprocessing import DataPreprocessor


In [2]:
# Read the raw housing-price CSV from the project's data folder.
RAW_DATA_CSV = '../data/raw/india_housing_prices.csv'
df = pd.read_csv(RAW_DATA_CSV)

# Report (rows, columns) and preview the first five records.
print(f"Original Data Shape: {df.shape}")
df.head()


Original Data Shape: (250000, 23)


Unnamed: 0,ID,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,...,Age_of_Property,Nearby_Schools,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Amenities,Facing,Owner_Type,Availability_Status
0,1,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,0.1,1990,...,35,10,3,High,No,No,"Playground, Gym, Garden, Pool, Clubhouse",West,Owner,Ready_to_Move
1,2,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,0.08,2008,...,17,8,1,Low,No,Yes,"Playground, Clubhouse, Pool, Gym, Garden",North,Builder,Under_Construction
2,3,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,0.05,1997,...,28,9,8,Low,Yes,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_to_Move
3,4,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,0.11,1991,...,34,5,7,High,Yes,Yes,"Playground, Clubhouse, Gym, Pool, Garden",North,Builder,Ready_to_Move
4,5,Rajasthan,Jaipur,Locality_466,Villa,4,4823,182.9,0.04,2002,...,23,4,9,Low,No,Yes,"Playground, Garden, Gym, Pool, Clubhouse",East,Builder,Ready_to_Move


In [3]:
# Create the preprocessing helper imported from src.data_preprocessing.
# This one instance is reused by later cells: it runs the pipeline
# (`preprocess`) and writes the encoders to disk (`save_encoders`), so it
# has to stay in scope for the rest of the notebook.
# NOTE(review): constructed with defaults only — check DataPreprocessor's
# signature if any configuration is expected here.
preprocessor = DataPreprocessor()


In [4]:
# Text-valued columns handed to the preprocessor for encoding.
# NOTE(review): 'Public_Transport_Accessibility', 'Parking_Space' and
# 'Amenities' are not listed here and still show up as text in the processed
# preview — confirm that leaving them unencoded is intended.
categorical_cols = [
    # location, coarsest to finest
    'State', 'City', 'Locality',
    # property descriptors
    'Property_Type', 'Furnished_Status', 'Facing',
    # listing descriptors
    'Owner_Type', 'Availability_Status', 'Security',
]

# Numeric columns handed to the preprocessor's outlier handling.
outlier_cols = ['Price_in_Lakhs', 'Size_in_SqFt', 'Floor_No', 'Total_Floors']


In [5]:
# Execute the whole preprocessing pipeline on the raw frame. The two keyword
# arguments select which columns get encoded and which get outlier handling;
# the pipeline prints its own progress log as the cell output.
df_processed = preprocessor.preprocess(
    df, categorical_cols=categorical_cols, outlier_cols=outlier_cols
)



🚀 STARTING DATA PREPROCESSING

🔍 Handling Missing Values...

🔍 Removing Duplicates...
  ✓ Removed 0 duplicate rows

🔍 Encoding Categorical Variables...
  ✓ Encoded 'State' (20 unique values)
  ✓ Encoded 'City' (42 unique values)
  ✓ Encoded 'Locality' (500 unique values)
  ✓ Encoded 'Property_Type' (3 unique values)
  ✓ Encoded 'Furnished_Status' (3 unique values)
  ✓ Encoded 'Facing' (4 unique values)
  ✓ Encoded 'Owner_Type' (3 unique values)
  ✓ Encoded 'Availability_Status' (2 unique values)
  ✓ Encoded 'Security' (2 unique values)

🔍 Handling Outliers...
  ✓ Capped 0 outliers in 'Price_in_Lakhs'
  ✓ Capped 0 outliers in 'Size_in_SqFt'
  ✓ Capped 0 outliers in 'Floor_No'
  ✓ Capped 0 outliers in 'Total_Floors'

✅ PREPROCESSING COMPLETED


In [6]:
# Sanity-check the pipeline output: frame dimensions, total count of null
# cells across all columns, and number of fully repeated rows.
# The heading's leading character was mis-encoded (UTF-8 bytes shown through a
# legacy single-byte codec); it is restored to the intended chart emoji.
print("\n📊 Processed Data Info:")
print(f"Shape: {df_processed.shape}")
print(f"Missing Values: {df_processed.isnull().sum().sum()}")
print(f"Duplicates: {df_processed.duplicated().sum()}")

# Preview the first rows so the encoded columns can be inspected by eye.
df_processed.head()



📊 Processed Data Info:
Shape: (250000, 23)
Missing Values: 0
Duplicates: 0


Unnamed: 0,ID,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,...,Age_of_Property,Nearby_Schools,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Amenities,Facing,Owner_Type,Availability_Status
0,1,15,6,483,0,1,4740,489.76,0.1,1990,...,35,10,3,High,No,0,"Playground, Gym, Garden, Pool, Clubhouse",3,2,0
1,2,11,33,434,1,3,2364,195.52,0.08,2008,...,17,8,1,Low,No,1,"Playground, Clubhouse, Pool, Gym, Garden",1,1,1
2,3,13,25,75,0,2,3642,183.79,0.05,1997,...,28,9,8,Low,Yes,0,"Clubhouse, Pool, Playground, Gym",2,0,0
3,4,14,21,326,1,2,2741,300.29,0.11,1991,...,34,5,7,High,Yes,1,"Playground, Clubhouse, Gym, Pool, Garden",1,1,0
4,5,14,19,407,2,4,4823,182.9,0.04,2002,...,23,4,9,Low,No,1,"Playground, Garden, Gym, Pool, Clubhouse",0,1,0


In [7]:
# Write the processed frame to CSV; index=False keeps the pandas row index
# out of the file.
df_processed.to_csv('../data/processed/preprocessed_data.csv', index=False)
# The confirmation's leading character was mis-encoded; it is restored to the
# intended check-mark emoji. Note the message omits the leading '../' of the
# path actually written above.
print("\n✅ Processed data saved to 'data/processed/preprocessed_data.csv'")

# Persist the preprocessor's encoders to disk (presumably the mappings fitted
# during `preprocess`, so later steps can reuse them — confirm in
# src.data_preprocessing).
preprocessor.save_encoders('../models/encoders.pkl')



✅ Processed data saved to 'data/processed/preprocessed_data.csv'

💾 Saved encoders to '../models/encoders.pkl'
