In [116]:
import importlib
import src.config
import src.preprocessing

# Re-import the project modules so that edits made on disk are picked up
# without restarting the kernel. Order is kept: config first, then preprocessing.
for _stale_module in (src.config, src.preprocessing):
    importlib.reload(_stale_module)

from src.preprocessing import preprocess_base

# Load the raw listings and run them through the shared base preprocessing step.
df = pd.read_csv("../data/raw/real_estate.csv")
df = preprocess_base(df)

print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" line to the output).
df.info()
df.head()


(250000, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 18 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   state                           250000 non-null  object 
 1   city                            250000 non-null  object 
 2   locality                        250000 non-null  object 
 3   property_type                   250000 non-null  object 
 4   bhk                             250000 non-null  int64  
 5   size_in_sqft                    250000 non-null  int64  
 6   price_in_lakhs                  250000 non-null  float64
 7   year_built                      250000 non-null  int64  
 8   furnished_status                250000 non-null  object 
 9   nearby_schools                  250000 non-null  int64  
 10  nearby_hospitals                250000 non-null  int64  
 11  public_transport_accessibility  250000 non-null  object 
 12  par

Unnamed: 0,state,city,locality,property_type,bhk,size_in_sqft,price_in_lakhs,year_built,furnished_status,nearby_schools,nearby_hospitals,public_transport_accessibility,parking_space,security,amenities,facing,owner_type,availability_status
0,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,1990,Furnished,10,3,High,No,No,"Playground, Gym, Garden, Pool, Clubhouse",West,Owner,Ready_to_Move
1,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,2008,Unfurnished,8,1,Low,No,Yes,"Playground, Clubhouse, Pool, Gym, Garden",North,Builder,Under_Construction
2,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,1997,Semi-furnished,9,8,Low,Yes,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_to_Move
3,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,1991,Furnished,5,7,High,Yes,Yes,"Playground, Clubhouse, Gym, Pool, Garden",North,Builder,Ready_to_Move
4,Rajasthan,Jaipur,Locality_466,Villa,4,4823,182.9,2002,Semi-furnished,4,9,Low,No,Yes,"Playground, Garden, Gym, Pool, Clubhouse",East,Builder,Ready_to_Move


In [117]:
from datetime import date
# Reference year for the age feature.
# NOTE(review): taken from the system clock, so the derived ages change with
# the year in which the notebook is executed.
CURRENT_YEAR = date.today().year

# Age in years = reference year minus construction year.
df['age_of_property'] = df['year_built'].rsub(CURRENT_YEAR)
df['age_of_property'].describe()

count    250000.000000
mean         19.479988
std           9.808575
min           3.000000
25%          11.000000
50%          19.000000
75%          28.000000
max          36.000000
Name: age_of_property, dtype: float64

In [118]:
# Scale the price column by 100000 (it is expressed in lakhs), then divide by
# the floor area to get a per-square-foot price.
price_scaled = df['price_in_lakhs'].mul(100000)
df['price_per_sqft'] = price_scaled.div(df['size_in_sqft'])
df['price_per_sqft'].describe()

count    250000.000000
mean      13058.281776
std       13071.850480
min         202.247191
25%        4802.839710
50%        9244.747594
75%       15987.388517
max       99182.000000
Name: price_per_sqft, dtype: float64

In [119]:
# Ordinal encoding of the accessibility labels: position in the tuple is the score.
transport_map = {
    level: rank for rank, level in enumerate(("Low", "Medium", "High"))
}

# Labels missing from the mapping become NaN.
accessibility = df['public_transport_accessibility']
df['transport_score'] = accessibility.map(transport_map)

# dropna=False keeps a NaN bucket in the tally so unmapped labels stay visible.
df['transport_score'].value_counts(dropna=False)

transport_score
2    83705
0    83287
1    83008
Name: count, dtype: int64

## Engineer Amenity Strength

In [120]:
def _count_amenities(value) -> int:
    """Count the amenities listed in one comma-separated cell.

    Parameters
    ----------
    value : the raw cell value from the ``amenities`` column.

    Returns
    -------
    int
        Number of non-empty, comma-separated entries. Missing values and
        blank strings yield 0 (the previous ``len(str(x).split(','))`` form
        reported 1 for both, because ``str(nan)`` is ``'nan'`` and
        ``''.split(',')`` is ``['']``).
    """
    if pd.isna(value):
        return 0
    # Ignore empty tokens produced by blank cells or stray/trailing commas.
    return sum(1 for token in str(value).split(',') if token.strip())


df['amenity_count'] = df['amenities'].apply(_count_amenities)
df['amenity_count'].describe()

count    250000.000000
mean          3.000288
std           1.414284
min           1.000000
25%           2.000000
50%           3.000000
75%           4.000000
max           5.000000
Name: amenity_count, dtype: float64

## City Benchmarks

In [121]:
# Per-city medians, broadcast back to one value per row via transform so they
# can be compared element-wise against the row-level columns.
listings_by_city = df.groupby('city')

city_median_pps = listings_by_city['price_per_sqft'].transform('median')
city_median_schools = listings_by_city['nearby_schools'].transform('median')
city_median_age = listings_by_city['age_of_property'].transform('median')
city_median_amenities = listings_by_city['amenity_count'].transform('median')

In [122]:
# Each *_value_score is a 0/1 flag: 1 when the listing sits on the favourable
# side of its benchmark.

# Strictly cheaper per sqft than the city median.
df['price_value_score'] = df['price_per_sqft'].lt(city_median_pps).astype(int)

# NOTE(review): transport is benchmarked against the dataset-wide median,
# whereas the other flags use per-city medians - confirm this is intended.
overall_transport_median = df['transport_score'].median()
df['transport_value_score'] = (
    df['transport_score'].ge(overall_transport_median).astype(int)
)

# At least as many nearby schools as the city median.
df['school_value_score'] = df['nearby_schools'].ge(city_median_schools).astype(int)

# No older than the city median.
df['age_value_score'] = df['age_of_property'].le(city_median_age).astype(int)

# At least as many amenities as the city median.
df['amenity_value_score'] = df['amenity_count'].ge(city_median_amenities).astype(int)

# Composite score: number of flags set (0-5).
value_flag_columns = [
    'price_value_score',
    'transport_value_score',
    'school_value_score',
    'age_value_score',
    'amenity_value_score',
]
df['investment_score'] = df[value_flag_columns].sum(axis=1)

## Classification Target: Good Investment

In [123]:
# Binary target: 1 when at least 3 of the 5 value flags are set.
meets_threshold = df['investment_score'].ge(3)
df['good_investment'] = meets_threshold.astype(int)

# Class balance as proportions.
df['good_investment'].value_counts(normalize=True)


good_investment
1    0.62496
0    0.37504
Name: proportion, dtype: float64

## Growth Rates

In [124]:
# City names grouped by tier; membership in these lists drives the tier label
# assigned to each listing's city.
tier_1 = [
    'Ahmedabad', 'Bangalore', 'Chennai', 'Hyderabad', 'Kolkata',
    'Mumbai', 'New Delhi', 'Pune', 'Gurgaon', 'Noida',
]
tier_2 = [
    'Amritsar', 'Bhopal', 'Bhubaneswar', 'Coimbatore', 'Dehradun',
    'Faridabad', 'Guwahati', 'Indore', 'Jaipur', 'Jamshedpur',
    'Kochi', 'Lucknow', 'Nagpur', 'Patna', 'Raipur',
    'Ranchi', 'Surat', 'Trivandrum', 'Vijayawada', 'Vishakhapatnam',
]
tier_3 = [
    'Bilaspur', 'Cuttack', 'Durgapur', 'Dwarka', 'Gaya', 'Haridwar',
    'Jodhpur', 'Ludhiana', 'Mangalore', 'Mysore', 'Silchar', 'Warangal',
]


In [125]:
# Growth rate per city tier, keyed by tier label; used as the per-year
# compounding rate in the future-price projection.
growth_rates = dict(
    tier_1=0.13,
    tier_2=0.15,
    tier_3=0.10,
)

## Compute Future Price

In [126]:
def assign_city_tier(city):
    """Return the tier label for ``city``.

    Cities found in ``tier_1`` map to ``'tier_1'`` and cities found in
    ``tier_2`` map to ``'tier_2'``. Every other value falls back to
    ``'tier_3'`` - including names that appear in none of the tier lists.
    """
    # Checked in priority order; first match wins.
    for label, members in (('tier_1', tier_1), ('tier_2', tier_2)):
        if city in members:
            return label
    return 'tier_3'
df['city_tier'] = df['city'].apply(assign_city_tier)

In [127]:
def compute_future_price(row, years=5):
    """Project one listing's price forward with compound annual growth.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) exposing ``'city_tier'`` and
        ``'price_in_lakhs'``.
    years : int, default 5
        Number of compounding periods. The default reproduces the original
        fixed five-year horizon.

    Returns
    -------
    ``price_in_lakhs * (1 + rate) ** years`` where ``rate`` is looked up in
    ``growth_rates`` by the row's tier.

    Raises
    ------
    KeyError
        If the row's ``city_tier`` has no entry in ``growth_rates``.
    """
    rate = growth_rates[row['city_tier']]
    return row['price_in_lakhs'] * ((1 + rate) ** years)


# Column-wise equivalent of df.apply(compute_future_price, axis=1): look up the
# rate for every row in one pass instead of calling a Python function per row.
tier_growth = df['city_tier'].map(growth_rates)
if tier_growth.isna().any():
    # The row-wise dict lookup raised KeyError for an unknown tier; keep that
    # failure loud instead of silently writing NaN prices.
    unknown_tiers = sorted(
        df.loc[tier_growth.isna(), 'city_tier'].astype(str).unique()
    )
    raise KeyError(f"No growth rate defined for city tier(s): {unknown_tiers}")
df['future_price_5y'] = df['price_in_lakhs'] * (1 + tier_growth) ** 5

In [128]:
# Side-by-side look at current vs. projected price for the first rows.
preview_columns = ['price_in_lakhs', 'future_price_5y']
df[preview_columns].head()

Unnamed: 0,price_in_lakhs,future_price_5y
0,489.76,902.351053
1,195.52,360.232926
2,183.79,295.995633
3,300.29,483.620048
4,182.9,367.87723


In [129]:
# Persist the engineered frame; index=False keeps the row index out of the CSV.
processed_csv_path = "../data/processed/real_estate_processed.csv"
df.to_csv(processed_csv_path, index=False)
