In [1]:
# src/data_prep.py
import pandas as pd
import numpy as np

def load_data(path):
    """Read the CSV at ``path`` into a DataFrame.

    Parameters
    ----------
    path : anything accepted by ``pandas.read_csv`` (file path or file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed contents, with pandas' default parsing options.
    """
    return pd.read_csv(path)

def basic_clean(df):
    """Trim column names, drop duplicate rows and coerce known numeric columns.

    Steps, in order:
      1. Strip leading/trailing whitespace from string column labels.
      2. Drop duplicates: by ``'ID'`` (keeping the first occurrence) when that
         column exists, otherwise by entire row.
      3. Convert the known numeric columns with ``pd.to_numeric``; values that
         cannot be parsed become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # Standardize column names BEFORE looking for 'ID'; otherwise a padded
    # header such as ' ID ' is missed and deduplication silently degrades to
    # whole-row comparison. Non-string labels are passed through unchanged.
    cleaned = df.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)

    # Drop duplicates
    if 'ID' in cleaned.columns:
        cleaned = cleaned.drop_duplicates(subset='ID', keep='first')
    else:
        cleaned = cleaned.drop_duplicates()
    # Explicit copy so the column assignments below operate on an independent
    # frame rather than on the filtered result of drop_duplicates.
    cleaned = cleaned.copy()

    # Convert numeric columns
    numeric_columns = [
        'Price_in_Lakhs', 'Price_per_SqFt', 'Size_in_SqFt', 'BHK',
        'Year_Built', 'Floor_No', 'Total_Floors',
    ]
    for col in numeric_columns:
        if col in cleaned.columns:
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
    return cleaned

def impute_missing(df):
    """Fill missing values column by column, modifying ``df`` in place.

    * Columns selected by ``select_dtypes(include=['float', 'int'])`` are
      filled with the column median.
    * Columns selected by ``select_dtypes(include=['object'])`` are filled
      with the first mode, or the literal ``'Unknown'`` when the column has
      no mode (i.e. it contains no non-missing values).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; its columns are reassigned.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    # numeric impute with medians
    num_cols = df.select_dtypes(include=['float', 'int']).columns
    for c in num_cols:
        # Assign the result back instead of `df[c].fillna(..., inplace=True)`:
        # the in-place form acts on a temporary Series and does not update
        # `df` when pandas Copy-on-Write is active.
        df[c] = df[c].fillna(df[c].median())

    # categorical impute with mode or 'Unknown'
    cat_cols = df.select_dtypes(include=['object']).columns
    for c in cat_cols:
        modes = df[c].mode()  # computed once per column
        fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
        df[c] = df[c].fillna(fill_value)
    return df

def feature_engineering(df, current_year=2025):
    """Add derived feature columns to ``df`` in place and return it.

    Columns written (each only when its inputs are available, except the two
    binary flags which are always written):

    * ``Price_per_SqFt``  - ``Price_in_Lakhs * 1e5 / Size_in_SqFt``; only
      computed when the column does not already exist. A size of 0 gives NaN.
    * ``Age_of_Property`` - ``current_year - Year_Built``; only computed when
      the column does not already exist. Negative ages are replaced by the
      median of the computed ages.
    * ``Amenities_count`` - number of comma-separated entries in ``Amenities``
      (0 for missing/empty).
    * ``has_parking``     - 1 when ``Parking_Space`` is a number >= 1 or one of
      the strings yes / y / true (case-insensitive); 0 otherwise, and 0 for
      every row when the column is absent.
    * ``is_ready_to_move`` - 1 when ``Availability_Status`` contains "ready"
      (case-insensitive); 0 otherwise, and 0 for every row when the column is
      absent.
    * ``pps_vs_city_median`` - ``Price_per_SqFt`` divided by the median
      ``Price_per_SqFt`` of the row's ``City``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; new columns are added to this object.
    current_year : int, default 2025
        Reference year used for the property age.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    columns = df.columns

    if ('Price_per_SqFt' not in columns
            and 'Price_in_Lakhs' in columns
            and 'Size_in_SqFt' in columns):
        size = df['Size_in_SqFt']
        # Price is scaled by 1e5 (1 lakh = 100,000), so the result is in base
        # currency units per square foot. Zero sizes are masked to NaN so the
        # division cannot produce +/-inf.
        df['Price_per_SqFt'] = (df['Price_in_Lakhs'] * 1e5) / size.where(size != 0)

    if 'Age_of_Property' not in columns and 'Year_Built' in columns:
        age = current_year - df['Year_Built']
        # Year_Built later than current_year gives a negative age; substitute
        # the median. `mask` upcasts the dtype when the median is fractional,
        # which a `.loc` assignment into an integer column cannot do safely.
        df['Age_of_Property'] = age.mask(age < 0, age.median())

    # amenities count
    if 'Amenities' in columns:
        df['Amenities_count'] = df['Amenities'].fillna('').apply(
            lambda x: len(str(x).split(',')) if x != '' else 0
        )

    # binary flags
    if 'Parking_Space' in columns:
        parking = df['Parking_Space']
        # Numeric counts: non-numeric text becomes NaN and is treated as 0.
        # `>= 1` matches the truncating `int(x) > 0` test used for numbers.
        numeric_parking = pd.to_numeric(parking, errors='coerce').fillna(0)
        # Textual flags such as "Yes"/"No" would make int(x) raise.
        affirmative = parking.astype(str).str.strip().str.lower().isin(['yes', 'y', 'true'])
        df['has_parking'] = (numeric_parking.ge(1) | affirmative).astype(int)
    else:
        df['has_parking'] = 0

    if 'Availability_Status' in columns:
        status = df['Availability_Status'].astype(str).str.lower()
        df['is_ready_to_move'] = status.str.contains('ready', regex=False, na=False).astype(int)
    else:
        # Without the column there is nothing to test; default the flag to 0
        # instead of failing on `''.str`.
        df['is_ready_to_move'] = 0

    # city median pps for relative feature
    if 'City' in df.columns and 'Price_per_SqFt' in df.columns:
        city_med = df.groupby('City')['Price_per_SqFt'].transform('median')
        df['pps_vs_city_median'] = df['Price_per_SqFt'] / city_med
    return df

def create_labels(df, city_growth_rates=None, global_rate=0.08, threshold_appreciation=0.20):
    """Add projected-price and ``Good_Investment`` label columns to ``df`` in place.

    Columns written: ``growth_rate``, ``Future_Price_5yr``, ``cheap_vs_city``,
    ``expected_appreciation``, ``mf_score`` and ``Good_Investment``.

    A row is labelled ``Good_Investment = 1`` when ANY of the following holds:
      * ``expected_appreciation >= threshold_appreciation``
      * ``Price_per_SqFt`` is at most 95% of its city's median
        (``cheap_vs_city == 1``)
      * ``mf_score >= 2``, where the score adds one point each for
        ``BHK >= 3``, ``is_ready_to_move == 1`` and a truthy ``RERA`` value
        ('yes', 'true' or '1', case-insensitive).

    Optional inputs (``City``, ``Price_per_SqFt``, ``BHK``,
    ``is_ready_to_move``, ``RERA``) that are missing simply contribute nothing.

    NOTE(review): ``expected_appreciation`` reduces to
    ``(1 + growth_rate) ** 5 - 1`` and does not depend on the price. With the
    defaults (0.08 -> about 0.47, threshold 0.20) every row with a non-zero
    price is labelled 1 - confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Price_in_Lakhs``.
    city_growth_rates : mapping, optional
        Passed to ``Series.map`` on ``City``; unmapped cities use
        ``global_rate``. Ignored when ``df`` has no ``City`` column.
    global_rate : float, default 0.08
        Yearly growth rate used as the fallback.
    threshold_appreciation : float, default 0.20
        Minimum 5-year relative appreciation for the first criterion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    has_city = 'City' in df.columns

    # create future price using deterministic formula (fallback)
    if city_growth_rates is None or not has_city:
        # No per-city information available: apply the global rate everywhere.
        df['growth_rate'] = global_rate
    else:
        df['growth_rate'] = df['City'].map(city_growth_rates).fillna(global_rate)
    df['Future_Price_5yr'] = df['Price_in_Lakhs'] * ((1 + df['growth_rate']) ** 5)

    # Cheap relative to the city: price per sq ft at or below 95% of the city median.
    if has_city and 'Price_per_SqFt' in df.columns:
        city_pps_med = df.groupby('City')['Price_per_SqFt'].transform('median')
        df['cheap_vs_city'] = (df['Price_per_SqFt'] <= 0.95 * city_pps_med).astype(int)
    else:
        df['cheap_vs_city'] = 0

    df['expected_appreciation'] = (df['Future_Price_5yr'] - df['Price_in_Lakhs']) / df['Price_in_Lakhs']

    # multi-factor score; each factor is guarded because comparing the scalar
    # default of `df.get(col, 0)` yields a plain bool, which has no `.astype`.
    score = pd.Series(0, index=df.index)
    if 'BHK' in df.columns:
        score = score + (df['BHK'] >= 3).astype(int)
    if 'is_ready_to_move' in df.columns:
        score = score + (df['is_ready_to_move'] == 1).astype(int)
    # RERA information is only read from a column literally named 'RERA'.
    if 'RERA' in df.columns:
        score = score + df['RERA'].astype(str).str.lower().isin(['yes', 'true', '1']).astype(int)
    df['mf_score'] = score

    df['Good_Investment'] = (
        (df['expected_appreciation'] >= threshold_appreciation)
        | (df['cheap_vs_city'] == 1)
        | (df['mf_score'] >= 2)
    ).astype(int)
    return df


In [3]:
# Use the %pip magic so the packages are installed into the environment of the running kernel.
%pip install -r requirements.txt




ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'c:\\Users\\ramka\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\docker\\_version.py'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting mlflow (from -r requirements.txt (line 7))
  Using cached mlflow-3.7.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.7.0 (from mlflow->-r requirements.txt (line 7))
  Using cached mlflow_skinny-3.7.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.7.0 (from mlflow->-r requirements.txt (line 7))
  Using cached mlflow_tracing-3.7.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow->-r requirements.txt (line 7))
  Using cached flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting cryptography<47,>=43.0.0 (from mlflow->-r requirements.txt (line 7))
  Using cached cryptography-46.0.3-cp311-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting docker<8,>=4.0.0 (from mlflow->-r requirements.txt (line 7))
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow->-r requirements.txt (line 7))
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting cloudpickle<4 (fr