# Step 1: Import libraries


In [42]:

import pandas as pd
import numpy as np
from pathlib import Path
import joblib

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Step 2: Load data

In [43]:
# Absolute path to the raw land-price CSV
raw_csv_path = r"C:\Users\sheri\land-price-predictor\data\land_prices.csv"
df = pd.read_csv(raw_csv_path)


In [44]:
 
# Peek at the first five rows to sanity-check the load
df.head(n=5)



Unnamed: 0,property_id,location_name,taluk,village,latitude,longitude,land_area_cents,land_type,distance_to_school_km,distance_to_airport_km,distance_to_railway_station_km,distance_to_hospital_km,distance_to_medical_college_km,distance_to_bus_stop_km,distance_to_market_km,price_lakhs
0,1,Chevayur,Kozhikode,Chevayur,11.278,75.814,5.5,Residential,0.8,26.5,5.1,1.2,2.5,0.4,1.1,16.5
1,2,Medical College Area,Kozhikode,Kozhikode,11.282,75.833,4.0,Commercial,0.5,27.1,6.5,0.3,0.5,0.2,0.8,22.0
2,3,Kunnamangalam,Kozhikode,Kunnamangalam,11.319,75.879,10.0,Residential,1.5,25.0,12.0,3.0,5.0,0.8,1.5,25.0
3,4,Feroke,Kozhikode,Feroke,11.183,75.833,8.0,Residential,1.2,18.0,7.5,2.5,12.0,0.7,1.8,20.0
4,5,Balussery,Koyilandy,Balussery,11.445,75.828,15.0,Agricultural,2.5,38.0,18.0,3.5,20.0,1.2,2.0,18.0


In [45]:
# List the column labels of the loaded frame
df.columns


Index(['property_id', 'location_name', 'taluk', 'village', 'latitude',
       'longitude', 'land_area_cents', 'land_type', 'distance_to_school_km',
       'distance_to_airport_km', 'distance_to_railway_station_km',
       'distance_to_hospital_km', 'distance_to_medical_college_km',
       'distance_to_bus_stop_km', 'distance_to_market_km', 'price_lakhs'],
      dtype='object')

In [46]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() appends a stray "None" line.
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   property_id                     50 non-null     int64  
 1   location_name                   50 non-null     object 
 2   taluk                           50 non-null     object 
 3   village                         50 non-null     object 
 4   latitude                        50 non-null     float64
 5   longitude                       50 non-null     float64
 6   land_area_cents                 50 non-null     float64
 7   land_type                       50 non-null     object 
 8   distance_to_school_km           50 non-null     float64
 9   distance_to_airport_km          50 non-null     float64
 10  distance_to_railway_station_km  50 non-null     float64
 11  distance_to_hospital_km         50 non-null     float64
 12  distance_to_medical_college_km  50 non

In [47]:
# Count the missing values in every column
missing_per_column = df.isna().sum()
print(missing_per_column)

property_id                       0
location_name                     0
taluk                             0
village                           0
latitude                          0
longitude                         0
land_area_cents                   0
land_type                         0
distance_to_school_km             0
distance_to_airport_km            0
distance_to_railway_station_km    0
distance_to_hospital_km           0
distance_to_medical_college_km    0
distance_to_bus_stop_km           0
distance_to_market_km             0
price_lakhs                       0
dtype: int64


# Handle rare categories 

In [48]:
# Collapse infrequent category labels into a single 'Other' bucket
RARE_CATEGORY_MIN_COUNT = 3  # labels seen fewer times than this count as rare


def group_rare_categories(series, min_count=RARE_CATEGORY_MIN_COUNT, other_label='Other'):
    """Return a copy of ``series`` with infrequent labels replaced by ``other_label``.

    Parameters
    ----------
    series : pandas.Series
        Column of category labels to clean.
    min_count : int, optional
        Labels occurring fewer than ``min_count`` times are replaced.
    other_label : str, optional
        Value substituted for every rare label.

    Returns
    -------
    pandas.Series
        New series with the same index; frequent labels and missing values
        are left unchanged.
    """
    label_counts = series.value_counts()
    rare_labels = label_counts[label_counts < min_count].index
    # where() keeps entries for which the condition is True and substitutes the rest
    return series.where(~series.isin(rare_labels), other_label)


# NOTE(review): the counts are taken over the full dataset (before the
# train-test split) and this step is not part of the saved pipeline, so raw
# labels passed at prediction time are not bucketed the same way -- confirm
# this is acceptable.
for column in ('village', 'location_name'):
    df[column] = group_rare_categories(df[column])


# Feature engineering

In [49]:
# Accessibility score: summed distance (km) to school, market, hospital and
# bus stop. Because it is a sum of distances, a smaller score means better access.
df['accessibility_score'] = (
    df['distance_to_school_km']
    .add(df['distance_to_market_km'])
    .add(df['distance_to_hospital_km'])
    .add(df['distance_to_bus_stop_km'])
)

# log(1 + x) of the land area to reduce skew
land_area = df['land_area_cents']
df['log_land_area'] = np.log1p(land_area)

# The raw land_area_cents column is deliberately left in df alongside the log
# version; drop it at this point if only the transformed value should remain.


# Separate Features & Target

In [50]:
# Split the frame into the prediction target and everything else
target_column = "price_lakhs"
y = df[target_column]
X = df.drop(columns=target_column)


# Define categorical columns

In [51]:
# Text-valued columns handed to the one-hot encoder
categorical_features = [
    "location_name",
    "taluk",
    "village",
    "land_type",
]


# Define numerical columns

In [52]:
numerical_features = [
    # coordinates
    "latitude",
    "longitude",
    # distances to amenities (km)
    "distance_to_school_km",
    "distance_to_airport_km",
    "distance_to_railway_station_km",
    "distance_to_hospital_km",
    "distance_to_medical_college_km",
    "distance_to_bus_stop_km",
    "distance_to_market_km",
    # engineered columns
    "accessibility_score",
    "log_land_area",
]


# Preprocessing

In [53]:

# Standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown="ignore" makes categories unseen during fit encode as all
# zeros instead of raising at transform time.
numeric_scaler = StandardScaler()
category_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_scaler, numerical_features),
        ("cat", category_encoder, categorical_features),
    ]
)


# Train–test split

In [54]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# Create the full ML pipeline (preprocessing + Random Forest regressor)

In [58]:

# Random forest regressor with a depth cap and minimum node sizes
forest = RandomForestRegressor(
    n_estimators=300,
    max_depth=8,  # cap tree depth to limit overfitting
    min_samples_split=3,
    min_samples_leaf=2,
    random_state=42,
)

# Chain preprocessing and the regressor into a single estimator
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", forest),
    ]
)




# Train the model

In [59]:

# Fit the preprocessing steps and the regressor on the training rows only
pipeline.fit(X=X_train, y=y_train)


# Evaluate the model

In [60]:
# Score the fitted pipeline on the held-out rows
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae, rmse, r2 = (
    mean_absolute_error(y_test, y_pred),
    np.sqrt(mse),  # RMSE is the square root of the mean squared error
    r2_score(y_test, y_pred),
)

# One print call, one metric per line
print(
    f"MAE  : {mae:.2f} lakhs",
    f"RMSE : {rmse:.2f} lakhs",
    f"R²   : {r2:.3f}",
    sep="\n",
)


MAE  : 4.68 lakhs
RMSE : 6.06 lakhs
R²   : 0.229


# Save model 

In [61]:
import joblib  # already imported in the first cell; repeating it here is a no-op

# Destination file for the serialized pipeline
MODEL_PATH = r"C:\Users\sheri\land-price-predictor\model\land_price_model_rf.pkl"

# joblib.dump does not create missing folders, so make sure the target
# directory exists before writing (no effect when it is already there).
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + model) as one artifact
joblib.dump(pipeline, MODEL_PATH)
print(f"✅ Model saved at {MODEL_PATH}")



✅ Model saved at C:\Users\sheri\land-price-predictor\model\land_price_model_rf.pkl
