In [5]:
# =========================================================
#               SMART MODEL LOADER WITH FIX
# =========================================================

import pickle
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

# Single fixed location of the pickled estimator: read here, written after training
FIXED_PKL_FILENAME = "gurgaon_property_predictor.pkl"

print("CHECKING FOR EXISTING MODEL")
print("=" * 50)

# Remains None unless the pickle on disk yields an object exposing `predict`
model = None

if not os.path.exists(FIXED_PKL_FILENAME):
    # Nothing cached yet
    print(f"Model file '{FIXED_PKL_FILENAME}' not found")
    print("Training new model...")
else:
    print(f"Model file '{FIXED_PKL_FILENAME}' found")
    print("Checking model validity...")

    try:
        # NOTE: unpickling can execute arbitrary code - only load trusted files
        with open(FIXED_PKL_FILENAME, "rb") as pkl_file:
            loaded_content = pickle.load(pkl_file)

        # Duck-type check: anything without `predict` is not usable as a model
        if not hasattr(loaded_content, 'predict'):
            print(f"File contains {type(loaded_content)}, not a trained model")
            print("Training new model...")
        else:
            model = loaded_content
            print("Valid model loaded successfully")

    except Exception as load_error:
        # Unreadable or corrupt file: report it and leave `model` as None
        print(f"Error loading model: {load_error}")
        print("Training new model...")

# Train new model if needed.
# Runs only when no usable model was loaded. Reads the CSV, coerces text
# columns that hold formatted numbers (commas / rupee sign) to numeric,
# picks a target and up to three feature columns, fits a LinearRegression,
# prints hold-out metrics and pickles the fitted model.
# Leaves `X_cols` and `X` defined at module level for the predictor.
if model is None:
    print("\nTRAINING NEW MODEL")
    print("=" * 50)

    # Load dataset
    file_path = "data of gurugram real Estate 2024.csv"
    data = pd.read_csv(file_path)

    print(f"Dataset shape: {data.shape}")
    print(f"Dataset columns: {list(data.columns)}")

    # Data cleaning. Take an explicit copy: the loop below assigns columns on
    # data_clean, and doing that on the direct result of dropna() can raise
    # pandas' chained-assignment (SettingWithCopy) warning.
    data_clean = data.dropna().copy()

    # Clean numeric columns stored as text
    for col in data_clean.columns:
        column = data_clean[col]
        # Accept both classic object columns and pandas string dtypes
        is_text = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        )
        if not is_text:
            continue

        # regex=False: ',' and the rupee sign are literal characters
        cleaned = (
            column.astype(str)
            .str.replace(',', '', regex=False)
            .str.replace('₹', '', regex=False)
            .str.strip()
        )
        try:
            data_clean[col] = pd.to_numeric(cleaned)
        except (ValueError, TypeError):
            # Genuinely non-numeric text: keep the stripped strings
            data_clean[col] = cleaned

    # Find numeric columns
    numeric_cols = data_clean.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        raise ValueError(
            f"No numeric columns found in '{file_path}'; cannot train a price model"
        )

    # Find target column: first numeric column whose name looks price-like
    price_columns = [
        col for col in numeric_cols
        if any(word in str(col).lower() for word in ['price', 'rate', 'cost'])
    ]
    if price_columns:
        y_col = price_columns[0]
        print(f"Target column: {y_col}")
    else:
        y_col = numeric_cols[-1]
        print(f"No price column found. Using: {y_col}")

    # Feature columns - use simple features that exist
    simple_features = ['Area', 'BHK', 'Bathroom']
    X_cols = [col for col in simple_features if col in numeric_cols and col != y_col]

    # If simple features not found, use available numeric columns
    if not X_cols:
        remaining = [col for col in numeric_cols if col != y_col]
        # Prefer columns that are not price-like themselves: a second price
        # column (e.g. a per-sqft rate) would leak the target into the model.
        non_price = [col for col in remaining if col not in price_columns]
        X_cols = (non_price or remaining)[:3]  # Use first 3 features

    if not X_cols:
        raise ValueError(
            f"No numeric feature columns besides target '{y_col}'; cannot train"
        )

    print(f"Using features: {X_cols}")

    # Prepare data
    X = data_clean[X_cols]
    y = data_clean[y_col]

    # Remove NaN values (to_numeric can still yield NaN, e.g. for the text 'nan')
    valid_indices = X.notna().all(axis=1) & y.notna()
    X = X[valid_indices]
    y = y[valid_indices]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"Training samples: {X_train.shape[0]}")
    print(f"Testing samples: {X_test.shape[0]}")

    # Train model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate model on the held-out 20%
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    print("Model trained successfully")
    print(f"R2 Score: {r2:.4f}")
    print(f"Mean Absolute Error: {mae:,.2f}")

    # Save model, overwriting whatever was at the fixed path
    with open(FIXED_PKL_FILENAME, "wb") as file:
        pickle.dump(model, file)
    print(f"Model saved as: {FIXED_PKL_FILENAME}")

# =========================================================
#               PRICE PREDICTOR
# =========================================================

# Section banner: blank line, rule, title, rule - emitted in a single call
print("\n" + "=" * 50, "GURGAON HOUSE PRICE PREDICTOR", "=" * 50, sep="\n")

def predict_house_price():
    """Prompt for property details and print the model's price estimate.

    Reads area, BHK count and bathroom count from stdin, builds a one-row
    DataFrame with the columns the model expects, and prints the prediction.
    Returns None; every failure is reported with a printed message rather
    than raised.

    Uses the module-level ``model``. Feature names come from
    ``model.feature_names_in_`` when the estimator exposes it, so a model
    loaded from disk works even when the training-only globals ``X_cols``
    and ``X`` were never created in this session. ``X_cols`` is the fallback;
    ``X`` is needed only to fill a feature the user is not asked for.
    """
    try:
        print("\nEnter Property Details:")

        # Only input parsing should produce the "valid numbers" message;
        # errors from the model are reported separately below.
        try:
            area = float(input("Area (in sq. ft.): "))
            bhk = int(input("Number of BHK: "))
            bathroom = int(input("Number of Bathrooms: "))
        except ValueError:
            print("Error: Please enter valid numbers")
            return

        # float() accepts 'nan' and 'inf'; the chained comparison rejects
        # both along with non-positive areas. Counts must not be negative.
        if not (0 < area < float("inf")) or bhk < 0 or bathroom < 0:
            print("Error: Please enter valid numbers")
            return

        # Map inputs to the feature names that may have been used in training.
        # 'BHK_Count' is the spelling in the dataset header - presumably the
        # same quantity as BHK; confirm against the data.
        feature_mapping = {
            'Area': area,
            'BHK': bhk,
            'BHK_Count': bhk,
            'Bathroom': bathroom,
        }

        # Prefer the names recorded on the fitted estimator
        feature_names = getattr(model, "feature_names_in_", None)
        if feature_names is None:
            try:
                feature_names = X_cols
            except NameError:
                raise RuntimeError(
                    "cannot determine which features the model was trained on"
                ) from None
        feature_names = list(feature_names)

        # Use only the features that were actually used in training
        input_data = {}
        for feature in feature_names:
            if feature in feature_mapping:
                input_data[feature] = [feature_mapping[feature]]
                continue
            # For other features, use the training mean when it is available
            try:
                input_data[feature] = [X[feature].mean()]
            except NameError:
                raise RuntimeError(
                    f"no input or training data available for feature '{feature}'"
                ) from None

        input_df = pd.DataFrame(input_data)

        # Make prediction
        predicted_price = model.predict(input_df)[0]

        # Display result
        print("\n" + "=" * 50)
        print("PREDICTION RESULT")
        print("=" * 50)
        print("Property Details:")
        print(f"Area: {area:,} sq. ft.")
        print(f"BHK: {bhk}")
        print(f"Bathrooms: {bathroom}")
        # Show what the model really consumed; it may ignore some inputs
        print(f"Features used by model: {', '.join(map(str, feature_names))}")
        print(f"Estimated Price: Rs{predicted_price:,.2f}")
        print("=" * 50)

    except Exception as e:
        print(f"Prediction error: {e}")

# Run predictor once, immediately, at module/cell level (it prompts on stdin).
# predict_house_price() reports its own errors via print, so the completion
# line below is printed even when the prediction itself failed.
predict_house_price()
print("\nPrediction completed")

CHECKING FOR EXISTING MODEL
Model file 'gurgaon_property_predictor.pkl' found
Checking model validity...
File contains <class 'numpy.ndarray'>, not a trained model
Training new model...

TRAINING NEW MODEL
Dataset shape: (19515, 12)
Dataset columns: ['Price', 'Status', 'Area', 'Rate per sqft', 'Property Type', 'Locality', 'Builder Name', 'RERA Approval', 'BHK_Count', 'Socity', 'Company Name', 'Flat Type']
Target column: Price
Using features: ['Area']
Training samples: 15612
Testing samples: 3903
Model trained successfully
R2 Score: 0.0521
Mean Absolute Error: 26,942,056.91
Model saved as: gurgaon_property_predictor.pkl

GURGAON HOUSE PRICE PREDICTOR

Enter Property Details:


Area (in sq. ft.):  500
Number of BHK:  4
Number of Bathrooms:  5



PREDICTION RESULT
Property Details:
Area: 500.0 sq. ft.
BHK: 4
Bathrooms: 5
Estimated Price: Rs38,306,399.70

Prediction completed
