In [26]:
# Step 1: Import libraries
import pandas as pd

# Step 2: Read the housing CSV into a DataFrame
file_path = '/content/Housing.csv'  # Location of the uploaded file on Google Colab
data = pd.read_csv(filepath_or_buffer=file_path)

# Step 3: Preview the first ten rows to see the columns and how values are written
data.head(n=10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,semi-furnished
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,unfurnished
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished


1. Data preprocessing: convert the yes/no columns to numeric 1/0 values.
2. Reload the data with automatic NaN detection disabled and confirm that no NaN values remain after the yes/no conversion.

In [27]:
from sklearn.preprocessing import LabelEncoder

# Reload the dataset with the encoding specified and automatic NaN detection
# disabled, so every cell is kept as the literal text found in the file.
data = pd.read_csv(file_path, encoding='utf-8', na_filter=False)

# The yes/no columns, listed once so the inspection, validation and encoding
# steps below always work on the same set of columns.
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
yes_no_codes = {'yes': 1, 'no': 0}

# Show the raw values of each binary column before encoding
for column in binary_columns:
    print(f"Unique values in {column}: {data[column].unique()}")

# Series.map() replaces every value that is missing from the mapping with NaN,
# so stop here if a column holds anything other than 'yes'/'no' instead of
# letting NaNs slip into the features unnoticed.
for column in binary_columns:
    unexpected = data.loc[~data[column].isin(list(yes_no_codes)), column].unique().tolist()
    if unexpected:
        raise ValueError(f"Column '{column}' contains values other than 'yes'/'no': {unexpected}")

# Map 'yes'/'no' to 1/0
for column in binary_columns:
    data[column] = data[column].map(yes_no_codes)

# Encode the 'furnishingstatus' column with LabelEncoder. The encoder is kept
# in a variable so the label -> code assignment can be inspected afterwards.
furnishing_encoder = LabelEncoder()
data['furnishingstatus'] = furnishing_encoder.fit_transform(data['furnishingstatus'])

# predict_price() hard-codes furnished=0, semi-furnished=1, unfurnished=2, so
# make sure the fitted encoder really produced that assignment.
expected_labels = ['furnished', 'semi-furnished', 'unfurnished']
if list(furnishing_encoder.classes_) != expected_labels:
    raise ValueError(
        f"Unexpected furnishing labels {list(furnishing_encoder.classes_)}; expected {expected_labels}"
    )

# Confirm that preprocessing left no NaN values, then preview the encoded rows
print(data.isnull().sum())  # Every count should be 0
assert not data.isnull().values.any(), "NaN values remain after preprocessing"
data.head(10)



Unique values in mainroad: ['yes' 'no']
Unique values in guestroom: ['no' 'yes']
Unique values in basement: ['no' 'yes']
Unique values in hotwaterheating: ['no' 'yes']
Unique values in airconditioning: ['yes' 'no']
Unique values in prefarea: ['yes' 'no']
price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0
5,10850000,7500,3,3,1,1,0,1,0,1,2,1,1
6,10150000,8580,4,3,4,1,0,0,0,1,2,1,1
7,10150000,16200,5,3,2,1,0,0,0,0,0,0,2
8,9870000,8100,4,1,2,1,1,1,0,1,2,1,0
9,9800000,5750,3,2,4,1,1,0,0,1,1,1,2


Divide the dataset into training, validation, and test sets

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Separate the target (price) from the predictor columns
y = data['price']
X = data.drop(columns='price')

# First cut: keep 60% for training and hold out the remaining 40%
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Second cut: halve the held-out part into validation (20%) and test (20%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


Initialize and Train the Model

In [30]:
# Build a random forest of 100 trees with a fixed seed
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit the forest to the training features and prices
model.fit(X=X_train, y=y_train)

Evaluate the Model on Train, Validation, and Test Sets

In [31]:
def _score_split(features, target):
    """Predict one split with the fitted model and return (predictions, MSE, R²)."""
    predictions = model.predict(features)
    return predictions, mean_squared_error(target, predictions), r2_score(target, predictions)


# Score the training, validation and test splits in turn; the results keep
# their per-split names so they can be reused further down.
y_train_pred, train_mse, train_r2 = _score_split(X_train, y_train)
y_val_pred, val_mse, val_r2 = _score_split(X_val, y_val)
y_test_pred, test_mse, test_r2 = _score_split(X_test, y_test)

# Report both metrics for every split
print(f"Training MSE: {train_mse}, R²: {train_r2}")
print(f"Validation MSE: {val_mse}, R²: {val_r2}")
print(f"Test MSE: {test_mse}, R²: {test_r2}")


Training MSE: 149863717485.64218, R²: 0.9435460880752874
Validation MSE: 1698003258224.6528, R²: 0.6305637935002706
Test MSE: 2072463113860.414, R²: 0.5536534677178122


In [32]:
import joblib

# Persist the fitted model under two file names, 'best_model.joblib' and
# 'last_model.joblib'; both files hold the same model object.
joblib.dump(value=model, filename='best_model.joblib')
joblib.dump(value=model, filename='last_model.joblib')


['last_model.joblib']

Define a Function for User Input and Make Predictions

In [35]:
import pandas as pd

def _read_number(prompt, cast):
    """Read one answer, convert it with ``cast`` and reject negative or NaN values.

    Raises:
        ValueError: if the text cannot be converted by ``cast`` or the result
            is not a non-negative number.
    """
    value = cast(input(prompt))
    if not value >= 0:  # Written this way so float('nan') is rejected as well
        raise ValueError(f"Expected a non-negative number, got {value}.")
    return value


def _read_choice(prompt, codes):
    """Read one answer and return the integer code it has in ``codes``.

    The answer is trimmed and lower-cased before the lookup.

    Raises:
        ValueError: if the cleaned answer is not a key of ``codes``.
    """
    answer = input(prompt).strip().lower()
    if answer not in codes:
        raise ValueError(f"Invalid answer {answer!r}; expected one of: {', '.join(codes)}.")
    return codes[answer]


def predict_price():
    """Ask for the features of one house and print the model's predicted price.

    The answers are encoded the way the training data was encoded (yes/no as
    1/0, furnishing status as 0/1/2) and passed as a one-row DataFrame to the
    notebook-level ``model``. The notebook-level ``X`` is used only for its
    column names and order.

    Raises:
        ValueError: if a numeric answer cannot be parsed or is negative, or if
            a yes/no or furnishing answer is not one of the listed options.
            Invalid answers are rejected rather than replaced by a default, so
            a typo cannot silently change the prediction.
        KeyError: if ``X`` has a column this function does not ask for.
    """
    yes_no = {'yes': 1, 'no': 0}
    furnishing = {'furnished': 0, 'semi-furnished': 1, 'unfurnished': 2}

    # Dict entries are evaluated top to bottom, so the questions appear in this order
    features = {
        'area': _read_number("Enter area in square feet: ", float),
        'bedrooms': _read_number("Enter number of bedrooms: ", int),
        'bathrooms': _read_number("Enter number of bathrooms: ", int),
        'stories': _read_number("Enter number of stories: ", int),
        'mainroad': _read_choice("Is the property on the main road? (yes/no): ", yes_no),
        'guestroom': _read_choice("Does the property have a guest room? (yes/no): ", yes_no),
        'basement': _read_choice("Does the property have a basement? (yes/no): ", yes_no),
        'hotwaterheating': _read_choice("Does the property have hot water heating? (yes/no): ", yes_no),
        'airconditioning': _read_choice("Does the property have air conditioning? (yes/no): ", yes_no),
        'parking': _read_number("Enter number of parking spaces: ", int),
        'prefarea': _read_choice("Is it in a preferable area? (yes/no): ", yes_no),
        'furnishingstatus': _read_choice("Furnishing status (furnished, semi-furnished, unfurnished): ", furnishing),
    }

    # Select the columns by name in training order, so each value is tied to
    # its feature name instead of relying on list position.
    input_data = pd.DataFrame([features])[list(X.columns)]

    # Make a prediction
    predicted_price = model.predict(input_data)[0]
    print(f"Predicted Price: {predicted_price}")

# Run the prediction with user input
predict_price()



Enter area in square feet: 9100000
Enter number of bedrooms: 4
Enter number of bathrooms: 2
Enter number of stories: 2
Is the property on the main road? (yes/no): yes
Does the property have a guest room? (yes/no): yes
Does the property have a basement? (yes/no): yes
Does the property have hot water heating? (yes/no): no
Does the property have air conditioning? (yes/no): yes
Enter number of parking spaces: 1
Is it in a preferable area? (yes/no): yes
Furnishing status (furnished, semi-furnished, unfurnished): unfurnished
Predicted Price: 7074690.0


In [37]:
import pandas as pd

def predict_price():
    """Ask for the features of one house and print the model's predicted price.

    The answers are encoded the way the training data was encoded (yes/no as
    1/0, furnishing status as 0/1/2) and passed as a one-row DataFrame to the
    notebook-level ``model``. The notebook-level ``X`` is used only for its
    column names and order.

    Any invalid answer (unparsable or negative number, a yes/no answer that is
    neither 'yes' nor 'no', an unknown furnishing status) prints a message and
    ends the function without predicting, instead of being replaced by a
    default value.
    """

    def ask_number(prompt, name, cast, expected):
        # Convert the answer with `cast`; any failure becomes one uniform message
        text = input(prompt).strip()
        try:
            value = cast(text)
        except ValueError:
            raise ValueError(f"Invalid input for {name}. Please enter a {expected}.") from None
        if not value >= 0:  # Written this way so float('nan') is rejected as well
            raise ValueError(f"Invalid input for {name}. Please enter a non-negative number.")
        return value

    def ask_choice(prompt, name, codes):
        # Look up the trimmed, lower-cased answer; unknown answers are rejected
        answer = input(prompt).strip().lower()
        if answer not in codes:
            raise ValueError(f"Invalid input for {name}. Please enter one of: {', '.join(codes)}.")
        return codes[answer]

    yes_no = {'yes': 1, 'no': 0}
    furnishing = {'furnished': 0, 'semi-furnished': 1, 'unfurnished': 2}

    try:
        # Dict entries are evaluated top to bottom, so the questions appear in this order
        features = {
            'area': ask_number("Enter area in square feet (numeric only): ", 'area', float, 'numeric value'),
            'bedrooms': ask_number("Enter number of bedrooms: ", 'bedrooms', int, 'whole number'),
            'bathrooms': ask_number("Enter number of bathrooms: ", 'bathrooms', int, 'whole number'),
            'stories': ask_number("Enter number of stories: ", 'stories', int, 'whole number'),
            'mainroad': ask_choice("Is the property on the main road? (yes/no): ", 'mainroad', yes_no),
            'guestroom': ask_choice("Does the property have a guest room? (yes/no): ", 'guestroom', yes_no),
            'basement': ask_choice("Does the property have a basement? (yes/no): ", 'basement', yes_no),
            'hotwaterheating': ask_choice("Does the property have hot water heating? (yes/no): ", 'hotwaterheating', yes_no),
            'airconditioning': ask_choice("Does the property have air conditioning? (yes/no): ", 'airconditioning', yes_no),
            'parking': ask_number("Enter number of parking spaces: ", 'parking', int, 'whole number'),
            'prefarea': ask_choice("Is it in a preferable area? (yes/no): ", 'prefarea', yes_no),
            'furnishingstatus': ask_choice("Furnishing status (furnished, semi-furnished, unfurnished): ", 'furnishingstatus', furnishing),
        }
    except ValueError as error:
        print(error)
        return  # Exit the function as soon as one answer is invalid

    # Select the columns by name in training order, so each value is tied to
    # its feature name instead of relying on list position.
    input_data = pd.DataFrame([features])[list(X.columns)]

    # Make a prediction
    predicted_price = model.predict(input_data)[0]
    print(f"Predicted Price: {predicted_price}")

# Run the prediction with user input
predict_price()


Enter area in square feet (numeric only): 7420
Enter number of bedrooms: 4
Enter number of bathrooms: 3
Enter number of stories: 3
Is the property on the main road? (yes/no): yes
Does the property have a guest room? (yes/no): yes
Does the property have a basement? (yes/no): yes
Does the property have hot water heating? (yes/no): no
Does the property have air conditioning? (yes/no): yes
Enter number of parking spaces: 2
Is it in a preferable area? (yes/no): yes
Furnishing status (furnished, semi-furnished, unfurnished): furnished
Predicted Price: 8770673.8
