In [1]:
import numpy as np
import pandas as pd
import pickle 

In [2]:
df = pd.read_csv('cleandata.csv')  # Replace with your actual file path
print(df)

      Unnamed: 0             location  total_sqft  bath  price  bhk
0              0  1st Block Jayanagar      2850.0   4.0  428.0    4
1              1  1st Block Jayanagar      1630.0   3.0  194.0    3
2              2  1st Block Jayanagar      1875.0   2.0  235.0    3
3              3  1st Block Jayanagar      1200.0   2.0  130.0    3
4              4  1st Block Jayanagar      1235.0   2.0  148.0    2
...          ...                  ...         ...   ...    ...  ...
7355       10292                other      1200.0   2.0   70.0    2
7356       10293                other      1800.0   1.0  200.0    1
7357       10296                other      1353.0   2.0  110.0    2
7358       10297                other       812.0   1.0   26.0    1
7359       10300                other      3600.0   5.0  400.0    4

[7360 rows x 6 columns]


In [4]:
X = df.drop('price', axis=1)  # Drop the 'price' column, keeping the rest as features
y = df['price']  # Target variable (house price)


In [5]:
# One-hot encode categorical variables
X = pd.get_dummies(X, columns=['location'], drop_first=True)

# Save the list of feature columns
feature_columns = X.columns.tolist()

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [7]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)


In [9]:
# Evaluate the model
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error on Test Set: {rmse}")

Root Mean Squared Error on Test Set: 40.616135153106995


In [10]:
# Prediction function based on user input
def predict_price(location, sqft, bath, bhk):
    # Create a zero vector for features
    x_input = np.zeros(len(X.columns))
    
    # Assign values based on input features
    x_input[0] = sqft  # Assuming 'total_sqft' is the first column
    x_input[1] = bath  # Assuming 'bath' is the second column
    x_input[2] = bhk   # Assuming 'phk' (number of BHK) is the third column
    
    # Set the location's one-hot encoded column to 1
    if location in X.columns:
        loc_index = np.where(X.columns == location)[0][0]
        x_input[loc_index] = 1
    
    # Predict the price
    return model.predict([x_input])[0]


In [11]:
# Example prediction
price_prediction = predict_price('Sarjapur  Road', 1200, 2, 2)
print(f"Predicted Price: {price_prediction}")

Predicted Price: 102.01737992590877




In [12]:
# Save the model and feature columns
with open('house_price_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [13]:
# Example prediction
price_prediction = predict_price('silk board', 15000, 8, 5)
print(f"Predicted Price: {price_prediction}")

Predicted Price: 135.60019756209869


