In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pickle

# Train a RandomForestRegressor on the Chennai housing data and persist the
# fitted model, one-hot encoder and scaler so they can be reused for inference.
#
# All preprocessing statistics (imputation medians, scaler mean/std, encoder
# categories) are learned from the TRAINING rows only, so the held-out test
# rows do not leak into preprocessing and the reported metrics stay honest.

# Load dataset (the raw frame is left untouched so this cell can be re-run)
raw_df = pd.read_csv("Chennai houseing sale.csv")

# Column roles. The feature matrix is laid out as
# [scaled numerical columns..., one-hot categorical columns...].
categorical_cols = ["AREA", "SALE_COND", "PARK_FACIL"]
numerical_cols = ["INT_SQFT", "DIST_MAINROAD", "N_BEDROOM", "N_BATHROOM", "N_ROOM"]

# Select only the relevant columns (anything else in the CSV is simply not used)
df = raw_df[["AREA", "INT_SQFT", "DIST_MAINROAD", "N_BEDROOM", "N_BATHROOM", "N_ROOM", "SALE_COND", "PARK_FACIL", "SALES_PRICE"]]

# Decide the train/test partition BEFORE computing any statistics.
# Splitting row positions keeps the original row order available for X / y.
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# Handle missing values: fill numeric NaNs with medians from the training rows
train_medians = df.iloc[train_pos].median(numeric_only=True)
df = df.fillna(train_medians)
# NOTE(review): this also imputes SALES_PRICE if it has gaps; dropping rows
# with a missing target may be preferable -- confirm against the data.

# One-hot encode categoricals. drop='first' avoids a redundant column per
# feature; handle_unknown='ignore' makes unseen categories encode as all
# zeros at prediction time. Output stays sparse because the inference code
# calls .toarray() on the transform result.
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=True)
encoder.fit(df.iloc[train_pos][categorical_cols])

# Standardize numerical data using training-set mean / std only
scaler = StandardScaler()
scaler.fit(df.iloc[train_pos][numerical_cols])

# Transform every row with the train-fitted transformers
scaled_features = scaler.transform(df[numerical_cols])
encoded_features = encoder.transform(df[categorical_cols])

# Convert sparse matrix to dense to combine with scaled features
encoded_features_dense = encoded_features.toarray()

# Combine features (scaled numerical and encoded categorical)
X = np.hstack((scaled_features, encoded_features_dense))
y = df["SALES_PRICE"]

# Materialize the train/test split chosen above
X_train, X_test = X[train_pos], X[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Train Random Forest model (seeded for reproducibility)
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Make predictions on test set
y_pred = model.predict(X_test)

# Calculate and print the model performance (Mean Squared Error)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# For a regressor, .score() returns the coefficient of determination (R^2),
# not a classification accuracy; the variable name and label are kept as-is.
accuracy = model.score(X_test, y_test)
print(f"Accuracy (R^2 Score): {accuracy}")

# Save the model, encoder, and scaler to .pkl files
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("encoder.pkl", "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model, encoder, and scaler have been saved as .pkl files.")


Mean Squared Error: 6233342740039.03
Accuracy (R^2 Score): 0.5255270255528559
Model, encoder, and scaler have been saved as .pkl files.


In [None]:
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Reload the persisted model / encoder / scaler and price a single example
# listing, applying the same preprocessing steps used at training time.
import pandas as pd


def _load_artifact(path):
    """Return the object unpickled from the file at ``path``."""
    # pickle.load can execute arbitrary code: only load files you produced.
    with open(path, "rb") as handle:
        return pickle.load(handle)


model = _load_artifact("model.pkl")
encoder = _load_artifact("encoder.pkl")
scaler = _load_artifact("scaler.pkl")

# One example listing (swap in real values for actual use).
# Each key maps to a one-element list so the dict becomes a one-row frame.
test_data = {
    "AREA": ["Karapakkam"],        # categorical
    "INT_SQFT": [1004],            # numerical
    "DIST_MAINROAD": [131],        # numerical
    "N_BEDROOM": [1],              # numerical
    "N_BATHROOM": [1],             # numerical
    "N_ROOM": [3],                 # numerical
    "SALE_COND": ["AbNormal"],     # categorical
    "PARK_FACIL": ["Yes"],         # categorical
}
test_df = pd.DataFrame(test_data)

# Column groups passed to the encoder and the scaler respectively
categorical_inputs = ["AREA", "SALE_COND", "PARK_FACIL"]
numerical_inputs = ["INT_SQFT", "DIST_MAINROAD", "N_BEDROOM", "N_BATHROOM", "N_ROOM"]

# One-hot encode the categorical columns; densify so they can be stacked
encoded_sparse = encoder.transform(test_df[categorical_inputs])
encoded_test_features = encoded_sparse.toarray()

# Scale the numerical columns with the loaded scaler
scaled_test_features = scaler.transform(test_df[numerical_inputs])

# Feature layout: scaled numerical columns first, then the encoded ones
X_test_new = np.hstack((scaled_test_features, encoded_test_features))

# Predict and report the price for the single example row
predicted_price = model.predict(X_test_new)
print(f"Predicted House Price: {predicted_price[0]}")


Predicted House Price: 7562173.125
