In [None]:
#Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import pickle

In [None]:
# Read the raw listings table from the CSV file that sits next to the notebook.
DATASET_FILE = "Delhi_v2.csv"
data = pd.read_csv(DATASET_FILE)

In [None]:
# Discard the columns that are not carried forward into the modelling frame.
unused_columns = [
    "Unnamed: 0",
    "latitude",
    "longitude",
    "neworold",
    "Landmarks",
    "type_of_building",
    "desc",
    "Price_sqft",
    "Lift",
    "parking",
]
cleaned_data = data.drop(columns=unused_columns)

In [None]:
# Impute missing values.
#   Balcony -> falls back to the row's own Bedrooms value
#   Status  -> defaults to "Under Construction"
cleaned_data = cleaned_data.assign(
    Balcony=lambda frame: frame["Balcony"].fillna(frame["Bedrooms"]),
    Status=lambda frame: frame["Status"].fillna("Under Construction"),
)

# Missing Furnished_status is defaulted per row: "Furnished" when the listing's
# price exceeds 6,500,000 and "Unfurnished" otherwise (a missing price compares
# False, so those rows also receive "Unfurnished").
# NOTE(review): this default is derived from `price`, which is later used as the
# regression target - confirm that leaking the target into a feature is intended.
price_mask = cleaned_data["price"] > 6500000.0
furnished_default = price_mask.map({True: "Furnished", False: "Unfurnished"})
cleaned_data["Furnished_status"] = cleaned_data["Furnished_status"].fillna(furnished_default)

In [None]:
# Normalize 'Address' column
# 1. Trim surrounding whitespace. The vectorised `.str` accessor leaves missing
#    values as NaN instead of raising AttributeError the way `x.strip()` does
#    when `x` is a float NaN.
# 2. Collapse every address seen at most RARE_ADDRESS_MAX_COUNT times into the
#    single bucket "Other" so the one-hot encoding stays small.
# 3. Missing addresses are placed in the same "Other" bucket so that no NaN
#    category reaches the encoder.
RARE_ADDRESS_MAX_COUNT = 10

stripped_address = cleaned_data["Address"].str.strip()
address_counts = stripped_address.value_counts()  # NaN is excluded from the counts
infrequent_addresses = address_counts[address_counts <= RARE_ADDRESS_MAX_COUNT].index

rare_or_missing = stripped_address.isin(infrequent_addresses) | stripped_address.isna()
cleaned_data["Address"] = stripped_address.mask(rare_or_missing, "Other")

In [None]:
# Separate the regression target from the predictor columns.
target_column = "price"
y = cleaned_data[target_column]
X = cleaned_data.drop(target_column, axis="columns")

# Hold out a test portion (test_size=0.2); the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Preprocess the categorical features
# One-hot encode the three text columns; every other column is passed through
# unchanged. `handle_unknown="ignore"` makes a category that was not present
# when the encoder was fitted encode as an all-zero group instead of raising
# ValueError at transform/predict time (the encoder's default is "error").
categorical_columns = ["Address", "Furnished_status", "Status"]
column_transformer = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_columns),
    remainder="passthrough"
)

In [None]:
# Assemble the modelling pipeline: column encoding, then feature
# standardisation, then a Ridge regressor with its default settings.
ridge_regressor = Ridge()
scaler = StandardScaler()
pipeline_steps = (column_transformer, scaler, ridge_regressor)
ridge_pipeline = make_pipeline(*pipeline_steps)

In [None]:
# Fit the pipeline on the training split, then score the held-out rows.
# `fit` returns the fitted pipeline itself, so the two steps can be chained.
y_pred_ridge = ridge_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Evaluate the model: coefficient of determination on the held-out test split.
r2 = r2_score(y_test, y_pred_ridge)
print(f"R^2 Score: {r2}")


# Example of predicting a new data point
new_data = pd.DataFrame(
    [["Noida Extension, Noida, Delhi NCR", 1350, 3, "Unfurnished", "Ready to Move", 2, 1]],
    columns=["Address", "area", "Bedrooms", "Furnished_status", "Status", "Bathrooms", "Balcony"],
)

# Apply the same Address normalisation the training data went through: trim
# whitespace and map any address the model was not trained on to "Other".
# Without this, a rare or unseen address reaches the one-hot encoder as an
# unknown category.
known_addresses = set(X_train["Address"])
new_address = new_data["Address"].str.strip()
new_data["Address"] = new_address.where(new_address.isin(known_addresses), "Other")

predicted_price = ridge_pipeline.predict(new_data)[0]
print(f"Predicted Price: {predicted_price}")