# Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.decomposition import PCA

# Data loading

In [None]:
# Download the dataset archive from Google Drive by file ID (the cell output
# shows it is saved as estate_data.zip in the working directory).
! gdown 1I4UJIytuZtFO_0o-YKz7kqwAro97KLM_

Downloading...
From: https://drive.google.com/uc?id=1I4UJIytuZtFO_0o-YKz7kqwAro97KLM_
To: /content/estate_data.zip
  0% 0.00/1.74M [00:00<?, ?B/s]100% 1.74M/1.74M [00:00<00:00, 115MB/s]


In [None]:
! unzip estate_data.zip

Archive:  estate_data.zip
  inflating: Home Sale Data.csv      


# Data preprocessing

In [None]:
# Read the extracted listing file; its fields are separated by semicolons.
csv_path = 'Home Sale Data.csv'
data = pd.read_csv(csv_path, sep=';')

In [None]:
def _count_rooms(label):
    """Sum the purely numeric '+'-separated parts of a room label.

    For example '3+1' gives 4.0. Parts that are not all digits are ignored,
    so a label with no numeric part gives 0.
    """
    return sum(float(part) for part in label.split('+') if part.isdigit())


# Drop incomplete and duplicated rows. The explicit .copy() makes `data` an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
data = data.dropna().drop_duplicates().copy()

# --- Number of rooms -------------------------------------------------------
# Flag studio listings before the column is converted to a number.
data['Studio'] = data['Number of rooms'].str.contains('Studio').astype(int)
# The open-ended top bucket is collapsed to its lower bound.
data.loc[data['Number of rooms'] == '10 and more than', 'Number of rooms'] = '10'
data['Number of rooms'] = data['Number of rooms'].apply(_count_rooms)

# --- Floor location --------------------------------------------------------
# Every non-numeric floor label gets its own 0/1 indicator column, and the
# label itself is replaced by 0 so the column can be cast to int afterwards.
for floor in data['Floor location'].unique():
    if not floor.isdigit():
        is_this_floor = data['Floor location'] == floor
        data[floor] = is_this_floor.astype(np.int64)
        data.loc[is_this_floor, 'Floor location'] = 0
data['Floor location'] = data['Floor location'].astype(int)

# --- Number of floors ------------------------------------------------------
# Collapse the open-ended top bucket to its lower bound.
data.loc[data['Number of floors'] == '30  and more than', 'Number of floors'] = '30'
data['Number of floors'] = data['Number of floors'].astype(int)

# --- Number of bathrooms ---------------------------------------------------
# Collapse the top bucket to its lower bound and encode 'Absent' as zero.
data.loc[data['Number of bathrooms'] == '6 and more than', 'Number of bathrooms'] = '6'
data.loc[data['Number of bathrooms'] == 'Absent', 'Number of bathrooms'] = 0
data['Number of bathrooms'] = data['Number of bathrooms'].astype(int)

# These two columns are not used as features.
data = data.drop(['Adrtisement Date', 'Pick Up Data Time'], axis=1)

# --- Price (target) --------------------------------------------------------
# Strip the trailing 3 characters and remove every '.' before the integer
# cast (presumably a currency suffix and thousands separators - confirm
# against the CSV). regex=False makes '.' a literal dot on every pandas
# version instead of relying on the version-dependent default.
price_digits = data['Price'].str[:-3].str.replace('.', '', regex=False)
# Model the natural log of the price.
data['Price'] = np.log(price_digits.astype(int))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Studio'] = data['Number of rooms'].str.contains('Studio').astype(int)
  data['Price'] = data['Price'].str.replace('.', '')


In [None]:
# One-hot encode every column that is still of object dtype and put the
# encoded columns in place of the originals.
categorical_columns = [
    name for name, dtype in data.dtypes.items() if dtype == 'object'
]

# drop='if_binary' keeps a single indicator column for two-category features.
encoder = OneHotEncoder(sparse_output=False, drop='if_binary')
encoded_data = encoder.fit_transform(data[categorical_columns])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
)

# encoded_df carries a fresh 0..n-1 index, so the remaining columns are
# re-indexed the same way before the two frames are joined side by side.
remaining_columns = data.drop(columns=categorical_columns).reset_index(drop=True)
data = pd.concat([remaining_columns, encoded_df], axis=1)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
feature_frame = data.drop(columns='Price')
price_target = data['Price']
train_features, test_features, train_labels, test_labels = train_test_split(
    feature_frame,
    price_target,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Standardise the target. The scaler is fitted on the training labels only
# and then reused for the test labels; both are passed as single-column
# 2-D arrays.
scaler = StandardScaler()
train_label_column = train_labels.to_numpy().reshape(-1, 1)
test_label_column = test_labels.to_numpy().reshape(-1, 1)
train_labels = scaler.fit_transform(train_label_column)
test_labels = scaler.transform(test_label_column)

In [None]:
# Project the features with a PCA built from default arguments. It is fitted
# on the training features only, and the same fitted projection is then
# applied to the test features.
pca = PCA(n_components=None)
decomposed_train_features = pca.fit_transform(X=train_features)
decomposed_test_features = pca.transform(X=test_features)

# Data modeling

Model training

In [None]:
# Gradient-boosted regressor with a fixed seed and silenced training logs.
model = lgb.LGBMRegressor(random_state=17, verbosity=-1)
# train_labels is an (n, 1) column vector after scaling; flatten it to the
# 1-D target the estimator expects so fit() does not emit a
# DataConversionWarning.
model.fit(decomposed_train_features, np.ravel(train_labels))

  y = column_or_1d(y, warn=True)


Model evaluation

In [None]:
# Mean squared error on the held-out set. Both arguments are on the
# standardised log-price scale, because the labels were log-transformed and
# then scaled earlier in the notebook.
test_predictions = model.predict(decomposed_test_features)
mean_squared_error(y_true=test_labels, y_pred=test_predictions)

0.07963496912466585