# import dependencies

In [112]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# import data

In [113]:
data = pd.read_csv('./dataset/House Price Prediction Dataset.csv')
data.head()

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056


# cleaning data

In [114]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         2000 non-null   int64 
 1   Area       2000 non-null   int64 
 2   Bedrooms   2000 non-null   int64 
 3   Bathrooms  2000 non-null   int64 
 4   Floors     2000 non-null   int64 
 5   YearBuilt  2000 non-null   int64 
 6   Location   2000 non-null   object
 7   Condition  2000 non-null   object
 8   Garage     2000 non-null   object
 9   Price      2000 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 156.4+ KB


In [115]:
data.duplicated().sum()

np.int64(0)

In [116]:
data_cleaned = data.drop(columns=['Id'])
data_cleaned.head()

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3592,2,2,3,1938,Downtown,Good,No,266746
3,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,4926,1,4,2,1975,Downtown,Fair,Yes,636056


# split data

In [117]:
X = data_cleaned.drop(columns=['Price'])
y = data['Price']

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('jumlah train data ', len(X_train))
print('jumlah test data ', len(X_test))

jumlah train data  1600
jumlah test data  400


# preprocessing

## scaling

In [119]:
X_train.columns

Index(['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Location',
       'Condition', 'Garage'],
      dtype='object')

In [120]:
# TRAIN
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train[['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt']])
X_train_scaled = pd.DataFrame(X_train_scaled, columns=['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt'])
X_train_scaled.head()

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt
0,0.885282,0.75,1.0,1.0,0.268293
1,0.124722,0.5,0.666667,0.0,0.569106
2,0.204758,0.5,1.0,0.0,0.756098
3,0.479546,0.25,0.666667,0.0,0.585366
4,0.619164,0.25,1.0,0.0,0.658537


In [121]:
# TEST
X_test_scaled = scaler.fit_transform(X_test[['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt']])
X_test_scaled = pd.DataFrame(X_test_scaled, columns=['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt'])
X_test_scaled.head()

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt
0,0.029379,0.0,1.0,0.5,0.00813
1,0.746717,0.0,0.0,0.5,0.00813
2,0.244158,1.0,1.0,0.0,0.918699
3,0.267972,0.5,0.0,0.5,0.780488
4,0.459159,1.0,0.333333,1.0,0.764228


## encoding

In [122]:
# TRAIN
ordinal_encoder = OrdinalEncoder()
X_train_encoded = ordinal_encoder.fit_transform(X_train[['Condition']])
X_train_encoded = pd.DataFrame(X_train_encoded, columns=['Condition'])

In [123]:
label_encoder = LabelEncoder()
X_train_label_encoded = X_train.copy()
for col in ['Garage', 'Location']:
    X_train_label_encoded[col] = label_encoder.fit_transform(X_train[col])

X_train_label_encoded = X_train_label_encoded[['Garage', 'Location']]

In [124]:
# Test
X_test_encoded = ordinal_encoder.fit_transform(X_test[['Condition']])
X_test_encoded = pd.DataFrame(X_test_encoded, columns=['Condition'])

X_test_label_encoded = X_test.copy()
for col in ['Garage', 'Location']:
    X_test_label_encoded[col] = label_encoder.fit_transform(X_test[col])

X_test_label_encoded = X_test_label_encoded[['Garage', 'Location']]

# gabung data

In [125]:
X_train_label_encoded = X_train_label_encoded.reset_index()
X_train_label_encoded.head()

Unnamed: 0,index,Garage,Location
0,968,0,3
1,240,0,0
2,819,1,3
3,692,1,1
4,420,1,1


In [126]:
X_train_label_encoded = X_train_label_encoded[['Garage', 'Location']]

In [127]:
X_train = pd.concat([X_train_scaled, X_train_encoded, X_train_label_encoded], axis=1)
X_train

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Condition,Garage,Location
0,0.885282,0.75,1.000000,1.0,0.268293,0.0,0,3
1,0.124722,0.50,0.666667,0.0,0.569106,2.0,0,0
2,0.204758,0.50,1.000000,0.0,0.756098,2.0,1,3
3,0.479546,0.25,0.666667,0.0,0.585366,3.0,1,1
4,0.619164,0.25,1.000000,0.0,0.658537,0.0,1,1
...,...,...,...,...,...,...,...,...
1595,0.662961,1.00,0.666667,1.0,0.252033,2.0,1,0
1596,0.277012,0.50,0.666667,0.5,0.333333,1.0,1,0
1597,0.218542,0.25,0.666667,1.0,0.593496,2.0,0,0
1598,0.656959,1.00,1.000000,0.0,0.447154,2.0,1,0


In [128]:
X_test_label_encoded = X_test_label_encoded.reset_index()

X_test_label_encoded = X_test_label_encoded[['Garage', 'Location']]

In [129]:
X_test = pd.concat([X_test_scaled, X_test_encoded, X_test_label_encoded], axis=1)
X_test

Unnamed: 0,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Condition,Garage,Location
0,0.029379,0.00,1.000000,0.5,0.008130,1.0,0,3
1,0.746717,0.00,0.000000,0.5,0.008130,1.0,0,3
2,0.244158,1.00,1.000000,0.0,0.918699,0.0,0,3
3,0.267972,0.50,0.000000,0.5,0.780488,2.0,1,0
4,0.459159,1.00,0.333333,1.0,0.764228,2.0,1,0
...,...,...,...,...,...,...,...,...
395,0.291565,0.00,0.666667,1.0,0.569106,3.0,0,1
396,0.065213,0.50,0.333333,0.5,0.918699,3.0,1,0
397,0.676385,0.75,1.000000,0.5,0.317073,3.0,1,0
398,0.705319,1.00,0.333333,0.5,0.081301,2.0,1,0


# model development

In [130]:
X_train.columns

Index(['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Condition',
       'Garage', 'Location'],
      dtype='object')

In [131]:
# X_train = X_train[['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Condition']]
# X_test = X_test[['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt', 'Condition']]

In [132]:
model_svr = SVR(kernel='rbf')
model_svr.fit(X_train, y_train)
y_pred_svr = model_svr.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)

print(f'MSE SVR : {mse_svr}')

MSE SVR : 77902418480.88211


In [133]:
# 77902418480.88211


# evaluasi