In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats

# Load data/train.csv into the raw DataFrame that the following cells read from
train_csv_path = "data/train.csv"
df = pd.read_csv(train_csv_path)

In [2]:
# Per-column NaN counts, restricted to columns that actually have missing
# values and ordered from most to fewest missing entries.
missing_values = (
    df.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

print(missing_values)

# Same information as a two-column table (column name, number of missing values)
missing_df = pd.DataFrame(
    {
        'Column': missing_values.index,
        'Missing Values': missing_values.to_numpy(),
    }
)
print(missing_df)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64
          Column  Missing Values
0         PoolQC            1453
1    MiscFeature            1406
2          Alley            1369
3          Fence            1179
4    FireplaceQu             690
5    LotFrontage             259
6     GarageType              81
7    GarageYrBlt              81
8   GarageFinish              81
9     GarageQual              81
10    GarageCond              81
11  BsmtExposure              38
12  BsmtFinType2              38
13  BsmtFinType1              37
14      BsmtCond              37
15      BsmtQual              37
16    MasVnrArea               8
17    MasVnrType               8
18    Electrical               1

In [3]:
# Columns with the largest number of missing values; they are removed from a
# working copy so the raw frame `df` stays untouched.
drop_cols = [
    "PoolQC",
    "MiscFeature",
    "Alley",
    "Fence",
    "FireplaceQu",
    "LotFrontage",
]
df_copy = df.copy().drop(columns=drop_cols)

In [4]:
# Feature selection: keep the columns whose absolute correlation with
# SalePrice exceeds 0.5.
#
# Two details matter here:
#   * Only numeric columns are correlated. The frame also holds string
#     columns, and DataFrame.corr() raises on those in pandas >= 2.0, where
#     numeric_only defaults to False.
#   * The correlation is computed on df_copy (the frame with high-missing
#     columns already dropped), because the next cell indexes df_copy with
#     selected_features; selecting from the raw df could name a dropped column.
numeric_df = df_copy.select_dtypes(include="number")
correlation_matrix = numeric_df.corr()["SalePrice"].abs().sort_values(ascending=False)
selected_features = correlation_matrix[correlation_matrix > 0.5].index.tolist()

# Show the selected features (SalePrice itself is included; it is removed when X is built)
print("Fitur yang dipilih berdasarkan korelasi:", selected_features)

Fitur yang dipilih berdasarkan korelasi: ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd']


In [5]:
# Target: the house sale price
y = df_copy["SalePrice"]
# Features: every selected column except the target itself
X = df_copy.loc[:, selected_features].drop(columns="SalePrice")

In [6]:
# Outlier removal via Z-scores: a row is treated as an extreme outlier when
# any of its features lies more than 3 standard deviations from that
# feature's mean.
z_scores = np.abs(stats.zscore(X))
outliers = (z_scores > 3).any(axis=1)

# Keep only the non-outlier rows, applying the same mask to features and target
inlier_mask = ~outliers
X_filtered = X.loc[inlier_mask]
y_filtered = y.loc[inlier_mask]

In [7]:
# Log-transform the (outlier-filtered) sale prices; the intent is to make the
# target distribution closer to normal. Predictions must be passed through
# np.exp to get back to the original price scale.
y_log = np.log(y_filtered)  

In [8]:
# Standardize the features with StandardScaler so that they share a common
# scale. NOTE(review): the scaler is fitted on all filtered rows, i.e. before
# the train/test split performed in the next cell.
scaler = StandardScaler().fit(X_filtered)
X_scaled = scaler.transform(X_filtered)

In [9]:
# Split data into training and testing sets (70% training, 30% testing);
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_log, test_size=0.3, random_state=42)

# Train the Linear Regression model on the log-transformed target
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions are on the log scale because the model was fitted on y_log
y_pred_log = model.predict(X_test)

# Undo the log transform once so MAE and RMSE are on the original SalePrice scale
y_test_price = np.exp(y_test)
y_pred_price = np.exp(y_pred_log)

# Evaluate the model with MAE, RMSE and R².
# MAE/RMSE use the back-transformed prices; R² is computed on the log scale,
# i.e. the scale the model was actually fitted on.
mae = mean_absolute_error(y_test_price, y_pred_price)
rmse = np.sqrt(mean_squared_error(y_test_price, y_pred_price))
r2 = r2_score(y_test, y_pred_log)

# Baseline for the relative errors: mean SalePrice of the full raw dataset
# (df, i.e. before column dropping and outlier filtering).
y_mean = df['SalePrice'].mean()

mae_percentage = (mae / y_mean) * 100
rmse_percentage = (rmse / y_mean) * 100
r2_percentage = r2 * 100

# Print the evaluation metrics
print(f"Mean Absolute Error (MAE): {mae:.2f} ({mae_percentage:.2f}%)")
# Fixed: the original format string had a stray '%' before the value, printing "(%12.39%)"
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} ({rmse_percentage:.2f}%)")
print(f"R-squared (R²): {r2:.4f} ({r2_percentage:.2f}%)")

Mean Absolute Error (MAE): 16709.86 (9.24%)
Root Mean Squared Error (RMSE): 22408.41 (%12.39%)
R-squared (R²): 0.8502 (85.02%)
