In [149]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, root_mean_squared_error

In [150]:
kc_data = pd.read_csv("/Users/selinwork/Documents/Ironhack/Ironhack_Week_5/Project_IronKaggle/king_ country_ houses_aa.csv")

In [151]:
kc_data.columns = kc_data.columns.str.lower().str.replace(' ', '_')

In [152]:
df = kc_data.copy()

In [153]:
df["price"] = df["price"].astype(int)
df["bathrooms"] = df["bathrooms"].astype(int)
df["floors"] = df["floors"].astype(int)


In [154]:
# Ensure 'date' is in datetime format
df["date"] = pd.to_datetime(df["date"], errors='coerce')

# Create 'year_month' column as a numerical format
df['year_month'] = df['date'].dt.year * 100 + df['date'].dt.month

# Drop the original 'date' column
df = df.drop(columns=['date'])


In [155]:
df.rename(columns={'year_month': 'date'}, inplace=True)


In [156]:
ml_df = df.copy()

In [157]:
# I dropped the Id columns due to their irrelevance in the model
ml_df = df.drop(["id"], axis=1)

In [158]:
# I have generated a new column called price_per_sqft which is the price of the house divided by the square footage of the living area.To train better.
ml_df['price_per_sqft'] = round(ml_df['price'] / ml_df['sqft_living'], 2)


In [None]:
# I moved my target (price) to the last column
target = ml_df.pop("price")
ml_df["price"] = target
ml_df.head()

In [160]:
X = ml_df.drop(["price"], axis=1)
y = ml_df["price"]

In [161]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f'100% of our data is : {len(ml_df)}.')
print(f'70% for training data is: {len(X_train)}.')
print(f'30% for test data is: {len(X_test)}.')

100% of our data is : 21613.
70% for training data is: 15129.
30% for test data is: 6484.


In [162]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [163]:
predictions = lin_reg.predict(X_test)

In [164]:
comparison = pd.DataFrame({"Actual": y_test, "Predicted": predictions})
comparison['Predicted'] = comparison['Predicted'].astype(int)
comparison.sample(10)

Unnamed: 0,Actual,Predicted
15806,336000,265370
15750,485000,436870
18595,202000,91625
9760,260000,137254
286,467000,479141
13033,395000,497585
549,215000,100826
14070,920000,883834
8288,650000,931981
2481,236500,213823


In [165]:
comparison["Difference"] = abs(comparison["Actual"] - comparison["Predicted"])
comparison = comparison.reset_index().rename(columns={'index': 'ID'})
comparison[['Actual', 'Predicted', 'Difference']].head(10)

Unnamed: 0,Actual,Predicted,Difference
0,365000,355196,9804
1,865000,863196,1804
2,1038000,1099351,61351
3,1490000,1453624,36376
4,711000,708889,2111
5,211000,169342,41658
6,790000,787398,2602
7,680000,672449,7551
8,384500,365447,19053
9,605000,985563,380563


In [166]:
r2_3 = r2_score(y_test, predictions)
RMSE_3 = root_mean_squared_error(y_test, predictions)
MSE_3 = mean_squared_error(y_test, predictions)
MAE_3 = mean_absolute_error(y_test, predictions)

print("R2 = ", round(r2_3, 2))
print("RMSE = ", round(RMSE_3, 2))
print("MSE =  ", round(MSE_3, 2)) 
print("MAE = ", round(MAE_3, 2))

R2 =  0.89
RMSE =  128038.07
MSE =   16393748400.03
MAE =  70257.45


In [167]:
%pip install xgboost

import xgboost as xgb

Note: you may need to restart the kernel to use updated packages.


In [193]:
xgbr1 = xgb.XGBRegressor()
xgbr1.fit(X_train, y_train)

In [196]:
pred_xgb1 = xgbr1.predict(X_test)

In [197]:
r2_8 = r2_score(y_test, pred_xgb)
RMSE_8 = root_mean_squared_error(y_test, pred_xgb)
MSE_8 = mean_squared_error(y_test, pred_xgb)
MAE_8 = mean_absolute_error(y_test, pred_xgb)

print("R2 = ", round(r2_8, 4))
print("RMSE = ", round(RMSE_8, 4))
print("The value of the metric MSE is ", round(MSE_8, 4))
print("MAE = ", round(MAE_8, 4))

R2 =  0.9916
RMSE =  34808.1074
The value of the metric MSE is  1211604341.0922
MAE =  12543.4983


In [None]:
print(ml_df["yr_renovated"].unique())

In [None]:
ml_df.shape

In [None]:
ml_df_new.shape

In [None]:
columns_to_drop = ["zipcode", "long","price_per_sqft"]
ml_df_new = ml_df.drop(columns=[col for col in columns_to_drop if col in ml_df.columns], axis=1)
ml_df_new.head(2)

In [202]:
X = ml_df_new.drop(["price"], axis=1)
y = ml_df_new["price"]

In [203]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f'100% of our data is : {len(ml_df_new)}.')
print(f'70% for training data is: {len(X_train)}.')
print(f'30% for test data is: {len(X_test)}.')

100% of our data is : 21613.
70% for training data is: 15129.
30% for test data is: 6484.


In [204]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [205]:
predictions_new = lin_reg.predict(X_test)

In [206]:
comparison_new = pd.DataFrame({"Actual": y_test, "Predicted": predictions_new})
comparison_new['Predicted'] = comparison_new['Predicted'].astype(int)
comparison_new.head(10)

Unnamed: 0,Actual,Predicted
735,365000,463056
2830,865000,787188
4106,1038000,1198956
16218,1490000,1651188
19964,711000,728673
1227,211000,276224
18849,790000,792044
19369,680000,524877
20164,384500,374730
7139,605000,470648


In [207]:
r2_01 = r2_score(y_test, predictions_new)
RMSE_01 = root_mean_squared_error(y_test, predictions_new)
MSE_01= mean_squared_error(y_test, predictions_new)
MAE_01 = mean_absolute_error(y_test, predictions_new)

print("R2 = ", round(r2_01, 2))
print("RMSE = ", round(RMSE_01, 2))
print("MSE =  ", round(MSE_01, 2)) 
print("MAE = ", round(MAE_01, 2))

R2 =  0.7
RMSE =  209795.43
MSE =   44014122625.08
MAE =  127676.41


In [208]:
import xgboost as xgb

xgbr2 = xgb.XGBRegressor()
xgbr2.fit(X_train, y_train)

In [209]:
pred_xgb2 = xgbr2.predict(X_test)

In [210]:
r2_02 = r2_score(y_test, pred_xgb_new)
RMSE_02 = root_mean_squared_error(y_test, pred_xgb_new)
MSE_02 = mean_squared_error(y_test, pred_xgb_new)
MAE_02 = mean_absolute_error(y_test, pred_xgb_new)

print("R2 = ", round(r2_02, 4))
print("RMSE = ", round(RMSE_02, 4))
print("The value of the metric MSE is ", round(MSE_02, 4))
print("MAE = ", round(MAE_02, 4))

R2 =  0.8022
RMSE =  168981.2871
The value of the metric MSE is  28554675389.0895
MAE =  82770.899


In [None]:
ml_df_new1 = ml_df_new.drop(["sqft_above"], axis=1)
ml_df_new1.head(2)


In [212]:
X = ml_df_new1.drop(["price"], axis=1)
y = ml_df_new1["price"]

In [213]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [215]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [216]:
predictions_new1 = lin_reg.predict(X_test)

In [217]:
comparison_new1 = pd.DataFrame({"Actual": y_test, "Predicted": predictions_new1})
comparison_new1['Predicted'] = comparison_new1['Predicted'].astype(int)
comparison_new1.head(10)

Unnamed: 0,Actual,Predicted
735,365000,463056
2830,865000,787188
4106,1038000,1198956
16218,1490000,1651188
19964,711000,728673
1227,211000,276224
18849,790000,792044
19369,680000,524877
20164,384500,374730
7139,605000,470648


In [218]:
r2_02 = r2_score(y_test, predictions_new1)
RMSE_02 = root_mean_squared_error(y_test, predictions_new1)
MSE_02= mean_squared_error(y_test, predictions_new1)
MAE_02 = mean_absolute_error(y_test, predictions_new1)

print("R2 = ", round(r2_02, 2))
print("RMSE = ", round(RMSE_02, 2))
print("MSE =  ", round(MSE_02, 2)) 
print("MAE = ", round(MAE_02, 2))

R2 =  0.7
RMSE =  209795.43
MSE =   44014122625.08
MAE =  127676.41


In [219]:
xgbr3 = xgb.XGBRegressor()
xgbr3.fit(X_train, y_train)

In [221]:
pred_xgb3 = xgbr3.predict(X_test)

In [222]:
r2_03 = r2_score(y_test, pred_xgb3)
RMSE_03 = root_mean_squared_error(y_test, pred_xgb3)
MSE_03 = mean_squared_error(y_test, pred_xgb3)
MAE_03 = mean_absolute_error(y_test, pred_xgb3)

print("R2 = ", round(r2_03, 4))
print("RMSE = ", round(RMSE_03, 4))
print("The value of the metric MSE is ", round(MSE_03, 4))
print("MAE = ", round(MAE_03, 4))

R2 =  0.7994
RMSE =  170169.2317
The value of the metric MSE is  28957567403.3844
MAE =  82637.7204
