#Importing the Libraries

In [109]:
!pip install catboost
!pip install optuna
!pip install xgboost



In [110]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import scipy.stats as stats
from IPython.display import display, HTML

#Data Exploration

In [111]:
#Uploading the Train Data
url = 'https://raw.githubusercontent.com/VictorHNascimento/Housing-Prices-Competition/refs/heads/main/train.csv'
!wget {url} -O houseprices_train.csv

--2025-02-24 22:38:51--  https://raw.githubusercontent.com/VictorHNascimento/Housing-Prices-Competition/refs/heads/main/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460676 (450K) [text/plain]
Saving to: ‘houseprices_train.csv’


2025-02-24 22:38:52 (7.92 MB/s) - ‘houseprices_train.csv’ saved [460676/460676]



In [112]:
df = pd.read_csv('houseprices_train.csv')

In [113]:
# Function to create scrollable table within a small window
def create_table(df, table_id, title):
    html = f'<h3>{title}</h3>'
    html += f'<div id="{table_id}" style="height:200px; overflow:auto;">'
    html += df.to_html()
    html += '</div>'
    return html

In [114]:
num_features = df.select_dtypes(include=[np.number])
num_features.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [115]:
num_sumary_stats = num_features.describe().T
num_sumary_stats
html_numerical = create_table(num_sumary_stats, 'num_features', 'Summary statistics for numerical features')
display(HTML(html_numerical))

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


In [116]:
cat_features = df.select_dtypes(include=['object'])
cat_summary_stats = cat_features.describe().T
html_categorical = create_table(cat_summary_stats, 'categorical_features', 'Summary statistics for categorical features')
display(HTML(html_categorical ))

Unnamed: 0,count,unique,top,freq
MSZoning,1460,5,RL,1151
Street,1460,2,Pave,1454
Alley,91,2,Grvl,50
LotShape,1460,4,Reg,925
LandContour,1460,4,Lvl,1311
Utilities,1460,2,AllPub,1459
LotConfig,1460,5,Inside,1052
LandSlope,1460,3,Gtl,1382
Neighborhood,1460,25,NAmes,225
Condition1,1460,9,Norm,1260


In [117]:
null_values = df.isnull().sum()
html_null_values = create_table(null_values.to_frame(), 'null_values', 'Null values in the dataset')


missing_percentage = (df.isnull().sum() / len(df)) * 100
html_missing_percentage = create_table(missing_percentage.to_frame(), 'missing_percentage', 'Percentage of missing values for each feature')

display(HTML(html_null_values + html_missing_percentage))


Unnamed: 0,0
Id,0
MSSubClass,0
MSZoning,0
LotFrontage,259
LotArea,0
Street,0
Alley,1369
LotShape,0
LandContour,0
Utilities,0

Unnamed: 0,0
Id,0.0
MSSubClass,0.0
MSZoning,0.0
LotFrontage,17.739726
LotArea,0.0
Street,0.0
Alley,93.767123
LotShape,0.0
LandContour,0.0
Utilities,0.0


In [118]:
missing_values_rows = df[df.isnull().any(axis=1)]
html_missing_values_rows = create_table(missing_values_rows.head(), 'missing_values_rows', 'Rows with missing values')

display(HTML(html_missing_values_rows))

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [119]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

##Exploring the Target Data

In [120]:
import scipy.stats as stats

# Fiting a normal distribution to the SalePrice data
mu, sigma = stats.norm.fit(df['SalePrice'])

# Creating a histogram of the SalePrice column
hist_data = go.Histogram(x=df['SalePrice'], nbinsx=50, name="Histogram", opacity=0.75, histnorm='probability density', marker=dict(color='#669BBC'))

# Calculating the normal distribution based on the fitted parameters
x_norm = np.linspace(df['SalePrice'].min(), df['SalePrice'].max(), 100)
y_norm = stats.norm.pdf(x_norm, mu, sigma)

# Creating the normal distribution overlay
norm_data = go.Scatter(x=x_norm, y=y_norm, mode="lines", name=f"Normal dist. (μ={mu:.2f}, σ={sigma:.2f})", line=dict(color="#C1121F"))

# Combining the histogram and the overlay
fig = go.Figure(data=[hist_data, norm_data])

# Seting the layout for the plot
fig.update_layout(
    title="SalePrice Distribution",
    xaxis_title="SalePrice",
    yaxis_title="Density",
    legend_title_text="Fitted Normal Distribution",
    plot_bgcolor='#003049',
    paper_bgcolor='#003049',
    font=dict(color='white'),
    autosize=False,
    width=1200,
    height=600
)

fig.show()

In [121]:
# Creating a Q-Q plot
qq_data = stats.probplot(df['SalePrice'], dist="norm")
qq_fig = px.scatter(x=qq_data[0][0], y=qq_data[0][1], labels={'x': 'Theoretical Quantiles', 'y': 'Ordered Values'}, color_discrete_sequence=["#CAD2C5"])
qq_fig.update_layout(
    title="Q-Q plot",
    plot_bgcolor='#2F3E46',
    paper_bgcolor='#2F3E46',
    font=dict(color='white'),
    autosize=False,
    width=1200,
    height=600
)

# Calculating the line of best fit
slope, intercept, r_value, p_value, std_err = stats.linregress(qq_data[0][0], qq_data[0][1])
line_x = np.array(qq_data[0][0])
line_y = intercept + slope * line_x

# Adding the line of best fit to the Q-Q plot
line_data = go.Scatter(x=line_x, y=line_y, mode="lines", name="Normal Line", line=dict(color="#52796F"))

# Updating the Q-Q plot with the normal line
qq_fig.add_trace(line_data)

# Showing the plot
qq_fig.show()

#Creating the Model

##Creating a Data Pipeline

In [122]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#Creating a transformer to numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
]) #Scale and deal with NaN values

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
]) #Deal with NaN values and encode to numbers

In [123]:
# Updating columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Removing target variable from numerical columns
numerical_columns = numerical_columns.drop('SalePrice')

# Combining transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],remainder = 'passthrough')

# Creating a pipeline with the preprocessor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)])

# Applying the pipeline to your dataset
x = df.drop('SalePrice', axis=1)
y = np.log(df['SalePrice']) #Normalize dependent variable
x_preprocessed = pipeline.fit_transform(x)

##Finding the Best Model

In [124]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_preprocessed, y, test_size=0.2, random_state=42)

In [125]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

import numpy as np
import pandas as pd

# Regression Models
models = {
    'Linear Regression': LinearRegression(),
    'Polynomial Regression': LinearRegression(),
    'Support Vector Regression': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'CatBoost': CatBoostRegressor(verbose=0),
    'XGBoost': XGBRegressor()
}

# Store the results
results = []

# Training and Evaluating Each Model
for name, model in models.items():

    if name == 'Polynomial Regression':
        poly = PolynomialFeatures(degree=2)
        x_train_poly = poly.fit_transform(x_train)
        x_test_poly = poly.transform(x_test)

        model.fit(x_train_poly, y_train)
        y_pred_train = model.predict(x_train_poly)
        y_pred_test = model.predict(x_test_poly)

    else:
      model.fit(x_train, y_train)
      y_pred_train = model.predict(x_train)
      y_pred_test = model.predict(x_test)

    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    mae = mean_absolute_error(y_test, y_pred_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    results.append({
        'Model': name,
        'Train R²': r2_train,
        'Test R²': r2_test,
        'MAE': mae,
        'RMSE': rmse
    })

# Make a DataFrame with Results
df_results = pd.DataFrame(results)

# Order the df by RMSE.
df_results_sorted = df_results.sort_values(by='RMSE', ascending=True)

def style_format(val):
    if val == df_results_sorted.iloc[0]['Model']:
        color = 'green'
    elif val == df_results_sorted.iloc[1]['Model']:
        color = 'yellow'
    elif val == df_results_sorted.iloc[2]['Model']:
        color = 'red'
    else:
        color = 'black'
    return f'color: {color}; text-align: left;'

styled_df = df_results_sorted.style.applymap(style_format, subset=['Model'])
styled_df


Styler.applymap has been deprecated. Use Styler.map instead.



Unnamed: 0,Model,Train R²,Test R²,MAE,RMSE
5,CatBoost,0.993124,0.912898,0.084378,0.127493
0,Linear Regression,0.944921,0.906461,0.090358,0.13212
4,Random Forest,0.980489,0.883787,0.098057,0.147265
6,XGBoost,0.999837,0.879152,0.102042,0.150173
2,Support Vector Regression,0.960569,0.876527,0.100444,0.151795
1,Polynomial Regression,1.0,0.858373,0.110012,0.162572
3,Decision Tree,1.0,0.791682,0.142969,0.197167


In [126]:
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# RMSE function
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Objective function for Optuna
def objective(trial):
    # Hyperparameter search space
    iterations = trial.suggest_int('iterations', 500, 1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5)
    depth = trial.suggest_int('depth', 4, 8)
    l2_leaf_reg = trial.suggest_int('l2_leaf_reg', 1, 5)
    bagging_temperature = trial.suggest_float('bagging_temperature', 0, 2)

    # Create and train the model with the suggested parameters
    model = CatBoostRegressor(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        bagging_temperature=bagging_temperature,
        verbose=0,
        random_state=42
    )

    model.fit(x_train, y_train)

    # Predictions and RMSE calculation
    y_pred = model.predict(x_test)
    return rmse(y_test, y_pred)

# Create the study object for optimization
study = optuna.create_study(direction='minimize')

# Run the optimization
study.optimize(objective, n_trials=20)

# Get the best model and its parameters
best_params = study.best_params
best_model = CatBoostRegressor(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    bagging_temperature=best_params['bagging_temperature'],
    verbose=0,
    random_state=42
)

# Train the best model
best_model.fit(x_train, y_train)

# Calculate the final RMSE on the test set
y_pred = best_model.predict(x_test)
final_rmse = rmse(y_test, y_pred)

# Display the results
print(f'Best hyperparameters: {best_params}')
print(f'Best RMSE: {final_rmse:.4f}')


[I 2025-02-24 22:39:38,491] A new study created in memory with name: no-name-3e1aa14a-00f0-407b-aa6e-78b717c60fb6
[I 2025-02-24 22:39:39,882] Trial 0 finished with value: 0.14327273842859578 and parameters: {'iterations': 518, 'learning_rate': 0.3895933565319268, 'depth': 4, 'l2_leaf_reg': 1, 'bagging_temperature': 1.836305943357461}. Best is trial 0 with value: 0.14327273842859578.
[I 2025-02-24 22:39:41,868] Trial 1 finished with value: 0.12798732258516365 and parameters: {'iterations': 755, 'learning_rate': 0.18053329184053404, 'depth': 4, 'l2_leaf_reg': 2, 'bagging_temperature': 1.843349205857335}. Best is trial 1 with value: 0.12798732258516365.
[I 2025-02-24 22:39:45,317] Trial 2 finished with value: 0.142693678410438 and parameters: {'iterations': 572, 'learning_rate': 0.33041062031003576, 'depth': 6, 'l2_leaf_reg': 5, 'bagging_temperature': 1.5418569883410385}. Best is trial 1 with value: 0.12798732258516365.
[I 2025-02-24 22:40:02,030] Trial 3 finished with value: 0.1362731840

Best hyperparameters: {'iterations': 996, 'learning_rate': 0.0715822492807757, 'depth': 5, 'l2_leaf_reg': 4, 'bagging_temperature': 1.6026110582894297}
Best RMSE: 0.1209


##Training the Model

In [127]:
regressor = CatBoostRegressor(**best_params, verbose = 0, random_state = 42)
regressor.fit(x_train, y_train)

<catboost.core.CatBoostRegressor at 0x7f243be2d810>

##Evaluating the Training Results

In [128]:
y_pred = regressor.predict(x_test)

In [129]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_pred = regressor.predict(x_test)

# Calculating the Metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

r2 = r2_score(y_test, y_pred)
print(f'R²: {r2}')

RMSE: 0.1208589170143248
MAE: 0.08164411844552487
R²: 0.9217265825165363


In [130]:
import plotly.graph_objects as go

# Creating the Scatter
scatter = go.Scatter(
    x=y_test,
    y=y_pred,
    mode='markers',
    marker=dict(color='yellow', opacity=0.8),
    name="Predições"
)

# Line that represent the ideal values (y = x)
line = go.Scatter(
    x=[min(y_test), max(y_test)],
    y=[min(y_test), max(y_test)],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name="y = x"
)

# Layout
layout = go.Layout(
    title="Real Data x Predictions",
    xaxis=dict(title="Real Data"),
    yaxis=dict(title="Predictions"),
    plot_bgcolor='#2F4858',
    paper_bgcolor='#2F4858',
    font=dict(color='white')
)

# Criando a figura
fig = go.Figure(data=[scatter, line], layout=layout)
fig.show()

#Training With All Data

In [131]:
regressor = CatBoostRegressor(**best_params)
regressor.fit(x_preprocessed, y)

0:	learn: 0.3822111	total: 3.92ms	remaining: 3.9s
1:	learn: 0.3649583	total: 10.2ms	remaining: 5.08s
2:	learn: 0.3496217	total: 14ms	remaining: 4.63s
3:	learn: 0.3345548	total: 19.2ms	remaining: 4.77s
4:	learn: 0.3210387	total: 24.7ms	remaining: 4.9s
5:	learn: 0.3081706	total: 29.8ms	remaining: 4.92s
6:	learn: 0.2967953	total: 35.4ms	remaining: 5s
7:	learn: 0.2857200	total: 41.2ms	remaining: 5.09s
8:	learn: 0.2747549	total: 45.8ms	remaining: 5.03s
9:	learn: 0.2655442	total: 50.6ms	remaining: 4.99s
10:	learn: 0.2568257	total: 54.4ms	remaining: 4.87s
11:	learn: 0.2493096	total: 58.1ms	remaining: 4.77s
12:	learn: 0.2418001	total: 61.9ms	remaining: 4.68s
13:	learn: 0.2342635	total: 66.7ms	remaining: 4.68s
14:	learn: 0.2280598	total: 70.3ms	remaining: 4.6s
15:	learn: 0.2217762	total: 73.8ms	remaining: 4.52s
16:	learn: 0.2150817	total: 79.9ms	remaining: 4.6s
17:	learn: 0.2097627	total: 83.7ms	remaining: 4.55s
18:	learn: 0.2040868	total: 87.4ms	remaining: 4.49s
19:	learn: 0.1988523	total: 91.

<catboost.core.CatBoostRegressor at 0x7f243ac7f410>

In [132]:
y_all = regressor.predict(x_preprocessed)

#Calculating the Metrics
rmse = np.sqrt(mean_squared_error(y, y_all))
print(f'RMSE: {rmse}')

mae = mean_absolute_error(y, y_all)
print(f'MAE: {mae}')

r2 = r2_score(y, y_all)
print(f'R²: {r2}')

RMSE: 0.03303814055391015
MAE: 0.025370436043710285
R²: 0.9931545840352747


In [133]:
import plotly.graph_objects as go

# Creating the Scatter
scatter = go.Scatter(
    x=y,
    y=y_all,
    mode='markers',
    marker=dict(color='yellow', opacity=0.8),
    name="Predições"
)

# Line that represent the ideal values (y = x)
line = go.Scatter(
    x=[min(y_all), max(y_all)],
    y=[min(y_all), max(y_all)],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name="y = x"
)

# Layout
layout = go.Layout(
    title="Real Data x Predictions",
    xaxis=dict(title="Real Datas"),
    yaxis=dict(title="Previsões"),
    plot_bgcolor='#2F4858',
    paper_bgcolor='#2F4858',
    font=dict(color='white')
)

# Criando a figura
fig = go.Figure(data=[scatter, line], layout=layout)
fig.show()

#Submission

In [134]:
url = 'https://raw.githubusercontent.com/VictorHNascimento/Housing-Prices-Competition/refs/heads/main/test.csv'
!wget {url} -O houseprices_test.csv

--2025-02-24 22:41:36--  https://raw.githubusercontent.com/VictorHNascimento/Housing-Prices-Competition/refs/heads/main/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 451405 (441K) [text/plain]
Saving to: ‘houseprices_test.csv’


2025-02-24 22:41:36 (9.48 MB/s) - ‘houseprices_test.csv’ saved [451405/451405]



In [135]:
df_test = pd.read_csv('houseprices_test.csv')
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [136]:
df_sub = df_test[['Id']].copy()
df_test_preprocessed = pipeline.transform(df_test)
x_sub = df_test_preprocessed

In [137]:
y_sub = regressor.predict(x_sub)

In [138]:
df_sub['SalePrice'] = np.exp(y_sub)
df_sub.to_csv('submission.csv', index=False)