In [1]:
import pickle
import pathlib

import numpy as np
import pandas as pd

In [2]:
# The data directory is the `data` folder next to the current working directory.
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(DATA_DIR)

c:\Users\Pedro Antônio Silva\Desktop\Semestre 4\Machine Learning\proj_1\ames\data


In [3]:
# Location of the pickled dataset inside the `processed` sub-folder of DATA_DIR.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean.pkl')

In [4]:
# Deserialize the pickled dataset into `data`.
# NOTE(review): unpickling can execute arbitrary code, so only load files you trust.
with clean_data_path.open('rb') as pickle_file:
    data = pickle.load(pickle_file)

In [5]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2877 entries, 0 to 2929
Data columns (total 70 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   MS.SubClass      2877 non-null   category
 1   MS.Zoning        2877 non-null   category
 2   Lot.Frontage     2877 non-null   float64 
 3   Lot.Area         2877 non-null   float64 
 4   Lot.Shape        2877 non-null   category
 5   Land.Contour     2877 non-null   category
 6   Lot.Config       2877 non-null   category
 7   Land.Slope       2877 non-null   category
 8   Neighborhood     2877 non-null   category
 9   Bldg.Type        2877 non-null   category
 10  House.Style      2877 non-null   category
 11  Overall.Qual     2877 non-null   category
 12  Overall.Cond     2877 non-null   category
 13  Roof.Style       2877 non-null   category
 14  Mas.Vnr.Type     2877 non-null   category
 15  Mas.Vnr.Area     2877 non-null   float64 
 16  Exter.Qual       2877 non-null   category


In [6]:
# Work on a copy so that `data` keeps its original columns for later comparison.
model_data = data.copy()

In [7]:
# Partition the category-dtype columns by whether their categories are ordered:
# ordered ones are treated as ordinal, the rest as plain (nominal) categoricals.
category_cols = model_data.select_dtypes('category').columns
ordinal_columns = [
    name for name in category_cols if model_data[name].cat.ordered
]
categorical_columns = [
    name for name in category_cols if not model_data[name].cat.ordered
]

In [8]:
# Display the columns whose category dtype is ordered.
ordinal_columns

['Lot.Shape',
 'Land.Slope',
 'Overall.Qual',
 'Overall.Cond',
 'Exter.Qual',
 'Exter.Cond',
 'Heating.QC',
 'Electrical',
 'Kitchen.Qual',
 'Functional',
 'Paved.Drive',
 'Fence']

In [9]:
# Display the columns whose category dtype is unordered.
categorical_columns

['MS.SubClass',
 'MS.Zoning',
 'Land.Contour',
 'Lot.Config',
 'Neighborhood',
 'Bldg.Type',
 'House.Style',
 'Roof.Style',
 'Mas.Vnr.Type',
 'Foundation',
 'Bsmt.Qual',
 'Bsmt.Cond',
 'Bsmt.Exposure',
 'BsmtFin.Type.1',
 'BsmtFin.Type.2',
 'Central.Air',
 'Garage.Type',
 'Garage.Finish',
 'Sale.Type',
 'Sale.Condition',
 'Condition',
 'Exterior']

In [10]:
# Replace every ordinal column in `model_data` with integer codes.
# The codes are computed from the untouched `data` frame, so this cell can be
# re-run without changing its result.
for ordinal_col in ordinal_columns:
    model_data[ordinal_col] = pd.factorize(data[ordinal_col], sort=True)[0]

In [11]:
# Check the dtypes of the ordinal columns after encoding.
model_data[ordinal_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2877 entries, 0 to 2929
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Lot.Shape     2877 non-null   int64
 1   Land.Slope    2877 non-null   int64
 2   Overall.Qual  2877 non-null   int64
 3   Overall.Cond  2877 non-null   int64
 4   Exter.Qual    2877 non-null   int64
 5   Exter.Cond    2877 non-null   int64
 6   Heating.QC    2877 non-null   int64
 7   Electrical    2877 non-null   int64
 8   Kitchen.Qual  2877 non-null   int64
 9   Functional    2877 non-null   int64
 10  Paved.Drive   2877 non-null   int64
 11  Fence         2877 non-null   int64
dtypes: int64(12)
memory usage: 292.2 KB


In [12]:
# Level frequencies of 'Lot.Shape' in the original (un-encoded) frame.
data['Lot.Shape'].value_counts()

Reg    1825
IR1     960
IR2      76
IR3      16
Name: Lot.Shape, dtype: int64

In [13]:
# Same column after encoding: the counts should match the original levels one-to-one.
model_data['Lot.Shape'].value_counts()

0    1825
1     960
2      76
3      16
Name: Lot.Shape, dtype: int64

In [14]:
# Level frequencies of the unordered 'Exterior' column, used below as a one-hot example.
model_data['Exterior'].value_counts()

VinylSd    1024
HdBoard     439
MetalSd     432
Wd Sdng     401
Plywood     218
CemntBd     126
BrkFace      86
WdShing      55
Stucco       42
AsbShng      41
Other        13
Name: Exterior, dtype: int64

In [15]:
# One-hot encode 'Exterior' keeping every level, then append the original
# column so each row's label can be compared with its indicator columns.
original_data = model_data['Exterior']
encoded_data = aux_dataframe = pd.get_dummies(original_data)
aux_dataframe['Exterior'] = original_data.copy()

# Transposed so the indicator columns read as rows for the first five houses.
aux_dataframe.head().T

Unnamed: 0,0,1,2,3,4
AsbShng,0,0,0,0,0
BrkFace,1,0,0,1,0
CemntBd,0,0,0,0,0
HdBoard,0,0,0,0,0
MetalSd,0,0,0,0,0
Plywood,0,0,0,0,0
Stucco,0,0,0,0,0
VinylSd,0,1,0,0,1
Wd Sdng,0,0,1,0,0
WdShing,0,0,0,0,0


In [16]:
# Same demonstration with drop_first=True: the first level gets no indicator
# column and is represented by all remaining indicators being zero.
original_data = model_data['Exterior']
encoded_data = aux_dataframe = pd.get_dummies(original_data, drop_first=True)
aux_dataframe['Exterior'] = original_data.copy()

# Transposed so the indicator columns read as rows for the first five houses.
aux_dataframe.head().T

Unnamed: 0,0,1,2,3,4
BrkFace,1,0,0,1,0
CemntBd,0,0,0,0,0
HdBoard,0,0,0,0,0
MetalSd,0,0,0,0,0
Plywood,0,0,0,0,0
Stucco,0,0,0,0,0
VinylSd,0,1,0,0,1
Wd Sdng,0,0,1,0,0
WdShing,0,0,0,0,0
Other,0,0,0,0,0


In [17]:
# One-hot encode the whole frame, dropping the first level of each encoded column.
# The ordinal columns were already converted to integers above, so they are left as-is.
model_data = pd.get_dummies(model_data, drop_first=True)

Now our dataset has a lot more variables!

In [18]:
# Inspect the widened frame produced by the one-hot encoding.
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2877 entries, 0 to 2929
Columns: 166 entries, Lot.Frontage to Exterior_Other
dtypes: bool(2), float64(34), int64(12), uint8(118)
memory usage: 1.4 MB


In [19]:
# For each original categorical column, list the indicator columns derived
# from it (identified by the "<column>_" name prefix).
for cat in categorical_columns:
    prefix = cat + "_"
    dummies_str = ', '.join(
        f'"{name}"' for name in model_data.columns if name.startswith(prefix)
    )
    print(f'From column "{cat}" we made {dummies_str}\n')

From column "MS.SubClass" we made "MS.SubClass_30", "MS.SubClass_50", "MS.SubClass_60", "MS.SubClass_70", "MS.SubClass_80", "MS.SubClass_85", "MS.SubClass_90", "MS.SubClass_120", "MS.SubClass_160", "MS.SubClass_190", "MS.SubClass_Other"

From column "MS.Zoning" we made "MS.Zoning_RH", "MS.Zoning_RL", "MS.Zoning_RM"

From column "Land.Contour" we made "Land.Contour_HLS", "Land.Contour_Low", "Land.Contour_Lvl"

From column "Lot.Config" we made "Lot.Config_CulDSac", "Lot.Config_FR2", "Lot.Config_FR3", "Lot.Config_Inside"

From column "Neighborhood" we made "Neighborhood_BrDale", "Neighborhood_BrkSide", "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR", "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAmes", "Neighborhood_NPkVill", "Neighborhood_NWAmes", "Neighborhood_NoRidge", "Neighborhood_NridgHt", "Neighborhood_OldTown", "Neighborhood_SWISU", "Neighborhood_Sawyer", "Neighborhood_Sa

In [20]:
# Target is 'SalePrice'; every other column is a feature.
y = model_data['SalePrice'].copy()
X = model_data.drop(columns='SalePrice').copy()

In [21]:
# Peek at the underlying arrays for the features and the target.
X.values, y.values

(array([[141.0, 31770.0, 1, ..., 0, 0, 0],
        [80.0, 11622.0, 0, ..., 0, 0, 0],
        [81.0, 14267.0, 1, ..., 1, 0, 0],
        ...,
        [62.0, 10441.0, 0, ..., 0, 0, 0],
        [77.0, 10010.0, 0, ..., 0, 0, 0],
        [74.0, 9627.0, 0, ..., 0, 0, 0]], dtype=object),
 array([5.33243846, 5.0211893 , 5.23552845, ..., 5.12057393, 5.23044892,
        5.27415785]))

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
# Fixed seed so the train/test split below is reproducible across runs.
RANDOM_SEED = 42  # Any number here, really.

In [24]:
# Hold out 25% of the rows for testing; the seed makes the split repeatable.
split_options = {'test_size': 0.25, 'random_state': RANDOM_SEED}
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, **split_options)


In [25]:
# Sanity check: train and test row counts should add up to the full feature matrix.
X.shape, Xtrain.shape, Xtest.shape

((2877, 165), (2157, 165), (720, 165))

In [26]:
# Same check for the target vector.
y.shape, ytrain.shape, ytest.shape

((2877,), (2157,), (720,))

In [27]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
# Baseline: a Lasso regressor with default hyperparameters, no scaling or feature expansion.
model = Lasso()

# Fit on the training split only; the test split is kept for evaluation.
model.fit(Xtrain, ytrain)

In [28]:
from sklearn.metrics import mean_squared_error

# Evaluate the baseline model on the held-out split.
ypred = model.predict(Xtest)
mse = mean_squared_error(ytest, ypred)
RMSE = np.sqrt(mse)
print(f"RMSE = {RMSE}")

# Convert the RMSE into a percentage via 10**RMSE.
# NOTE(review): this assumes the target is log10-scaled — confirm against the data preparation.
error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

RMSE = 0.10571853146271401
Average error is 27.56%


Now applying a Pipeline


In [29]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
# NOTE(review): this rebinds `model` to a new, unfitted Lasso, discarding the
# fitted baseline from the earlier cell; nothing below in this notebook appears
# to read `model`, so this line looks like a leftover — confirm before removing.
model = Lasso()

def pred_Lasso(d):
    """Build an unfitted Lasso pipeline with polynomial feature expansion.

    Steps, in order: polynomial features of degree `d`, standard scaling,
    and a Lasso regressor with default hyperparameters.

    Parameters
    ----------
    d : int
        Degree passed to PolynomialFeatures. Previously this argument was
        ignored and the library default degree was always used.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    pred = Pipeline([
        # The step name is kept as "poly_2" (independent of `d`) so that any
        # existing "poly_2__..." parameter keys continue to resolve.
        ("poly_2", PolynomialFeatures(degree=d)),
        ("std_scaler", StandardScaler()),
        ("Lasso", Lasso())
        ])
    return pred

def pred_Linear_Regression(d):
    """Build an unfitted linear-regression pipeline.

    Steps, in order: polynomial features of degree `d` (step name
    "poly_<d>"), standard scaling, and ordinary LinearRegression.

    Parameters
    ----------
    d : int
        Degree passed to PolynomialFeatures; also embedded in the step name.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    steps = [
        (f"poly_{d}", PolynomialFeatures(degree=d)),
        ("std_scaler", StandardScaler()),
        ("Linear_Regression", LinearRegression()),
    ]
    return Pipeline(steps)

def pred_Ridge(d):
    """Build an unfitted Bayesian ridge pipeline.

    Steps, in order: polynomial features of degree `d`, standard scaling,
    and a BayesianRidge regressor. Note that the regressor is registered
    under the step name "Ridge" and the expansion step is always named
    "poly_2", whatever value `d` has.

    Parameters
    ----------
    d : int
        Degree passed to PolynomialFeatures.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    expand = PolynomialFeatures(degree=d)
    scale = StandardScaler()
    regressor = BayesianRidge()
    return Pipeline([
        ("poly_2", expand),
        ("std_scaler", scale),
        ("Ridge", regressor),
    ])


In [30]:
# Fit the Bayesian ridge pipeline with degree-1 features on the training split.
# (A stray bare name `w` used to sit here and raised NameError on a fresh kernel.)
Lpred = pred_Ridge(1)
Lpred.fit(Xtrain,ytrain)

# Evaluate on the held-out split.
ypred = Lpred.predict(Xtest)
RMSE = np.sqrt(mean_squared_error(ytest, ypred))
print(f"RMSE = {RMSE}")
# Convert the RMSE into a percentage via 10**RMSE.
# NOTE(review): this assumes the target is log10-scaled — confirm against the data preparation.
error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

RMSE = 0.061104713052014474
Average error is 15.11%


In [31]:
from sklearn.model_selection import GridSearchCV
# Exhaustive grid search over the four prior hyperparameters of the regressor
# registered as step "Ridge" in pred_Ridge, on top of degree-2 polynomial features.
# Cost: 5 values for each of 4 parameters = 625 candidates; with cv=5 that is
# 3125 pipeline fits plus the final refit, so expect a long run time.
Lpred = pred_Ridge(2)
param_grid = [
    {'Ridge__alpha_1': [0.01, 0.1, 1, 10, 100],
     'Ridge__alpha_2': [0.01, 0.1, 1, 10, 100],
     'Ridge__lambda_1': [0.01, 0.1, 1, 10, 100],
     'Ridge__lambda_2': [0.01, 0.1, 1, 10, 100]
    }
]

# Scored by negative MSE (higher is better), with train scores also recorded.
grid_search = GridSearchCV(Lpred, param_grid, cv=5,
                            scoring='neg_mean_squared_error',
                            return_train_score=True)

grid_search.fit(Xtrain, ytrain)

# Print the winning parameters explicitly: a notebook cell only displays its
# last expression, so a bare `grid_search.best_params_` here was silently dropped.
print(grid_search.best_params_)

grid_search.best_estimator_

KeyboardInterrupt: 

In [None]:
# Test-set RMSE of the best pipeline found by the grid search.

best_model = grid_search.best_estimator_
ypred = best_model.predict(Xtest)
RMSE = np.sqrt(mean_squared_error(ytest, ypred))
print(f"RMSE = {RMSE}")

RMSE = 0.062031151300850546


In [None]:
# Express the best estimator's RMSE as a percentage via 10**RMSE.
# NOTE(review): this assumes the target is log10-scaled — confirm against the data preparation.

error_factor = 10**RMSE
error_percent = 100 * (error_factor - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 15.35%


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pair each true test target with the most recent predictions held in `ypred`.
results_df = pd.DataFrame({'y_test': ytest, 'y_pred': ypred})

# True vs. predicted scatter plot: a perfect model would put every point on the diagonal.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=results_df, x='y_test', y='y_pred', ax=ax)
ax.set_xlabel('True Values (y_test)')
ax.set_ylabel('Predicted Values (y_pred)')
ax.set_title('Scatter Plot of True vs. Predicted Values')
ax.grid(True)
plt.show()