In [1]:
%matplotlib inline
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import sklearn.metrics as metrics
from sklearn import preprocessing
from joblib import dump,load

In [2]:
train_data = pd.read_csv('/Users/raphsmart/Desktop/DSA/House_prices_industrialization/train.csv')
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df = train_data
df_main_train = df.copy
label_col = 'SalePrice'

In [4]:
useful_features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd']
useful_features

['Foundation', 'KitchenQual', 'TotRmsAbvGrd']

In [5]:
df = df[useful_features + [label_col]]
df.head()

Unnamed: 0,Foundation,KitchenQual,TotRmsAbvGrd,SalePrice
0,PConc,Gd,8,208500
1,CBlock,TA,6,181500
2,PConc,Gd,6,223500
3,BrkTil,Gd,7,140000
4,PConc,Gd,9,250000


In [6]:
continuous_columns = df[useful_features].select_dtypes(include = 'number').columns
continuous_columns

Index(['TotRmsAbvGrd'], dtype='object')

In [7]:
scaler = StandardScaler()
scaler.fit(df[continuous_columns])

StandardScaler()

In [8]:
scaled_columns = scaler.transform(df[continuous_columns])
scaled_columns

array([[ 0.91220977],
       [-0.31868327],
       [-0.31868327],
       ...,
       [ 1.52765629],
       [-0.93412978],
       [-0.31868327]])

In [9]:
continuous_features_df = pd.DataFrame(data=scaled_columns, columns = continuous_columns)
continuous_features_df.head()

Unnamed: 0,TotRmsAbvGrd
0,0.91221
1,-0.318683
2,-0.318683
3,0.296763
4,1.527656


In [10]:
categorical_columns = df[['Foundation', 'KitchenQual']]
categorical_columns.head()

Unnamed: 0,Foundation,KitchenQual
0,PConc,Gd
1,CBlock,TA
2,PConc,Gd
3,BrkTil,Gd
4,PConc,Gd


In [86]:
encoder = OneHotEncoder(sparse = True)
encoder.fit(categorical_columns)
filename = "/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/encoder.joblib"
joblib.dump(encoder, filename)

['/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/encoder.joblib']

In [87]:
X = pd.DataFrame(encoder.transform(categorical_columns).toarray())
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [88]:
X.index = df.index
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [89]:
final_df = continuous_features_df.join(X).join(df[label_col])
final_df.head()

Unnamed: 0,TotRmsAbvGrd,0,1,2,3,4,5,6,7,8,9,SalePrice
0,0.91221,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,208500
1,-0.318683,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,181500
2,-0.318683,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,223500
3,0.296763,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,140000
4,1.527656,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,250000


In [90]:
X, y = final_df.drop(columns=[label_col]), final_df[label_col]

In [91]:
X.shape

(1460, 11)

In [92]:
X.head()

Unnamed: 0,TotRmsAbvGrd,0,1,2,3,4,5,6,7,8,9
0,0.91221,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.318683,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.318683,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.296763,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.527656,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [93]:
y.shape

(1460,)

In [94]:
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [95]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [96]:
from sklearn.linear_model import LinearRegression

In [97]:
reg = LinearRegression().fit(X_train, y_train)

y_pred = reg.predict(X_test)
y_pred



array([138582.38031186, 258935.35956139, 121449.75697432, 155715.00364941,
       295597.30196973,  87184.51029923, 218132.21101051, 155715.00364941,
        87184.51029923, 196518.15220029, 112730.6002052 , 121449.75697432,
       138582.38031186, 224670.1128863 , 179385.52886274, 183866.96433542,
       224670.1128863 , 138582.38031186, 112730.6002052 , 224670.1128863 ,
       172847.62698695, 207537.48954876, 224670.1128863 , 104317.13363677,
       207537.48954876, 190404.86621121, 207537.48954876, 104317.13363677,
       179385.52886274, 190404.86621121, 145120.28218765, 241802.73622385,
       374722.51678166, 104317.13363677, 241802.73622385, 121449.75697432,
       155715.00364941, 241802.73622385, 276067.98289894, 127987.65885011,
       162252.9055252 , 241802.73622385, 121449.75697432, 329862.54864482,
       121449.75697432, 125344.4956376 , 121449.75697432, 138582.38031186,
       364127.79531991, 155715.00364941, 138582.38031186, 190404.86621121,
       127987.65885011, 2

In [98]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [99]:
compute_rmsle(y_test, y_pred)

0.25

In [100]:
# model.joblib
import joblib

In [101]:
filename = "/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/scalar.joblib"
joblib.dump(scaler, filename)

['/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/scalar.joblib']

In [113]:
# filename = "/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/encoder.joblib"
# joblib.dump(encoder, filename)

['/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/encoder.joblib']

In [37]:
filename = "/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/model.joblib"
joblib.dump(reg, filename)

['/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/model.joblib']

In [135]:
def build_model(final_df: pd.DataFrame) -> dict[str, str]:
    
    # split data into Train, Test
    X_train, X_test, y_train, y_test = train_test_split(final_df)


    #Create Encoder
    encoder = OneHotEncoder(sparse = True)   
    
    #Create Scalar
    scalar =StandardScaler()
    
    X_train = X_train,encoder=encoder,scalar=scalar,is_test=False
    
    #Define an evaluation dictonary
    evaluations_dict= dict()

    #Defining the Machine Learning model 
    reg = LinearRegression()
    
    #Train model
    reg.fit(X_train, y_train)
    
    # Model Build Evalution on Testing Set 
    #-------------------------------------
    #Preprocessing(cleaning data and using trained encoders,scalars)
    X_test = X_test, encoder=encoder, scalar=scalar, is_test=True
    
    #Testing-set evaluation
    y_test_predictions = reg.predict(X_test)
    test_evaluation = evaluate_performance(y_pred=y_test_predictions, y_true=y_test,
                                                precision=3,comment="Test")
    evaluations_dict.update(test_evaluation)
    # Returns a dictionary with the model performances (for example {'rmse': 0.18})
    return evaluations_dict

In [136]:
# Build Model 
evaluations= build_model(final_df)
print(evaluations)

ValueError: not enough values to unpack (expected 4, got 2)

In [111]:
# model inference

In [None]:
# Load Data
test_master=pd.read_csv("/Users/raphsmart/Desktop/DSA/House_prices_industrialization/test.csv")
test_data = test_master.copy()

In [143]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    
     # load the encoder 
    encode_filename = "/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/encoder.joblib"
    encoder = load(encode_filename)
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded_input = encoder.fit(input_data[['Foundation', 'KitchenQual']])
    encoded_input = encoder.transform(input_data[['Foundation', 'KitchenQual']])

    # load the scalar
    scalar_filename = "/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/scalar.joblib"
    scalar = load(scalar_filename)
    scalar_input = scalar.transform(input_data[['TotRmsAbvGrd']])

    # load the model
    model_filename = "/Users/raphsmart/Desktop/DSA/House_prices_industrialization/models/encoder.joblib"
    model= load(model_filename)
    model_input = model.transform(input_data)
    
    final_predict = encoded_input.join(scalar_input)
    
#     input_data = input_data, encoder = encoded_input, scalar = scalar_input, is_test=True
    
    #Validation-set evaluation
    y_predictions = model.predict(final_predict)
    
    return y_predictions

In [144]:
predicitons = make_predictions(test_master)
print(predicitons)

Feature names unseen at fit time:
- 1stFlrSF
- 2ndFlrSF
- 3SsnPorch
- Alley
- BedroomAbvGr
- ...
Feature names must be in the same order as they were in fit.



ValueError: X has 80 features, but OneHotEncoder is expecting 2 features as input.