In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import joblib
import json
import os


warnings.filterwarnings("ignore")


In [6]:
data_train_raw = pd.read_csv(r"C:\Users\CORE I5\dsp-sajeev-menon\data\housing_price_train.csv", index_col="Id")
data_train = data_train_raw.copy()

In [7]:
data_train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [8]:
X = data_train.drop('SalePrice', axis=1) 
y = data_train['SalePrice'] 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [10]:
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)


In [11]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1022, 79), (1022,), (438, 79), (438,))

In [12]:
def get_continuous_columns(df: pd.DataFrame) -> list[str]:
    return df.select_dtypes(include=np.number).columns.tolist()

In [13]:
def fillna_continuous(df: pd.DataFrame) -> pd.DataFrame:
    continuous_columns = get_continuous_columns(df)
    df[continuous_columns] = df[continuous_columns].fillna(0)
    return df

In [14]:
def get_categorical_columns(df: pd.DataFrame) -> list[str]:
    return df.select_dtypes(include='object').columns.tolist()

In [15]:
def fillna_categorical(df: pd.DataFrame, fill_value="Unknown") -> pd.DataFrame:
    categorical_columns = get_categorical_columns(df)
    df[categorical_columns] = df[categorical_columns].fillna(fill_value)
    return df

In [16]:

def make_encoder(df: pd.DataFrame, save_path: str) -> OneHotEncoder:
   
    categorical_columns = get_categorical_columns(df)
    encoder = OneHotEncoder(handle_unknown="ignore", dtype=int)
    encoder.fit(df[categorical_columns])

    encoder_name = "encoder.joblib"  
    encoder_path = os.path.join(save_path, encoder_name) 

    joblib.dump(encoder, encoder_path)
    return encoder, encoder_path

In [18]:
def encode_categorical(df: pd.DataFrame, encoder: OneHotEncoder) -> pd.DataFrame:
   
    categorical_columns = get_categorical_columns(df)
    encoded_columns = encoder.transform(df[categorical_columns])
    encoded_df = pd.DataFrame(
        encoded_columns.toarray(),
        columns=encoder.get_feature_names_out(categorical_columns),
    )
    df = df.drop(categorical_columns, axis=1).join(encoded_df)
    return df

In [19]:
encoder_save_path = r"C:\Users\CORE I5\dsp-sajeev-menon\model" 
encoder, _ = make_encoder(X_train, encoder_save_path) 

In [20]:
encoder = joblib.load(os.path.join(encoder_save_path, "encoder.joblib"))
X_test_encoded = encode_categorical(X_test, encoder)

In [35]:
def process_data(df: pd.DataFrame,  continuous_fill_value=0, categorical_fill_value="Unknown") -> pd.DataFrame:
     continuous_cols = get_continuous_columns(df)
     categorical_cols = get_categorical_columns(df)
     df[continuous_cols] = df[continuous_cols].fillna(continuous_fill_value)
     df[categorical_cols] = df[categorical_cols].fillna(categorical_fill_value)
     return df

In [37]:
 
def build_model(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: np.ndarray,
    y_test: np.ndarray,
    model_type="linear_regression",
    save_path= r"C:\Users\CORE I5\dsp-sajeev-menon\model" ,
    model_name="model.joblib",) -> dict[str, any]:
    if model_type == "linear_regression":
        model = LinearRegression()
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    model_path = os.path.join(save_path, model_name)
    joblib.dump(model, model_path)

    return {"mse": mse, "rmse": rmse, "r2": r2, "model_path": model_path}

In [46]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

# Assuming you already have a function `get_categorical_columns` to identify categorical columns

# Create and fit the encoder on the training data
encoder = OneHotEncoder(handle_unknown='ignore') 
encoder.fit(X_train[get_categorical_columns(X_train)])
mse, rmse, r2, model_path = build_model(X_train_encoded, X_test_encoded, y_train, y_test).values()

# Transform both training and testing data
X_train_encoded = encoder.transform(X_train[get_categorical_columns(X_train)])
X_test_encoded = encoder.transform(X_test[get_categorical_columns(X_test)])

# Now, you can use X_train_encoded and X_test_encoded in your `build_model` function
results = build_model(X_train_encoded, X_test_encoded, y_train, y_test)
accuracy = results['rmse']  # Or whichever metric you want to use as 'accuracy'
model_path = results['model_path']

In [47]:
accuracy

47291.48015864312

In [82]:
data_test_raw = pd.read_csv(r"C:/Users/CORE I5/dsp-sajeev-menon/data/housing_price_test.csv", index_col="Id")
data_test = data_test_raw.copy()


In [85]:
def make_predictions(
    data_test: pd.DataFrame, encoder_path: str, model_path: str
) -> np.ndarray:
    encoder_train = joblib.load(encoder_path)
    data_test = encode_categorical(data_test, encoder_train)
    process_data(data_test)
    joblib_model = joblib.load(model_path)
    tesr_pred = abs(joblib_model.predict(data_test))
    return tesr_pred


In [86]:
predictions = make_predictions(data_test, encoder_path, model_path)

ValueError: X has 295 features, but LinearRegression is expecting 259 features as input.