In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline
import joblib

In [4]:
%matplotlib inline

In [33]:
HouseDF = pd.read_csv(r"C:\Users\Ranjitha Arun\Desktop\DSP-GIT\dsp-ranjitha-vadivel\data\train.csv") ## now importing california house pricing dataset
HouseDF = HouseDF[['LotArea', 'Street', 'LotShape', 'GarageArea', 'GarageQual', 'MSZoning', 'KitchenQual', 'SalePrice']]

# model Training

In [34]:
from sklearn.model_selection import train_test_split

# Separate the target variable from the features
X = HouseDF.drop('SalePrice', axis=1)
y = HouseDF['SalePrice']

# Split into training and test sets with a 70/30 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [37]:
# Define the continuous and categorical features
continuous_features = ['LotArea', 'GarageArea']
categorical_features = ['Street', 'LotShape', 'GarageQual', 'MSZoning', 'KitchenQual']

# Scale the continuous features
scaler = StandardScaler()
joblib.dump(scaler,'../models/scalar.joblib')
HouseDF_scaled = scaler.fit_transform(X_train[continuous_features])
HouseDF_scaled = pd.DataFrame(HouseDF_scaled, columns=continuous_features)

# Encode the categorical features using one-hot encoding
encoder = OneHotEncoder(handle_unknown='ignore')
joblib.dump(encoder,'../models/encoder.joblib')
HouseDF_encoded = encoder.fit_transform(X_train[categorical_features])
HouseDF_encoded = pd.DataFrame(HouseDF_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_features))
HouseDF_encoded=HouseDF_encoded.drop('GarageQual_Ex',axis=1)
HouseDF_encoded=HouseDF_encoded.drop('GarageQual_nan',axis=1)

In [8]:
# Concatenate the continuous and categorical features
X_train= pd.concat([HouseDF_scaled, HouseDF_encoded], axis=1)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train,y_train)

#save 

joblib.dump(model, '../models/model.joblib')

['../models/model.joblib']

# model Testing

In [9]:
# Define the continuous and categorical features for test data
continuous_features_test = ['LotArea', 'GarageArea']
categorical_features_test = ['Street', 'LotShape', 'GarageQual', 'MSZoning', 'KitchenQual']

# Scale the continuous features for test data
scaler_test = StandardScaler()
HouseDF_scaled_test = scaler_test.fit_transform(X_test[continuous_features_test])
HouseDF_scaled_test = pd.DataFrame(HouseDF_scaled_test, columns=continuous_features_test)

# Encode the categorical features using one-hot encoding for test data
encoder_test = OneHotEncoder(handle_unknown='ignore')
HouseDF_encoded_test = encoder_test.fit_transform(X_test[categorical_features_test])
HouseDF_encoded_test = pd.DataFrame(HouseDF_encoded_test.toarray(), columns=encoder_test.get_feature_names_out(categorical_features_test))
HouseDF_encoded_test = HouseDF_encoded_test.drop('GarageQual_nan',axis=1)
# Concatenate the continuous and categorical features
X_test = pd.concat([HouseDF_scaled_test, HouseDF_encoded_test], axis=1)

In [10]:
# Make predictions on the testing data

predictions = model.predict(X_test)

In [11]:
# Evaluate the model using RMSLE
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

rmsle = compute_rmsle(y_test, predictions)
print('RMSLE:', rmsle)

RMSLE: 0.25


# Model inference

In [12]:
testDF = pd.read_csv(r"C:\Users\Ranjitha Arun\Desktop\DSP-GIT\dsp-ranjitha-vadivel\data\test.csv")
ID = testDF["Id"]
testDF = testDF[['LotArea', 'Street', 'LotShape', 'GarageArea', 'GarageQual', 'MSZoning', 'KitchenQual', ]]
print(testDF)

      LotArea Street LotShape  GarageArea GarageQual MSZoning KitchenQual
0       11622   Pave      Reg       730.0         TA       RH          TA
1       14267   Pave      IR1       312.0         TA       RL          Gd
2       13830   Pave      IR1       482.0         TA       RL          TA
3        9978   Pave      IR1       470.0         TA       RL          Gd
4        5005   Pave      IR1       506.0         TA       RL          Gd
...       ...    ...      ...         ...        ...      ...         ...
1454     1936   Pave      Reg         0.0        NaN       RM          TA
1455     1894   Pave      Reg       286.0         TA       RM          TA
1456    20000   Pave      Reg       576.0         TA       RL          TA
1457    10441   Pave      Reg         0.0        NaN       RL          TA
1458     9627   Pave      Reg       650.0         TA       RL          TA

[1459 rows x 7 columns]


In [13]:
testDF.isnull().sum()

LotArea         0
Street          0
LotShape        0
GarageArea      1
GarageQual     78
MSZoning        4
KitchenQual     1
dtype: int64

In [14]:
testDF = testDF.dropna()
print(testDF)
testDF.isnull().sum()



      LotArea Street LotShape  GarageArea GarageQual MSZoning KitchenQual
0       11622   Pave      Reg       730.0         TA       RH          TA
1       14267   Pave      IR1       312.0         TA       RL          Gd
2       13830   Pave      IR1       482.0         TA       RL          TA
3        9978   Pave      IR1       470.0         TA       RL          Gd
4        5005   Pave      IR1       506.0         TA       RL          Gd
...       ...    ...      ...         ...        ...      ...         ...
1451    13384   Pave      Reg       336.0         TA       RL          TA
1452     1533   Pave      Reg       286.0         TA       RM          TA
1455     1894   Pave      Reg       286.0         TA       RM          TA
1456    20000   Pave      Reg       576.0         TA       RL          TA
1458     9627   Pave      Reg       650.0         TA       RL          TA

[1376 rows x 7 columns]


LotArea        0
Street         0
LotShape       0
GarageArea     0
GarageQual     0
MSZoning       0
KitchenQual    0
dtype: int64

In [15]:
# Define the continuous and categorical features for test data
continuous_features_inf = ['LotArea', 'GarageArea']
categorical_features_inf = ['Street', 'LotShape', 'GarageQual', 'MSZoning', 'KitchenQual']

# Scale the continuous features for test data
scaler_inf = StandardScaler()
HouseDF_scaled_inf = scaler_inf.fit_transform(testDF[continuous_features_inf])
HouseDF_scaled_inf = pd.DataFrame(HouseDF_scaled_inf, columns=continuous_features_inf)

# Encode the categorical features using one-hot encoding for test data
encoder_inf = OneHotEncoder(handle_unknown='ignore')
HouseDF_encoded_inf = encoder_inf.fit_transform(testDF[categorical_features_inf])
HouseDF_encoded_inf = pd.DataFrame(HouseDF_encoded_inf.toarray(), columns=encoder_inf.get_feature_names_out(categorical_features_inf))
# Concatenate the continuous and categorical features
testDF_final = pd.concat([HouseDF_scaled_inf, HouseDF_encoded_inf], axis=1)
print(testDF_final.info() )
#testDF_final=testDF_final.drop('GarageQual_nan',axis=1)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1376 entries, 0 to 1375
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   LotArea           1376 non-null   float64
 1   GarageArea        1376 non-null   float64
 2   Street_Grvl       1376 non-null   float64
 3   Street_Pave       1376 non-null   float64
 4   LotShape_IR1      1376 non-null   float64
 5   LotShape_IR2      1376 non-null   float64
 6   LotShape_IR3      1376 non-null   float64
 7   LotShape_Reg      1376 non-null   float64
 8   GarageQual_Fa     1376 non-null   float64
 9   GarageQual_Gd     1376 non-null   float64
 10  GarageQual_Po     1376 non-null   float64
 11  GarageQual_TA     1376 non-null   float64
 12  MSZoning_C (all)  1376 non-null   float64
 13  MSZoning_FV       1376 non-null   float64
 14  MSZoning_RH       1376 non-null   float64
 15  MSZoning_RL       1376 non-null   float64
 16  MSZoning_RM       1376 non-null   float64


In [31]:
model_saved = joblib.load('../models/model.joblib')
predictions = model_saved.predict(testDF_final)
predictions

array([175010.33234523, 192464.41495404, 172169.99389943, ...,
        80245.9253969 , 189902.2835034 , 174412.20017726])

In [105]:


def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
    # Separate the target variable from the features
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']

    # Define the continuous and categorical features
    continuous_features = ['LotArea', 'GarageArea']
    categorical_features = ['Street', 'LotShape', 'GarageQual', 'MSZoning', 'KitchenQual']

    # Scale the continuous features
    scaler = StandardScaler()
    scaled_X = scaler.fit_transform(X[continuous_features])
    scaled_X = pd.DataFrame(scaled_X, columns=continuous_features)

    # Encode the categorical features using one-hot encoding
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded_X = encoder.fit_transform(X[categorical_features])
    encoded_X = pd.DataFrame(encoded_X.toarray(), columns=encoder.get_feature_names_out(categorical_features))
    encoded_X = encoded_X.drop('GarageQual_Ex', axis=1)
    encoded_X = encoded_X.drop('GarageQual_nan', axis=1)

    # Concatenate the continuous and categorical features
    X = pd.concat([scaled_X, encoded_X], axis=1)
    
    return X, y

def build_model(data: pd.DataFrame, model_file_path: str) -> dict[str, str]:
    # Prepare data
    X, y = prepare_data(data)

    # Split into training and test sets with a 70/30 split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train a linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    joblib.dump(model, '../models/model.joblib')
    

    # Evaluate the model
    rmse = calculate_rmse(model, X_test, y_test)

    # Save the model
    joblib.dump(model, model_file_path)

    # Return performance metrics
    return {'rmse': rmse}

def calculate_rmse(model: LinearRegression, X: pd.DataFrame, y: pd.Series) -> float:
    y_pred = model.predict(X)
    mse = ((y_pred - y) ** 2).mean()
    rmse = mse ** 0.5
    return rmse

def make_predictions(input_data: pd.DataFrame, model_file_path: str) -> pd.Series:
    
    
    # Load the model and all the data preparation objects (scaler, encoder, etc)
    model = joblib.load('../models/model.joblib')
    X, y = prepare_data(input_data)

    # Make predictions
    predictions = model.predict(X)

    return pd.Series(predictions)
