In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import Pipeline

In [59]:
%matplotlib inline

In [60]:
HouseDF = pd.read_csv(r"C:\Users\Ranjitha Arun\Desktop\DSP-GIT\dsp-ranjitha-vadivel\data\train.csv") ## now importing california house pricing dataset
HouseDF = HouseDF[['LotArea', 'Street', 'LotShape', 'GarageArea', 'GarageQual', 'MSZoning', 'KitchenQual', 'SalePrice']]

In [61]:
HouseDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   LotArea      1460 non-null   int64 
 1   Street       1460 non-null   object
 2   LotShape     1460 non-null   object
 3   GarageArea   1460 non-null   int64 
 4   GarageQual   1379 non-null   object
 5   MSZoning     1460 non-null   object
 6   KitchenQual  1460 non-null   object
 7   SalePrice    1460 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 91.4+ KB


In [62]:
HouseDF.head() 

Unnamed: 0,LotArea,Street,LotShape,GarageArea,GarageQual,MSZoning,KitchenQual,SalePrice
0,8450,Pave,Reg,548,TA,RL,Gd,208500
1,9600,Pave,Reg,460,TA,RL,TA,181500
2,11250,Pave,IR1,608,TA,RL,Gd,223500
3,9550,Pave,IR1,642,TA,RL,Gd,140000
4,14260,Pave,IR1,836,TA,RL,Gd,250000


In [63]:
HouseDF.isnull().sum()

LotArea         0
Street          0
LotShape        0
GarageArea      0
GarageQual     81
MSZoning        0
KitchenQual     0
SalePrice       0
dtype: int64

In [64]:
HouseDF.dropna(axis=0, inplace=True)

In [65]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [66]:
# Split the data into input features and target variable
y = HouseDF["SalePrice"]
X = HouseDF.drop(["SalePrice"], axis=1)

# Split the data into training and testing sets
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=22)


In [67]:
# Separate the continuous and categorical features
continuous_features = ['LotArea', 'GarageArea']
categorical_features = ['LotShape', 'KitchenQual']

# Preprocess and engineer features in the train set
scaler = StandardScaler()
encoder = OrdinalEncoder()
X_train_continuous = pd.DataFrame(scaler.fit_transform(X_train[continuous_features]), columns=continuous_features)
X_train_categorical = encoder.fit_transform(X_train[categorical_features])
X_train_continuous_df = pd.DataFrame(X_train_continuous, columns=continuous_features)
X_train_categorical_df = pd.DataFrame(X_train_categorical, columns=categorical_features)
X_train = pd.concat([X_train_continuous_df, X_train_categorical_df], axis=1)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, np.log(y_train))

# Preprocess and engineer features in the test set
X_test_continuous = pd.DataFrame(scaler.transform(X_test[continuous_features]), columns=continuous_features)
X_test_categorical = encoder.transform(X_test[categorical_features])
X_test_continuous_df = pd.DataFrame(X_test_continuous, columns=continuous_features)
X_test_categorical_df = pd.DataFrame(X_test_categorical, columns=categorical_features)
X_test = pd.concat([X_test_continuous_df, X_test_categorical_df], axis=1)

# Make predictions on the testing data
predictions = np.exp(model.predict(X_test))

# Evaluate the model using RMSLE
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

rmsle = compute_rmsle(y_test, predictions)
print('RMSLE:', rmsle)

RMSLE: 0.25


In [87]:
testDF = pd.read_csv(r"C:\Users\Ranjitha Arun\Desktop\DSP-GIT\dsp-ranjitha-vadivel\data\test.csv")
ID = testDF["Id"]
testDF = testDF[['LotArea', 'Street', 'LotShape', 'GarageArea', 'GarageQual', 'MSZoning', 'KitchenQual', ]]
print(testDF)


      LotArea Street LotShape  GarageArea GarageQual MSZoning KitchenQual
0       11622   Pave      Reg       730.0         TA       RH          TA
1       14267   Pave      IR1       312.0         TA       RL          Gd
2       13830   Pave      IR1       482.0         TA       RL          TA
3        9978   Pave      IR1       470.0         TA       RL          Gd
4        5005   Pave      IR1       506.0         TA       RL          Gd
...       ...    ...      ...         ...        ...      ...         ...
1454     1936   Pave      Reg         0.0        NaN       RM          TA
1455     1894   Pave      Reg       286.0         TA       RM          TA
1456    20000   Pave      Reg       576.0         TA       RL          TA
1457    10441   Pave      Reg         0.0        NaN       RL          TA
1458     9627   Pave      Reg       650.0         TA       RL          TA

[1459 rows x 7 columns]


In [88]:
testDF.isnull().sum()

LotArea         0
Street          0
LotShape        0
GarageArea      1
GarageQual     78
MSZoning        4
KitchenQual     1
dtype: int64

In [89]:
testDF = testDF.dropna()
print(testDF)



      LotArea Street LotShape  GarageArea GarageQual MSZoning KitchenQual
0       11622   Pave      Reg       730.0         TA       RH          TA
1       14267   Pave      IR1       312.0         TA       RL          Gd
2       13830   Pave      IR1       482.0         TA       RL          TA
3        9978   Pave      IR1       470.0         TA       RL          Gd
4        5005   Pave      IR1       506.0         TA       RL          Gd
...       ...    ...      ...         ...        ...      ...         ...
1451    13384   Pave      Reg       336.0         TA       RL          TA
1452     1533   Pave      Reg       286.0         TA       RM          TA
1455     1894   Pave      Reg       286.0         TA       RM          TA
1456    20000   Pave      Reg       576.0         TA       RL          TA
1458     9627   Pave      Reg       650.0         TA       RL          TA

[1376 rows x 7 columns]


In [90]:
# Scale continuous features
X_test_continuous = pd.DataFrame(scaler.transform(testDF[continuous_features]), columns=continuous_features)



In [91]:
# Encode categorical features, ignore unknown categories

X_test_categorical = encoder.transform(testDF[categorical_features])
X_test_categorical_df = pd.DataFrame(X_test_categorical, columns=categorical_features)



In [92]:
# Concatenate continuous and categorical features
X_test = pd.concat([X_test_continuous, X_test_categorical_df], axis=1)



In [93]:


X_test_imputed = X_test.dropna()
# Make predictions on the test data
predictions = np.exp(model.predict(X_test_imputed))
testDF["SalePrice"] = predictions



In [94]:
testDF["Id"] = ID

In [96]:
# Save predictions to a file
testDF[["Id", "SalePrice"]].to_csv("submission.csv", index=False)


In [98]:
# Import joblib
import joblib

# Save the trained model
joblib.dump(model, '../models/model.joblib')



['../models/model.joblib']

In [103]:
# Load the trained model from the saved file
model = joblib.load('../models/model.joblib')

# Model inference
testDF = pd.read_csv(r"C:\Users\Ranjitha Arun\Desktop\DSP-GIT\dsp-ranjitha-vadivel\data\test.csv")
X_test_continuous = pd.DataFrame(scaler.transform(testDF[continuous_features]), columns=continuous_features)

# Encode categorical features, ignore unknown categories
X_test_categorical = encoder.transform(testDF[categorical_features])
X_test_categorical_df = pd.DataFrame(X_test_categorical, columns=categorical_features)
# Concatenate continuous and categorical features
X_test = pd.concat([X_test_continuous, X_test_categorical_df], axis=1)


X_test_imputed = X_test.dropna()
# Make predictions on the test data
predictions = np.exp(model.predict(X_test_imputed))
print(predictions)


[177127.97941396 174208.51972878 166562.91206332 ... 162385.63421202
  94789.58116099 163796.68519459]
