In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
os.chdir("../")

In [3]:
%pwd

'c:\\Users\\harik\\OneDrive\\Desktop\\HARIKRISHNAN_DETAILS\\Real_Estate_Predictor_Web_App\\Real_Estate_Price_Predictor_Web_App'

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir:Path
    data_file:Path
    transformed_data:Path
    params_discrete_feature: list
    params_Id_column: list
    params_categorical_stratergy:str
    params_numerical_stratergy:str
    params_fill_value:str
    params_target_label:str
    params_rare_categorical_variable:str
    params_ordinal_categorical_feature:list
    params_nominal_categorical_feature:list

In [5]:
from real_estate_price_predictor.constants import *
from real_estate_price_predictor.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_data_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir = config.root_dir,
            data_file = config.data_file,
            transformed_data = config.transformed_data_file,
            params_discrete_feature = self.params.discrete_feature,
            params_Id_column = self.params.Id_column,
            params_categorical_stratergy = self.params.categorical_stratergy,
            params_numerical_stratergy = self.params.numerical_stratergy,
            params_fill_value = self.params.fill_value,
            params_target_label = self.params.target_label,
            params_rare_categorical_variable = self.params.rare_categorical_variable,
            params_ordinal_categorical_feature = self.params.ordinal_categorical_feature,
            params_nominal_categorical_feature = self.params.nominal_categorical_feature
        )

        return data_transformation_config

In [8]:
class SeparatingDifferentFeatures:
    def __init__(self,config:DataTransformationConfig):
        self.config = config
    
    def read_data(self):
        dataset = pd.read_csv(self.config.data_file)
        return dataset

# Features with null values

    def features_with_null_values(self, df:pd.DataFrame):
        dataset = df
        features_with_na=[features for features in dataset.columns if dataset[features].isnull().sum()>1]
        return features_with_na
    
# Finding both numerical and categorical features with null values

    def num_and_categorical_features_with_na(self,df:pd.DataFrame, categorical:bool):
        numerical_features_with_na =[]
        categorical_features_with_na =[]
        features_with_na = self.features_with_null_values(df)
        for feature in features_with_na:
            if pd.api.types.is_numeric_dtype(df[feature]):
                numerical_features_with_na.append(feature)
            else:
                categorical_features_with_na.append(feature)
        if categorical:
            return categorical_features_with_na
        else: 
            return numerical_features_with_na
    
# Finding all the numerical features

    def total_numerical_features(self,dataset:pd.DataFrame):
        numerical_features = []
        for feature in dataset.columns:
            if dataset[feature].dtypes != 'O':
                numerical_features.append(feature)
        return numerical_features

# Finding year or datatime variable

    def finding_year_feature(self,dataset:pd.DataFrame):
        year_feature = []
        numerical_features = self.total_numerical_features(dataset)
        for feature in numerical_features:
            if 'Yr' in feature or 'Year' in feature:
                year_feature.append(feature)
        return year_feature
        
# Finding Continuous Variable

    def continous_variables(self,df:pd.DataFrame):
        continuous_feature=[]
        numerical_features = self.total_numerical_features(df) 
        year_feature = self.finding_year_feature(df)
        for feature in numerical_features:
            if feature not in self.config.params_discrete_feature+year_feature+self.config.params_Id_column:
                continuous_feature.append(feature)
        return continuous_feature

# Replacing the zeros with 1 to perform log transform

    def replacing_zeros_of_continuous_features(self,dataset:pd.DataFrame):
        continuous_feature = self.continous_variables(dataset)
        for feature in continuous_feature:
            dataset.loc[dataset[feature] == 0, feature] = 1
        return dataset

# Finding the categorical features

    def total_categorical_features(self,dataset:pd.DataFrame):
        return [feature for feature in dataset.columns if dataset[feature].dtypes=='O']

# Handling Missing Values by creating a new category for categroical and with median for numerical

    def filling_missing_values(self, dataset:pd.DataFrame):
        categorical_imputer = SimpleImputer(strategy= self.config.params_categorical_stratergy,fill_value=self.config.params_fill_value)
        numerical_imputer = SimpleImputer(strategy=self.config.params_numerical_stratergy)
        categorical_features = self.total_categorical_features(dataset)
        numerical_features = self.total_numerical_features(dataset)
        dataset[categorical_features] = categorical_imputer.fit_transform(dataset[categorical_features])
        dataset[numerical_features] = numerical_imputer.fit_transform(dataset[numerical_features])
        return dataset
    
# Saving the transformed data

    def save_the_transformed_data(self,dataset:pd.DataFrame):
        dataset.to_csv(self.config.transformed_data)


In [9]:
class handling_date_time_variables(BaseEstimator, TransformerMixin,SeparatingDifferentFeatures):
    def __init__(self,config:DataTransformationConfig): # no *args or **kargs 
         super().__init__(config)
    def fit(self, X, y=None):
         return self # nothing else to do
    def transform(self, X, y=None):
         year_features = self.finding_year_feature(X)
         for feature in year_features:
             if feature != 'YrSold':
                 X[feature]=X['YrSold']-X[feature]
         return X

In [10]:
class log_transform_of_numeric_variables(BaseEstimator, TransformerMixin,SeparatingDifferentFeatures):
    def __init__(self, config:DataTransformationConfig): # no *args or **kargs
         super().__init__(config)
    def fit(self, X, y=None):
         return self # nothing else to do
    def transform(self, X, y=None):
         continous_features = self.continous_variables(X)
         X = self.replacing_zeros_of_continuous_features(X)
         for feature in continous_features:
             X[feature]=np.log(X[feature])
         return X

In [11]:
class handling_rare_categorical_values(BaseEstimator, TransformerMixin,SeparatingDifferentFeatures):
    def __init__(self,config:DataTransformationConfig): # no *args or **kargs
         super().__init__(config)
    def fit(self, X, y=None):
         return self # nothing else to do
    def transform(self, X, y=None):
         categorical_features = self.total_categorical_features(X)
         for feature in categorical_features:
             temp=X.groupby(feature)[self.config.params_target_label].count()/len(X)
             temp_df=temp[temp>0.01].index
             X[feature]=np.where(X[feature].isin(temp_df),X[feature],self.config.params_rare_categorical_variable)
         return X

In [12]:
class handling_ordinal_categorical_values(BaseEstimator, TransformerMixin,SeparatingDifferentFeatures):
    def __init__(self,config:DataTransformationConfig): # no *args or **kargs
         self.label_ordered_feature ={}
         super().__init__(config)
    def fit(self, X, y=None):
         for feature in self.config.params_ordinal_categorical_feature:
             labels_ordered=X.groupby([feature])[self.config.params_target_label].mean().sort_values().index
             labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
             self.label_ordered_feature[feature] = labels_ordered
         return self
    def transform(self, X, y=None):
         for feature in self.config.params_ordinal_categorical_feature:
           X[feature]=X[feature].map(self.label_ordered_feature[feature])
         return X

In [13]:
class handling_nominal_categorical_values(BaseEstimator, TransformerMixin,SeparatingDifferentFeatures):
    def __init__(self,config:DataTransformationConfig): # no *args or **kargs
         self.label_nominal_feature ={}
         super().__init__(config)
    def fit(self, X, y=None):
         for feature in self.config.params_nominal_categorical_feature:
             nominal_label=X.groupby([feature])[self.config.params_target_label].mean().to_dict()
             self.label_nominal_feature[feature] = nominal_label
         return self
    def transform(self, X, y=None):
         for feature in self.config.params_nominal_categorical_feature:
           X[feature]=X[feature].map(self.label_nominal_feature[feature])
         return X

In [14]:
class handling_outliers_for_continous_variable(BaseEstimator, TransformerMixin,SeparatingDifferentFeatures):
    def __init__(self,config:DataTransformationConfig):
        self.iqr_boundaries_conitnous_feature ={}
        super().__init__(config)
        
    def fit(self, X, y=None):
        continuous_feature = self.continous_variables(X)
        for feature in continuous_feature:
            IQR=X[feature].quantile(0.75)-X[feature].quantile(0.25)
            lower_bridge=X[feature].quantile(0.25)-(IQR*3)
            upper_bridge=X[feature].quantile(0.75)+(IQR*3)
            self.iqr_boundaries_conitnous_feature[feature] = [lower_bridge,upper_bridge]
        return self
    def transform(self, X, y=None):
        continuous_feature = self.continous_variables(X)
        for feature in continuous_feature:
            lower_bridge, upper_bridge = self.iqr_boundaries_conitnous_feature[feature]
            X.loc[X[feature]<=lower_bridge,feature]=lower_bridge
            X.loc[X[feature]>=upper_bridge,feature]=upper_bridge
        return X

In [None]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = SeparatingDifferentFeatures(config=data_transformation_config)

    # Read the data

    dataset = data_transformation.read_data()

    # Handling Null Values

    dataset = data_transformation.filling_missing_values(dataset)

    # Handling date time variables

    date_time_variables = handling_date_time_variables(config=data_transformation_config)
    dataset = date_time_variables.transform(dataset)

    # Transforming the continous variables using logrithmic transform

    log_transform = log_transform_of_numeric_variables(config=data_transformation_config)
    dataset = log_transform.transform(dataset)

    # Handling rare categorical variable

    rare_categorical_values = handling_rare_categorical_values(config=data_transformation_config)
    dataset = rare_categorical_values.transform(dataset)    

    # Encoding the ordinal categorical features using (Target Guided Encoding)

    ordinal_features = handling_ordinal_categorical_values(config=data_transformation_config)
    ordinal_features.fit(dataset)
    dataset = ordinal_features.transform(dataset)

    # Encoding the nominal categorical features using (Mean Encoding)

    nominal_features = handling_nominal_categorical_values(config=data_transformation_config)
    nominal_features.fit(dataset)
    dataset = nominal_features.transform(dataset)    

    # Removing the Outliers in Continous feature

    Removal_of_outlier = handling_outliers_for_continous_variable(config=data_transformation_config)
    Removal_of_outlier.fit(dataset)
    dataset = Removal_of_outlier.transform(dataset)

    # Saving the transformed data

    data_transformation.save_the_transformed_data(dataset)
    
except Exception as e:
    raise e

In [None]:
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = SeparatingDifferentFeatures(config=data_transformation_config)
dataset = data_transformation.read_data()
dataset.shape

In [15]:
categorical_features_with_na = data_transformation.num_and_categorical_features_with_na(dataset,True)
numerical_features_with_na = data_transformation.num_and_categorical_features_with_na(dataset,False)

In [None]:
print("The categrical null count is ",len(categorical_features_with_na))
print("The numerical null count is ",len(numerical_features_with_na))

In [None]:
numerical_features = data_transformation.total_numerical_features(dataset)
print('Number of numerical variables: ', len(numerical_features))
dataset[numerical_features].head()

In [None]:
continuous_feature = data_transformation.continous_variables(dataset)
print("Continuous feature Count {}".format(len(continuous_feature)))
dataset[continuous_feature].head()

In [None]:
categorical_features = data_transformation.total_categorical_features(dataset)
print("The number of categorical variables are ", len(categorical_features))
dataset[categorical_features].head()

In [20]:
dataset = data_transformation.filling_missing_values(dataset)

In [None]:
dataset[categorical_features_with_na].isnull().sum()

In [None]:
dataset[numerical_features_with_na].isnull().sum()

In [None]:
date_time_variables = handling_date_time_variables(config=data_transformation_config)
dataset = date_time_variables.transform(dataset)
dataset[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

In [None]:
log_transform = log_transform_of_numeric_variables(config=data_transformation_config)
dataset = log_transform.transform(dataset)
dataset.head(5)

In [None]:
rare_categorical_values = handling_rare_categorical_values(config=data_transformation_config)
dataset = rare_categorical_values.transform(dataset)
dataset[['Condition2']].head(10)

In [None]:
ordinal_features = handling_ordinal_categorical_values(config=data_transformation_config)
ordinal_features.fit(dataset)
dataset = ordinal_features.transform(dataset)
dataset[data_transformation_config.params_ordinal_categorical_feature].head()

In [None]:
nominal_features = handling_nominal_categorical_values(config=data_transformation_config)
nominal_features.fit(dataset)
dataset = nominal_features.transform(dataset)
dataset[data_transformation_config.params_nominal_categorical_feature].head()

In [None]:
Removal_of_outlier = handling_outliers_for_continous_variable(config=data_transformation_config)
Removal_of_outlier.fit(dataset)
print(Removal_of_outlier.iqr_boundaries_conitnous_feature)
dataset = Removal_of_outlier.transform(dataset)
dataset[continuous_feature].describe()

In [7]:
dataset1 = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\data_transformation\transformed_data.csv')

In [8]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
X = dataset1.drop(['Id','SalePrice'],axis=1)
Y = dataset1[['SalePrice']]
feature_sel_model = SelectFromModel(Lasso(alpha=0.005, random_state=0)) # remember to set the seed, the random state in this function
feature_sel_model.fit(X, Y)
selected_feat = X.columns[(feature_sel_model.get_support())]
print('selected features: {}'.format(len(selected_feat)))
print(selected_feat)

selected features: 23
Index(['MSSubClass', 'LotArea', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtUnfSF',
       'TotalBsmtSF', 'HeatingQC', '1stFlrSF', 'GrLivArea', 'BsmtFullBath',
       'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu',
       'GarageYrBlt', 'GarageCars', 'WoodDeckSF', 'OpenPorchSF'],
      dtype='object')


In [9]:
X_selected =X[selected_feat]
X_selected.head()

Unnamed: 0,MSSubClass,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinType1,BsmtFinSF1,BsmtUnfSF,...,GrLivArea,BsmtFullBath,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageYrBlt,GarageCars,WoodDeckSF,OpenPorchSF
0,60.0,9.041922,12.163641,7.0,5.0,5.0,5.0,6,6.559615,5.010635,...,7.444249,1.0,2,8.0,0.0,1,5.0,2.0,0.0,4.110874
1,20.0,9.169518,12.101696,6.0,8.0,31.0,31.0,4,6.88551,5.648974,...,7.140453,0.0,1,6.0,1.0,3,31.0,2.0,5.697093,0.0
2,60.0,9.328123,12.163641,7.0,5.0,7.0,6.0,6,6.186209,6.073045,...,7.487734,1.0,2,6.0,1.0,3,7.0,2.0,0.0,3.73767
3,70.0,9.164296,12.206659,7.0,5.0,91.0,36.0,4,5.375278,6.291569,...,7.448334,1.0,2,7.0,1.0,4,8.0,3.0,0.0,3.555348
4,60.0,9.565214,12.676,8.0,5.0,8.0,8.0,6,6.484635,6.194405,...,7.695303,1.0,2,9.0,1.0,3,8.0,3.0,5.257495,4.430817


In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_selected,Y,test_size=0.1,random_state=0)
X_train.shape, X_test.shape

((1314, 23), (146, 23))

In [11]:
from sklearn.preprocessing import MinMaxScaler
X_copy = X_train.copy()
scaler=MinMaxScaler()
scaler.fit(X_copy)
X_train_data = pd.DataFrame(scaler.transform(X_copy), columns=selected_feat)
X_test_data = pd.DataFrame(scaler.transform(X_test), columns=selected_feat)

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
list_of_models = ['Linear Regression', 'Ridge Regression', 'Polynomial Regression', 'SVR', 'Random Forrest Regressor', 'Decision Tree Regressor' ]
r2_score_of_models=[]
adjusted_r2_score =[]
mse=[]
for i  in list_of_models:
    if i == 'Linear Regression':
        model = LinearRegression()
    elif i == 'Ridge Regression':
        model = Ridge()
    elif i == 'Polynomial Regression':
        model = Pipeline([('poly', PolynomialFeatures(degree=2)),('linear_model', LinearRegression())])
    elif i == 'SVR':
        model = SVR(kernel='rbf', C=1.0)
    elif i == 'Random Forrest Regressor':
        model = RandomForestRegressor(n_estimators=100)
    else:
        model = DecisionTreeRegressor(max_depth=5)
# Train the model on the training data

    model.fit(X_train_data, y_train)

# Make predictions on the testing data

    y_pred = model.predict(X_test_data)

# Evaluate the model performance (e.g., R-squared, Mean Squared Error)
    
    r2 = r2_score(y_test, y_pred)
    r2_score_of_models.append(r2)

# Calculate the adjusted R²

    n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
    p = X_test_data.shape[1]  # Number of features in the model
    adjusted_r2_score.append(1 - (1 - r2) * (n - 1) / (n - p - 1))
    mse.append(mean_squared_error(y_test, y_pred))
print(r2_score_of_models)
print(adjusted_r2_score)
print(mse)


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


[0.8915319920524694, 0.8890182559450004, 0.8094233922369708, 0.8805981099194479, 0.8929473424376737, 0.7496123295499876]
[0.8710831053082628, 0.8680954681313529, 0.7734950153636129, 0.8580879175272127, 0.8727652840447762, 0.7024080965962967]
[0.014852771905645037, 0.015196983528423996, 0.02609608989062562, 0.016349973342621123, 0.014658964746872692, 0.03428615522264247]


In [13]:
data = {'Models': list_of_models, 'Adjusted_R2_Score': adjusted_r2_score, 'R2_Score': r2_score_of_models , 'Mean_Squared_Error': mse}
performance_metrics = pd.DataFrame.from_dict(data)
performance_metrics.set_index('Models', inplace = True)
performance_metrics

Unnamed: 0_level_0,Adjusted_R2_Score,R2_Score,Mean_Squared_Error
Models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Linear Regression,0.871083,0.891532,0.014853
Ridge Regression,0.868095,0.889018,0.015197
Polynomial Regression,0.773495,0.809423,0.026096
SVR,0.858088,0.880598,0.01635
Random Forrest Regressor,0.872765,0.892947,0.014659
Decision Tree Regressor,0.702408,0.749612,0.034286
