In [1]:
import os

In [2]:
%pwd

'c:\\Users\\harik\\OneDrive\\Desktop\\HARIKRISHNAN_DETAILS\\Real_Estate_Predictor_Web_App\\Real_Estate_Price_Predictor_Web_App\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\harik\\OneDrive\\Desktop\\HARIKRISHNAN_DETAILS\\Real_Estate_Predictor_Web_App\\Real_Estate_Price_Predictor_Web_App'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class FeatureSelectionConfig:
    root_dir:Path
    transformed_data_file:Path
    X_train_data_file:Path
    X_test_data_file:Path
    Y_train_data_file:Path
    Y_test_data_file:Path
    params_target_label:str
    params_Id_column: list
    params_alpha_for_lasso: int
    params_random_state_for_lasso:int
    params_percentile_for_mutual_info:int
    params_test_size:int
    params_random_state_for_train_test_split:int

@dataclass(frozen=True)
class FeatureScalingConfig:
    root_dir:Path
    X_train_data_file:Path
    X_test_data_file:Path
    X_train_scaled_data_file:Path
    X_test_scaled_data_file:Path
    feature_scaling_model:Path


In [6]:
from real_estate_price_predictor.constants import *
from real_estate_price_predictor.utils.common import read_yaml, create_directories, save_object

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_feature_selection(self) -> FeatureSelectionConfig:
        config = self.config.feature_selection

        create_directories([config.root_dir])

        feature_selection_config = FeatureSelectionConfig(
            root_dir=config.root_dir,
            transformed_data_file = config.transformed_data_file,
            X_train_data_file = config.X_train_data_file,
            X_test_data_file =  config.X_test_data_file,
            Y_train_data_file= config.Y_train_data_file,
            Y_test_data_file = config.Y_test_data_file,
            params_target_label = self.params.target_label,
            params_Id_column = self.params.Id_column,
            params_alpha_for_lasso = self.params.alpha_for_lasso,
            params_random_state_for_lasso = self.params.random_state_for_lasso,
            params_percentile_for_mutual_info = self.params.percentile_for_mutual_info,
            params_test_size = self.params.test_size,
            params_random_state_for_train_test_split = self.params.random_state_for_train_test_split
            
        )

        return feature_selection_config
    
    def get_feature_scaling(self) -> FeatureScalingConfig:
        config = self.config.feature_scaling

        create_directories([config.root_dir])

        feature_scaling_config = FeatureScalingConfig(
            root_dir=config.root_dir,
            X_train_data_file = config.X_train_data_file,
            X_test_data_file = config.X_test_data_file,
            X_train_scaled_data_file = config.X_train_scaled_data_file,
            X_test_scaled_data_file =  config.X_test_scaled_data_file,
            feature_scaling_model = config.feature_scaling_model
        )

        return feature_scaling_config

In [8]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from real_estate_price_predictor import logger
import pandas as pd

In [10]:
class FeatureSelectionAndTrainTestSplit:
    
    def __init__(self,config=FeatureSelectionConfig):
        self.config = config

    def lasso_feature_selection(self):
        if os.path.exists(self.config.transformed_data_file):
            dataset = pd.read_csv(self.config.transformed_data_file)
            X = dataset.drop([self.config.params_Id_column[0],self.config.params_target_label,],axis=1)
            Y = dataset[[self.config.params_target_label]]
            feature_sel_model = SelectFromModel(Lasso(alpha=self.config.params_alpha_for_lasso, random_state=self.config.params_random_state_for_lasso)) # remember to set the seed, the random state in this function
            feature_sel_model.fit(X, Y)
            selected_feat = X.columns[(feature_sel_model.get_support())]
            return X[selected_feat]
        else: logger.info(f">>>>>> transformation data file is not present <<<<<<")
        
    
    def mutual_information_feature_selection(self):
        if os.path.exists(self.config.transformed_data_file):
            dataset = pd.read_csv(self.config.transformed_data_file)
            X = dataset.drop([self.config.params_Id_column[0],self.config.params_target_label],axis=1)
            Y = dataset[[self.config.params_target_label]]
            selected_top_columns = SelectPercentile(mutual_info_regression, percentile=20)
            selected_top_columns.fit(X, Y)
            selected_feature = X.columns[selected_top_columns.get_support()]
            return X[selected_feature]
        else: logger.info(f">>>>>> transformation data file is not present <<<<<<")

    def test_train_split(self,dataset:pd.DataFrame):
        X = dataset
        if os.path.exists(self.config.transformed_data_file):
            dataset = pd.read_csv(self.config.transformed_data_file)
            Y = dataset[[self.config.params_target_label]]
            X_train,X_test,y_train,y_test =train_test_split(X,Y,test_size=self.config.params_test_size,random_state=self.config.params_random_state_for_train_test_split)
            X_train = pd.DataFrame(X_train,columns=X.columns)
            X_test = pd.DataFrame(X_test,columns=X.columns)
            X_train.to_csv(self.config.X_train_data_file,index=False)
            X_test.to_csv(self.config.X_test_data_file,index=False)
            y_train = pd.DataFrame(y_train,columns=Y.columns)
            y_test = pd.DataFrame(y_test,columns=Y.columns)
            y_train.to_csv(self.config.Y_train_data_file,index=False)
            y_test.to_csv(self.config.Y_test_data_file,index=False)
        else: logger.info(f">>>>>> transformation data file is not present <<<<<<")

class FeatureScaling():
    def __init__(self,config = FeatureScalingConfig):
        self.config =  config
    
    def read_csv_file(self,path:Path):
        dataset = pd.read_csv(path)
        return dataset 
    
    def min_max_scaler(self):
        X_train = self.read_csv_file(self.config.X_train_data_file)
        X_test = self.read_csv_file(self.config.X_test_data_file)
        scaler=MinMaxScaler()
        scaler.fit(X_train)
        if (os.path.exists(self.config.X_train_data_file)) and (os.path.exists(self.config.X_test_data_file) ):
            X_train_data = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
            X_test_data = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
            X_train_data.to_csv(self.config.X_train_scaled_data_file,index=False)
            X_test_data.to_csv(self.config.X_test_scaled_data_file,index=False)
            save_object(self.config.feature_scaling_model,scaler)
        
        
        

    

In [None]:
try:
    config = ConfigurationManager()
    feature_selection_config = config.get_feature_selection()
    feature_selection = FeatureSelectionAndTrainTestSplit(config=feature_selection_config)
    dataset = feature_selection.lasso_feature_selection()
    print(dataset.columns)
    print(len(dataset.columns))
    feature_selection.test_train_split(dataset)
except Exception as e:
    raise e

In [11]:
try:
    config = ConfigurationManager()
    feature_scaling_config = config.get_feature_scaling()
    feature_scaling = FeatureScaling(config=feature_scaling_config)
    feature_scaling.min_max_scaler()
except Exception as e:
    raise e

[2024-10-06 16:15:30,457: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-10-06 16:15:30,467: INFO: common: yaml file: params.yaml loaded successfully]
[2024-10-06 16:15:30,469: INFO: common: created directory at: artifacts]
[2024-10-06 16:15:30,471: INFO: common: created directory at: artifacts/train_test_data_scaled]


In [None]:
empty_df = pd.DataFrame()
print(empty_df.isnull().all().all())

In [13]:

# from sklearn.preprocessing import MinMaxScaler
# X_train = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data\X_train.csv')
# X_test = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data\X_test.csv')
# y_train = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data\Y_train.csv')
# y_test = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data\Y_test.csv')
# scaler=MinMaxScaler()
# scaler.fit(X_train)
# X_train_data = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
# X_test_data = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# if dataset.isnull().all().all():
# else: 
            # dataframe = pd.DataFrame(scaler.transform(dataset), columns=dataset.columns)
            # return dataframe

In [12]:
X_test_data = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data_scaled\X_test_scaled.csv')
X_train_data  = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data_scaled\X_train_scaled.csv')
y_train = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data\Y_train.csv')
y_test = pd.read_csv(r'C:\Users\harik\OneDrive\Desktop\HARIKRISHNAN_DETAILS\Real_Estate_Predictor_Web_App\Real_Estate_Price_Predictor_Web_App\artifacts\train_test_data\Y_test.csv')


In [13]:
# X_train_data.shape
X_test_data.shape

(146, 23)

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
list_of_models = ['Linear Regression', 'Ridge Regression', 'Polynomial Regression', 'SVR', 'Random Forrest Regressor', 'Decision Tree Regressor','AdaBoost Regressor','Gradient Boosting Regressor','XGBRegressor' ]
r2_score_of_models=[]
adjusted_r2_score =[]
mse=[]
for i  in list_of_models:
    if i == 'Linear Regression':
        model = LinearRegression()
    elif i == 'Ridge Regression':
        model = Ridge()
    elif i == 'Polynomial Regression':
        model = Pipeline([('poly', PolynomialFeatures(degree=2)),('linear_model', LinearRegression())])
    elif i == 'SVR':
        model = SVR(kernel='rbf', C=1.0)
    elif i == 'Random Forrest Regressor':
        model = RandomForestRegressor(n_estimators=100)
    elif i == 'AdaBoost Regressor':
        model = AdaBoostRegressor()
    elif i == 'Gradient Boosting Regressor':
        model = GradientBoostingRegressor()
    elif i == 'XGBRegressor':
        model = XGBRegressor()
    else:
        model = DecisionTreeRegressor(max_depth=5)
# Train the model on the training data

    model.fit(X_train_data, y_train)

# Make predictions on the testing data

    y_pred = model.predict(X_test_data)

# Evaluate the model performance (e.g., R-squared, Mean Squared Error)
    
    r2 = r2_score(y_test, y_pred)
    r2_score_of_models.append(r2)

# Calculate the adjusted R²

    n = X_test_data.shape[0]  # Number of observations (samples) in the testing set
    p = X_test_data.shape[1]  # Number of features in the model
    adjusted_r2_score.append(1 - (1 - r2) * (n - 1) / (n - p - 1))
    mse.append(mean_squared_error(y_test, y_pred))
print(r2_score_of_models)
print(adjusted_r2_score)
print(mse)


  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


[0.8915319920524694, 0.8890182559450005, 0.8094233922369707, 0.8805981099194473, 0.8930781403609384, 0.7496123295499891, 0.7908597767658794, 0.8849228501059885, 0.8649128675460815]
[0.8710831053082628, 0.8680954681313531, 0.7734950153636126, 0.858087917527212, 0.8729207405929185, 0.7024080965962985, 0.751431701893873, 0.8632279775849863, 0.8394456212637854]
[0.014852771905645039, 0.015196983528423988, 0.026096089890625635, 0.01634997334262119, 0.01464105428869481, 0.03428615522264225, 0.02863804812839099, 0.01575777679769206, 0.018497792256235945]


In [15]:
data = {'Models': list_of_models, 'Adjusted_R2_Score': adjusted_r2_score, 'R2_Score': r2_score_of_models , 'Mean_Squared_Error': mse}
performance_metrics = pd.DataFrame.from_dict(data)
performance_metrics.set_index('Models', inplace = False)
performance_metrics

Unnamed: 0,Models,Adjusted_R2_Score,R2_Score,Mean_Squared_Error
0,Linear Regression,0.871083,0.891532,0.014853
1,Ridge Regression,0.868095,0.889018,0.015197
2,Polynomial Regression,0.773495,0.809423,0.026096
3,SVR,0.858088,0.880598,0.01635
4,Random Forrest Regressor,0.872921,0.893078,0.014641
5,Decision Tree Regressor,0.702408,0.749612,0.034286
6,AdaBoost Regressor,0.751432,0.79086,0.028638
7,Gradient Boosting Regressor,0.863228,0.884923,0.015758
8,XGBRegressor,0.839446,0.864913,0.018498


In [None]:
performance_metrics.columns