In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [57]:
import joblib
import json
import os

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
train_df = pd.read_csv('data_ingestion/train_df.csv')
test_df = pd.read_csv('data_ingestion/test_df.csv')

In [4]:
print(f'Train Data Size: {train_df.shape}')
print(f'Test Data Size: {test_df.shape}')

Train Data Size: (20150, 5)
Test Data Size: (8637, 5)


In [5]:
train_df.head(2)

Unnamed: 0,area_sqft,bedroom,bathroom,location,avg_rent
0,900,2,2,uttara,18000.0
1,1200,3,3,shahjahanpur,30000.0


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20150 entries, 0 to 20149
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area_sqft  20150 non-null  int64  
 1   bedroom    20150 non-null  int64  
 2   bathroom   20150 non-null  int64  
 3   location   20150 non-null  object 
 4   avg_rent   20150 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 787.2+ KB


In [7]:
X_train = train_df[['area_sqft', 'bedroom', 'bathroom', 'location']]
y_train = train_df[['avg_rent']]

X_test = test_df[['area_sqft', 'bedroom', 'bathroom', 'location']]
y_test = test_df[['avg_rent']]

In [8]:
num_columns = ['area_sqft', 'bedroom', 'bathroom']
cat_columns = ['location']

In [9]:
numerical_processor = Pipeline(
    [
       ('imputation_mean', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)

In [10]:
numerical_processor

Pipeline(steps=[('imputation_mean', SimpleImputer()),
                ('scaler', StandardScaler())])

In [11]:
categorcial_processor = Pipeline(
    [
        ('imputation_constant', SimpleImputer(strategy='constant')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [12]:
categorcial_processor

Pipeline(steps=[('imputation_constant', SimpleImputer(strategy='constant')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [13]:
preprocessor = ColumnTransformer(
    [
        ('numerical_processor', numerical_processor, num_columns),
        ('categorical_processor', categorcial_processor, cat_columns)
    ]
)

In [14]:
preprocessor

ColumnTransformer(transformers=[('numerical_processor',
                                 Pipeline(steps=[('imputation_mean',
                                                  SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['area_sqft', 'bedroom', 'bathroom']),
                                ('categorical_processor',
                                 Pipeline(steps=[('imputation_constant',
                                                  SimpleImputer(strategy='constant')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['location'])])

### Save the Preprocessor

In [55]:
# Create a directory for the preprocessor if it doesn't exist
output_dir = "preprocessor"
os.makedirs(output_dir, exist_ok=True)

# Save the entire preprocessor under the "preprocessor" folder
preprocessor_path = os.path.join(output_dir, "preprocessor.pkl")
joblib.dump(preprocessor, preprocessor_path)
print(f"Preprocessor saved as '{preprocessor_path}'")

Preprocessor saved as 'preprocessor\preprocessor.pkl'


In [15]:
# Visualize the Pipeline
from sklearn import set_config
set_config(display='diagram')

In [16]:
preprocessor

## Transform the Data

In [17]:
# Fit the preprocessor
preprocessor.fit(X_train)


In [18]:
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Modeling

In [31]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [32]:
models_dict ={
                'Linear Regression': LinearRegression(),
                'KNeighborsRegressor':KNeighborsRegressor(),
                'Random Forest': RandomForestRegressor(),
                'Gradient Boosting': GradientBoostingRegressor(),
                'AdaBoost Regressor': AdaBoostRegressor(),
                'Decision Tree': DecisionTreeRegressor(),
                'XGBRegressor': XGBRegressor()
            }
            
            # parameters for hyperparameter tuning
params = {
    "Decision Tree": {
        'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
        # 'splitter':['best','random'],
        # 'max_features':['sqrt','log2'],
    },
    "Random Forest":{
        # 'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],

        # 'max_features':['sqrt','log2',None],
        'n_estimators': [8,16,32,64,128,256]
    },
    "Gradient Boosting":{
        # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
        'learning_rate':[.1,.01,.05,.001],
        'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
        # 'criterion':['squared_error', 'friedman_mse'],
        # 'max_features':['auto','sqrt','log2'],
        'n_estimators': [8,16,32,64,128,256]
    },
    "Linear Regression":{},
    "XGBRegressor":{
        'learning_rate':[.1,.01,.05,.001],
        'n_estimators': [8,16,32,64,128,256]
    },
    "AdaBoost Regressor":{
        'learning_rate':[.1,.01,0.5,.001],
        # 'loss':['linear','square','exponential'],
        'n_estimators': [8,16,32,64,128,256]
    },
    "KNeighborsRegressor":{}          
}

In [56]:
from sklearn.model_selection import GridSearchCV

In [52]:
# Assuming you have a function evaluate_model
def evaluate_model(X_train, X_test, y_train, y_test, models_dict, params):
    best_model = None
    best_score = float('-inf')
    best_model_name = ""
    
    for model_name, model in models_dict.items():
        clf = GridSearchCV(model, params[model_name], cv=5)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        
        if score > best_score:
            best_score = score
            best_model = clf.best_estimator_
            best_model_name = model_name
    
    return best_model, best_score, best_model_name

In [53]:
# Create directory for best model
output_dir = "best_model"
os.makedirs(output_dir, exist_ok=True)

In [54]:
# Example usage
best_model, best_score, best_model_name = evaluate_model(X_train=X_train_transformed, X_test=X_test_transformed, y_train=y_train, y_test=y_test, models_dict=models_dict, params=params)

if best_score < 0.7:
    print("No best model found")
else:
    print(f'Best model: {best_model_name} with score {round(best_score * 100, 2)}%')
    print(f'Best Parameters: {best_model.get_params()}')
    
    # Save the best model
    model_path = os.path.join(output_dir, 'best_model.pkl')
    joblib.dump(best_model, model_path)
    print(f"Best model saved as '{model_path}'")
    
    # Extract and serialize the model details
    model_details = {
        "model_name": best_model_name,
        "model_accuracy": round(best_score * 100, 2),
        "model_params": {k: str(v) for k, v in best_model.get_params().items()}  # Ensure all values are strings
    }
    
    # Write model details to a text file
    details_path = os.path.join(output_dir, 'best_model_details.txt')
    with open(details_path, 'w') as file:
        file.write(json.dumps(model_details, indent=4))
    
    print(f"Best model details saved in '{details_path}'")


Best model: XGBRegressor with score 89.21%
Best Parameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'gpu_id': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 128, 'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
Best model saved as 'best_model\best_model.pkl