In [2]:
# pip install ipykernel
# namedtuple: immutable, so the stored info can't be altered, and each element has a name,
# so there is no need to remember which index holds which value

from collections import namedtuple

### Defining Data Ingestion Configuration

In [5]:
# Input to Data Ingestion Component ie source of i/p data
"""
dataset_download_url: download location/url from where I will download data
tgz_download_dir    : Location (Folder) where I will download the compress file
raw_data_dir        : Location (Folder) where I will extract the file
ingested_train_dir  : Train dataset folder
ingested_test_dir   : Test dataset folder
"""

# Immutable record holding the five data-ingestion settings, one named field each.
DataIngestionConfig = namedtuple(
    "DataIngestionConfig",
    (
        "dataset_download_url",
        "tgz_download_dir",
        "raw_data_dir",
        "ingested_train_dir",
        "ingested_test_dir",
    ),
)



In [16]:
# Build a sample config from placeholder values to try out the namedtuple API.
# Values are matched to fields by name, so their order here does not matter.
data_ingestion_config = DataIngestionConfig(**{
    "dataset_download_url": "down",
    "tgz_download_dir": "tgz",
    "raw_data_dir": "raw",
    "ingested_train_dir": "train",
    "ingested_test_dir": "test",
})

# Fields are read by name; as the cell's last expression this value is displayed.
data_ingestion_config.dataset_download_url


'down'

### DataValidationConfig

In [21]:
# Column-level validation needs the MDM (schema) file that describes the columns,
# so this config carries only the path to that file.
DataValidationConfig = namedtuple(
    "DataValidationConfig",
    ("schema_file_path",),
)


### DataTransformationConfig

In [22]:
"""
transformed_train_dir: dir to store our transformed input train data (file)
transformed_test_dir: dir to store our transformed input test data (file)
preprocessed_object_file_path: location to store the transformation pickle object that we will use in the prediction pipeline
add_bedroom_per_room: column that we will create in our dataset at a later stage
"""
# Immutable record for the data-transformation settings, one named field each.
DataTransformationConfig = namedtuple(
    "DataTransformationConfig",
    (
        "add_bedroom_per_room",
        "transformed_train_dir",
        "transformed_test_dir",
        "preprocessed_object_file_path",
    ),
)

### ModelTrainerConfig

In [23]:
"""
train_model_file_path: after training the model, this specifies the path where the pickle object of our model is stored
base_accuracy: expected or threshold accuracy to accept or reject our model
"""

# Immutable record for the model-trainer settings: where the trained model is
# saved and the accuracy threshold it is judged against.
ModelTrainerConfig = namedtuple(
    "ModelTrainerConfig",
    (
        "train_model_file_path",
        "base_accuracy",
    ),
)

### ModelEvaluationConfig

In [24]:
"""
model_evaluation_file_path: file in which we store all the essential info about the model running in production
time_stamp: 
"""

# Immutable record for the model-evaluation settings: the evaluation file path
# plus a time stamp.
ModelEvaluationConfig = namedtuple(
    "ModelEvaluationConfig",
    (
        "model_evaluation_file_path",
        "time_stamp",
    ),
)

### ModelPusherConfig

In [None]:
# Immutable record for the model-pusher settings; it has a single field,
# export_dir_path.
ModelPusherConfig = namedtuple(
    "ModelPusherConfig",
    ("export_dir_path",),
)

#### Working with yaml file: pip install PyYAML

In [3]:
import yaml
import os

In [4]:
pwd()

'c:\\Users\\DJ\\Desktop\\Current Batch Project\\Housing-Prediction-Machine-Learning\\notebook'

In [21]:
# Move from the notebook folder up to the project root so that relative paths
# (such as the config folder used further down) resolve.
# NOTE: this absolute path is specific to one machine.
os.chdir("c:\\Users\\DJ\\Desktop\\Current Batch Project\\Housing-Prediction-Machine-Learning")

In [22]:
pwd

'c:\\Users\\DJ\\Desktop\\Current Batch Project\\Housing-Prediction-Machine-Learning'

In [23]:
os.listdir()

['.dockerignore',
 '.git',
 '.github',
 '.gitignore',
 'app.py',
 'config',
 'Dockerfile.heroku',
 'housing',
 'housing_predictor.egg-info',
 'info.txt',
 'LICENSE',
 'notebook',
 'README.md',
 'requirements.txt',
 'setup.py',
 'test.py']

In [24]:
# Path of the pipeline configuration file, relative to the current working
# directory and built with the platform's path separator.
config_file_path = os.path.join(
    "config",
    "config.yaml",
)
config_file_path

'config\\config.yaml'

In [25]:
os.path.exists(config_file_path)

True

In [26]:
# Parse the YAML configuration into plain Python objects. The name is bound to
# None first, so it exists even if opening or parsing the file raises.
config_info = None
with open(config_file_path, mode="rb") as config_stream:
    config_info = yaml.safe_load(config_stream)


In [27]:
config_info

{'training_pipeline_config': {'pipeline_name': 'housing',
  'artifact_dir': 'artifact'},
 'data_ingestion_config': {'dataset_download_url': 'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.tgz',
  'raw_data_dir': 'raw_data',
  'tgz_download_dir': 'tgz_data',
  'ingested_dir': 'ingested_data',
  'ingested_train_dir': 'train',
  'ingested_test_dir': 'test'},
 'data_validation_config': {'schema_dir': 'config',
  'schema_file_name': 'schema.yaml',
  'report_file_name': 'report.json',
  'report_page_file_name': 'report.html'},
 'data_transformation_config': {'add_bedroom_per_room': True,
  'transformed_dir': 'transformed_data',
  'transformed_train_dir': 'train',
  'transformed_test_dir': 'test',
  'preprocessing_dir': 'preprocessed',
  'preprocessed_object_file_name': 'preprocessed.pkl'},
 'model_trainer_config': {'trained_model_dir': 'trained_model',
  'model_file_name': 'model.pkl',
  'base_accuracy': 0.6},
 'model_evaluation_config': {'model_evaluatio

In [17]:
# Scratch cell for experimenting with the working directory.
os.chdir('c:\\Users\\DJ\\Desktop\\Current Batch Project\\Housing-Prediction-Machine-Learning\\notebook')
os.getcwd()  # result is discarded: only a cell's last expression is displayed
# NOTE(review): presumably this moves to the root of the current drive on
# Windows — confirm that is intended.
os.chdir("\\")
# NOTE(review): the output recorded for this cell shows the notebook folder, not
# the directory selected by the chdir just above, so it looks stale — re-run to confirm.
pwd

'c:\\Users\\DJ\\Desktop\\Current Batch Project\\Housing-Prediction-Machine-Learning\\notebook'

In [None]:
def read_yaml_file(file_path:str) -> dict:
    """
    Read a YAML file and return its parsed content.

    Parameters
    ----------
    file_path : str
        File path to the YAML file.

    Returns
    -------
    dict
        The parsed document, i.e. a dictionary when the file holds a
        top-level mapping.

    Raises
    ------
    OSError
        If the file cannot be opened.
    yaml.YAMLError
        If the content is not valid YAML.
    """
    # Opened in binary mode; the loader receives the raw bytes of the file.
    with open(file_path, "rb") as yaml_file:
        # Parse the open stream. Passing file_path here instead would parse the
        # path string itself as YAML and never look at the file's content.
        return yaml.safe_load(yaml_file)

In [2]:
from housing.config.configuration import Configuration

In [8]:
import os
# NOTE(review): os.chdir() requires a path argument; called bare like this it
# raises TypeError before the next line runs. The traceback recorded for this
# cell is a "No such file or directory" error for '.\\config\\config.yaml', which
# suggests the intent was to switch to the project root first — TODO confirm
# and pass that path.
os.chdir()
# The config path is relative, so it is resolved against the current working directory.
config = Configuration(config_file_path="config\\config.yaml")

HousingException: Error Occurred in script[c:\users\dj\desktop\current batch project\housing-prediction-machine-learning\housing\config\configuration.py] at line number: [23] error message: [Error Occurred in script[c:\users\dj\desktop\current batch project\housing-prediction-machine-learning\housing\util\util.py] at line number: [19] error message: [[Errno 2] No such file or directory: '.\\config\\config.yaml']]

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the housing CSV from the raw_data folder of one data-ingestion run.
# NOTE: the path is absolute, machine-specific and pinned to a single
# timestamped artifact folder.
housing_data_frame = pd.read_csv(
    filepath_or_buffer=r"C:\Users\DJ\Desktop\Current Batch Project\Housing-Prediction-Machine-Learning\housing\artifact\data_ingestion\2022-07-06-12-42-04\raw_data\housing.csv"
)

In [19]:
# Bucket median_income into five labelled bins (1-5) and store the result as a
# new income_cat column. The last bin edge is infinity, so every value above
# 6.0 lands in category 5.
housing_data_frame['income_cat'] = pd.cut(
    x=housing_data_frame['median_income'],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [20]:
housing_data_frame

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,5
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,5
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,3
...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND,2
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND,2
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND,2
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND,2


In [21]:
housing_data_frame.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity', 'income_cat'],
      dtype='object')

In [22]:
# Text name of each column's dtype, in column order, with any "dtype('...')"
# wrapper stripped from the string.
data_type = [
    str(column_dtype).replace("dtype('", "").replace("')", "")
    for column_dtype in housing_data_frame.dtypes
]

In [23]:
column = housing_data_frame.columns

In [24]:
# Pair every column name with its dtype name; the mapping is displayed as the cell output.
{column_name: dtype_name for column_name, dtype_name in zip(column, data_type)}

{'longitude': 'float64',
 'latitude': 'float64',
 'housing_median_age': 'float64',
 'total_rooms': 'float64',
 'total_bedrooms': 'float64',
 'population': 'float64',
 'households': 'float64',
 'median_income': 'float64',
 'median_house_value': 'float64',
 'ocean_proximity': 'object',
 'income_cat': 'category'}

0        8.3252
1        8.3014
2        7.2574
3        5.6431
4        3.8462
          ...  
20635    1.5603
20636    2.5568
20637    1.7000
20638    1.8672
20639    2.3886
Name: median_income, Length: 20640, dtype: float64