In [1]:
import os

In [2]:
from pathlib import Path

In [3]:
from dataclasses import dataclass

In [4]:
@dataclass
class DataIngestionConfig:
    """Settings for the data-ingestion stage.

    Built by ``ConfigurationManager.get_data_ingestion_config`` from the
    ``data_ingestion`` section of the loaded config.
    """

    # Stage directory; created before this object is built.
    root_dir: Path
    # URL handed to ``urlretrieve`` by ``DataIngestion.download_file``.
    source_url: str
    # Download destination; an existing file here skips the download.
    local_data_file: Path
    # Not referenced by the code in this notebook.
    unzip_dir: Path

In [5]:
@dataclass
class DataTransformConfig:
    """Settings for the train/test split stage.

    Built by ``ConfigurationManager.get_data_transform`` from the
    ``data_transform`` section of the loaded config.
    """

    # Stage directory; created before this object is built.
    root_dir: Path
    # CSV that ``DataTransform.extract_split`` reads.
    local_data_file: Path
    # Where the training split is saved.
    train_path: Path
    # Where the test split is saved.
    test_path: Path

In [6]:
@dataclass
class DataCleaningConfing:
    """Settings for the data-cleaning stage.

    Built by ``ConfigurationManager.get_data_clean`` from the
    ``data_cleaning`` section of the loaded config. The class name is spelled
    this way throughout the notebook, so it is kept as-is.
    """

    # Stage directory; created before this object is built.
    root_dir: Path
    # Split CSVs handed to the cleaning stage.
    train_path: Path
    test_path: Path
    # Where ``DataCleaning.clean`` saves the cleaned frames.
    cleaned_train_path: Path
    cleaned_test_path: Path

In [7]:
@dataclass
class ModelTrainingConfig:
    """Settings for the model-training stage.

    Built by ``ConfigurationManager.get_model_training`` from the
    ``model_training`` section of the loaded config.
    """

    # Stage directory; created before this object is built.
    root_dir: Path
    # Cleaned CSVs read by ``ModelTraining.data_preparation``.
    cleaned_train_path: Path
    cleaned_test_path: Path
    # File the fitted estimator is pickled to.
    model: Path

In [8]:
@dataclass
class ModelValidationConfig:
    """Settings for the model-validation stage.

    Built by ``ConfigurationManager.get_model_evaluation`` from the
    ``model_validation`` section of the loaded config.
    """

    # Stage directory; created before this object is built.
    root_dir: Path
    # Cleaned test CSV that predictions are made on.
    cleaned_test_path: Path
    # Pickled estimator to load.
    model: Path
    # Destination of the metric table.
    scores: Path
    # Destinations of the two residual figures.
    ResidualDistribution: Path
    ResidualsVSPredictions: Path
    

In [9]:
# Step one directory up before importing the `src` package in the next cell.
# NOTE(review): this is relative to wherever the kernel started, so re-running
# the cell moves up again — presumably the notebook sits one level below the
# project root; confirm.
import os
os.chdir('..')


In [10]:
from src.datascience.constants import *
from src.datascience.utils.common import *

In [11]:
# NOTE(review): hard-coded absolute path to one machine's checkout; this cell
# fails anywhere else. Consider deriving the directory from a configurable root.
os.chdir('C:/Users/bored/Music/My_own/research')

In [12]:
%pwd

'C:\\Users\\bored\\Music\\My_own\\research'

In [13]:
# Local handle on the config path, used by the inspection cells further down.
config_filepath=CONFIG_FILE_PATH

In [14]:
# Sanity check: show the absolute location the config path resolves to.
print(config_filepath.absolute())

C:\Users\bored\Music\My_own\config\config.yaml


In [15]:
class ConfigurationManager:
    """Reads the project's YAML files and builds one config object per stage.

    The constructor loads the config, schema and params files and creates the
    artifacts root. Each ``get_*`` method creates its stage's ``root_dir`` and
    returns the matching dataclass filled from that section of the config.
    Sections are read with attribute access, so ``read_yaml`` is expected to
    return an object that supports it.
    """

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        # Debug trace of the schema location (kept: cell outputs rely on it).
        print(schema_filepath)

        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)

        crate_directories([self.config.artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Return the ingestion settings after creating the stage directory."""
        section = self.config.data_ingestion
        crate_directories([section.root_dir])
        return DataIngestionConfig(
            root_dir=section.root_dir,
            source_url=section.source_url,
            local_data_file=section.local_data_file,
            unzip_dir=section.unzip_dir,
        )

    def get_data_transform(self) -> DataTransformConfig:
        """Return the split settings after creating the stage directory."""
        section = self.config.data_transform
        crate_directories([section.root_dir])
        return DataTransformConfig(
            root_dir=section.root_dir,
            local_data_file=section.local_data_file,
            train_path=section.train_path,
            test_path=section.test_path,
        )

    def get_data_clean(self) -> DataCleaningConfing:
        """Return the cleaning settings after creating the stage directory."""
        section = self.config.data_cleaning
        crate_directories([section.root_dir])
        return DataCleaningConfing(
            root_dir=section.root_dir,
            train_path=section.train_path,
            test_path=section.test_path,
            cleaned_train_path=section.cleaned_train_path,
            cleaned_test_path=section.cleaned_test_path,
        )

    def get_model_training(self) -> ModelTrainingConfig:
        """Return the training settings after creating the stage directory."""
        section = self.config.model_training
        crate_directories([section.root_dir])
        return ModelTrainingConfig(
            root_dir=section.root_dir,
            cleaned_train_path=section.cleaned_train_path,
            cleaned_test_path=section.cleaned_test_path,
            model=section.model,
        )

    def get_model_evaluation(self) -> ModelValidationConfig:
        """Return the validation settings after creating the stage directory.

        The progress prints are debug traces left in place on purpose; they
        appear in the recorded cell output.
        """
        print('Hitting the get model_evaluation')
        section = self.config.model_validation
        print('got model_validation')
        crate_directories([section.root_dir])
        print('creatig')
        return ModelValidationConfig(
            root_dir=section.root_dir,
            scores=section.scores,
            cleaned_test_path=section.cleaned_test_path,
            model=section.model,
            ResidualDistribution=section.ResidualDistribution,
            ResidualsVSPredictions=section.ResidualsVSPredictions,
        )

In [16]:
import os
import urllib.request as request
from src.datascience.basicConfig import logger

In [17]:
import zipfile

In [18]:
# Switch to the project root so the relative `artifacts/...` paths used by the
# stages below resolve there.
# NOTE(review): hard-coded absolute path to one machine's checkout; not portable.
os.chdir('C:\\Users\\bored\\Music\\MY_OWN')

In [19]:
class DataIngestion:
    """Downloads the raw data file described by a ``DataIngestionConfig``."""

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self):
        """Fetch ``source_url`` into ``local_data_file`` unless it already exists.

        Either outcome is reported through ``logger.info``; nothing is returned.
        """
        destination = self.config.local_data_file

        # Guard clause: an existing file means there is nothing to download.
        if os.path.exists(destination):
            logger.info('File already exists')
            return

        filename, header = request.urlretrieve(
            url=self.config.source_url,
            filename=destination,
        )
        logger.info(f'{filename} downloaded! with following info\n{header}')


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
class DataTransform:
    """Splits the ingested CSV into train and test files."""

    def __init__(self, config: DataTransformConfig):
        self.config = config

    def extract_split(self):
        """Read ``local_data_file``, split it 80/20 and save both parts.

        The source path and the two resulting frames are kept on the instance
        as ``path``, ``train`` and ``test``. The split is seeded
        (``random_state=42``) so it is reproducible.
        """
        self.path = self.config.local_data_file
        full_frame = pd.read_csv(self.path)

        train_part, test_part = train_test_split(
            full_frame,
            train_size=0.8,
            random_state=42,
        )
        self.train = train_part
        self.test = test_part

        save_file(self.train, self.config.train_path)
        save_file(self.test, self.config.test_path)

In [22]:
class DataCleaning:
    """Removes the leftover ``'Unnamed: 0'`` column from the train/test splits."""

    def __init__(self, config: "DataCleaningConfing"):
        self.config = config

    @staticmethod
    def _load_without_index_column(path):
        """Read the CSV at ``path`` and return it without ``'Unnamed: 0'``.

        Raises ``KeyError`` if the column is absent, matching a plain
        ``DataFrame.drop``.
        """
        # 'Unnamed: 0' is the header pandas gives an unnamed first column —
        # presumably the index written out by the split stage; confirm.
        return (
            pd.read_csv(path)
            .reset_index(drop=True)
            .drop(columns=['Unnamed: 0'])
        )

    def clean(self):
        """Clean both splits and save them to the ``cleaned_*`` paths.

        The cleaned frames are kept on the instance as ``test_data`` and
        ``train_data``.
        """
        self.test_data = self._load_without_index_column(self.config.test_path)
        # Each split is read from its own path: the training frame must come
        # from train_path, otherwise the model would be fitted on test rows.
        self.train_data = self._load_without_index_column(self.config.train_path)

        save_file(self.test_data, self.config.cleaned_test_path)
        save_file(self.train_data, self.config.cleaned_train_path)



In [23]:
import numpy as np

In [24]:
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold


In [25]:
from sklearn.model_selection import RandomizedSearchCV


In [26]:
import pickle

In [27]:
class ModelTraining:
    """Fits a scaled XGBoost regressor on log-transformed prices.

    Call ``data_preparation`` first; ``model_training`` reads the attributes
    it sets.
    """

    def __init__(self, config: ModelTrainingConfig):
        self.config = config

    def data_preparation(self):
        """Load the cleaned splits and separate features from the target.

        The target column is ``'price'``; ``log1p`` copies of both targets are
        stored alongside the raw ones.
        """
        train_frame = pd.read_csv(self.config.cleaned_train_path)
        test_frame = pd.read_csv(self.config.cleaned_test_path)
        self.train_data = train_frame
        self.test_data = test_frame

        self.X_train = train_frame.drop(columns=['price'])
        self.y_train = train_frame['price']
        self.x_test = test_frame.drop(columns=['price'])
        self.y_test = test_frame['price']

        self.y_train_log = np.log1p(self.y_train)
        self.y_test_log = np.log1p(self.y_test)

    def model_training(self):
        """Run a randomized hyper-parameter search and pickle the best pipeline.

        The search fits on the log-scale target, so the saved estimator
        predicts ``log1p(price)``.
        """
        steps = [
            ('scaler', StandardScaler()),
            ('regressor', XGBRegressor()),
        ]
        estimator_pipeline = Pipeline(steps)

        # Keys use the `<step>__<param>` form so they reach the regressor step.
        search_space = {
            'regressor__n_estimators': [50, 100, 200],
            'regressor__learning_rate': [0.01, 0.1, 0.2],
            'regressor__max_depth': [3, 5, 7],
            'regressor__subsample': [0.6, 0.8, 1.0],
            'regressor__colsample_bytree': [0.6, 0.8, 1.0],
            'regressor__reg_alpha': [0, 0.1, 1],
            'regressor__reg_lambda': [1, 5, 10],
        }

        search = RandomizedSearchCV(
            estimator=estimator_pipeline,
            param_distributions=search_space,
            n_iter=20,
            cv=5,
            scoring='r2',
            random_state=42,
            n_jobs=-1,
        )
        search.fit(self.X_train, self.y_train_log)

        with open(self.config.model, 'wb') as model_file:
            pickle.dump(search.best_estimator_, model_file)



In [28]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import median_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_log_error
import seaborn as sns
import matplotlib.pyplot as plt

In [46]:
class ModelValidation:
    """Scores the pickled model on the cleaned test split and saves diagnostics.

    Call order matters: ``model_load`` -> ``predict`` -> ``scores`` / ``plots``.
    Each step stores on the instance what the next one reads.
    """

    def __init__(self, config: ModelValidationConfig):
        self.config = config

    def model_load(self):
        """Unpickle the estimator at ``config.model`` into ``self.loaded_model``."""
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only point config.model at artifacts produced by this pipeline.
        with open(self.config.model, 'rb') as file:
            self.loaded_model = pickle.load(file)

    def predict(self):
        """Predict on the cleaned test CSV and store results on the instance.

        Sets ``x_test``, ``y_test``, ``y_pred_log`` and ``y_pred``.
        """
        data = pd.read_csv(self.config.cleaned_test_path)
        self.x_test = data.drop(columns=['price'])
        self.y_test = data['price']
        self.y_pred_log = self.loaded_model.predict(self.x_test)
        # Training fits on log1p(price), so predictions are mapped back with expm1.
        self.y_pred = np.expm1(self.y_pred_log)

    def scores(self):
        """Compute regression metrics and save them as a Metric/Value table.

        The table is kept on ``self.results`` and written to ``config.scores``.
        """
        mae = mean_absolute_error(self.y_test, self.y_pred)
        mse = mean_squared_error(self.y_test, self.y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(self.y_test, self.y_pred)
        mape = np.mean(np.abs((self.y_test - self.y_pred) / self.y_test)) * 100
        smape = np.mean(2 * np.abs(self.y_test - self.y_pred) / (np.abs(self.y_test) + np.abs(self.y_pred))) * 100
        median_ae = median_absolute_error(self.y_test, self.y_pred)
        explained_variance = explained_variance_score(self.y_test, self.y_pred)
        max_error = np.max(np.abs(self.y_test - self.y_pred))

        # Adjusted R² penalises R² by the number of predictors k for n samples.
        n = len(self.y_test)
        k = self.x_test.shape[1]
        adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

        msle = mean_squared_log_error(self.y_test, self.y_pred)

        results = {
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse,
            "R²": r2,
            "Adjusted R²": adjusted_r2,
            "MAPE": mape,
            "SMAPE": smape,
            "Median AE": median_ae,
            "Explained Variance": explained_variance,
            "Max Error": max_error,
            "MSLE": msle,
        }
        self.results = pd.DataFrame(list(results.items()), columns=["Metric", "Value"])
        save_file(self.results, self.config.scores)

        print('scores is successful')

    def plots(self):
        """Save the residual histogram and the residuals-vs-predictions scatter."""
        residuals = self.y_test - self.y_pred

        # Residual Distribution Plot
        plt.figure()
        sns.histplot(residuals, kde=True)
        plt.title("Residual Distribution")
        plt.xlabel("Residuals")
        plt.ylabel("Frequency")
        plt.grid()
        plt.savefig(self.config.ResidualDistribution, dpi=300)
        plt.close()  # release the figure once it is on disk

        # Residuals vs Predictions Plot
        plt.figure()
        plt.scatter(self.y_pred, residuals, alpha=0.6)
        plt.axhline(0, color='red', linestyle='--')  # zero-error reference line
        plt.title("Residuals vs. Predictions")
        plt.xlabel("Predicted Values")
        plt.ylabel("Residuals")
        plt.grid()
        plt.savefig(self.config.ResidualsVSPredictions, dpi=300)
        plt.close()








    




In [30]:
# Stage 1: download the raw data file.
# Errors propagate unchanged; the previous `except Exception as e: raise e`
# wrapper handled nothing and only added a traceback frame.
config=ConfigurationManager()
data_ingestion_config=config.get_data_ingestion_config()
data_ingestion=DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()

C:\Users\bored\Music\My_own\schema.yaml
[2024-12-03 18:49:28,736 : INFO : common : YAML file C:\Users\bored\Music\My_own\config\config.yaml is loaded safely]
[2024-12-03 18:49:28,739 : INFO : common : YAML file C:\Users\bored\Music\My_own\schema.yaml is loaded safely]
[2024-12-03 18:49:28,742 : INFO : common : YAML file C:\Users\bored\Music\My_own\params.yaml is loaded safely]
[2024-12-03 18:49:28,743 : INFO : common : Created directory artifacts]
[2024-12-03 18:49:28,745 : INFO : common : Created directory artifacts/data_ingestion]
[2024-12-03 18:49:29,877 : INFO : 1387786406 : artifacts/data_ingestion/data.csv downloaded! with following info
Connection: close
Content-Length: 65642
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: text/plain; charset=utf-8
ETag: "7a3b21963b623a3d9ec3c3d18549b285a9610e60ac27f90a49b1c0138b1ee0f0"
Strict-Transport-Security: max-age=31536000
X-Content-Type-Options: nosniff
X-Frame-Opti

In [31]:
# Stage 2: split the ingested CSV into train and test files.
# Errors propagate unchanged; no handler is needed just to re-raise them.
config=ConfigurationManager()
data_transform_config=config.get_data_transform()
data_transform=DataTransform(config=data_transform_config)
data_transform.extract_split()


C:\Users\bored\Music\My_own\schema.yaml
[2024-12-03 18:49:29,901 : INFO : common : YAML file C:\Users\bored\Music\My_own\config\config.yaml is loaded safely]
[2024-12-03 18:49:29,907 : INFO : common : YAML file C:\Users\bored\Music\My_own\schema.yaml is loaded safely]
[2024-12-03 18:49:29,911 : INFO : common : YAML file C:\Users\bored\Music\My_own\params.yaml is loaded safely]
[2024-12-03 18:49:29,913 : INFO : common : Created directory artifacts]
[2024-12-03 18:49:29,915 : INFO : common : Created directory artifacts/data_transform]
[2024-12-03 18:49:29,943 : INFO : common : File saved successfully at artifacts/data_transform/train.csv]
[2024-12-03 18:49:29,950 : INFO : common : File saved successfully at artifacts/data_transform/test.csv]


In [32]:
# Stage 3: clean the train/test splits.
# Variables are named after the cleaning stage rather than reusing the
# transform-stage names. Errors propagate unchanged.
config=ConfigurationManager()
data_cleaning_config=config.get_data_clean()
data_cleaning=DataCleaning(config=data_cleaning_config)
data_cleaning.clean()

C:\Users\bored\Music\My_own\schema.yaml
[2024-12-03 18:49:29,977 : INFO : common : YAML file C:\Users\bored\Music\My_own\config\config.yaml is loaded safely]
[2024-12-03 18:49:29,978 : INFO : common : YAML file C:\Users\bored\Music\My_own\schema.yaml is loaded safely]
[2024-12-03 18:49:29,982 : INFO : common : YAML file C:\Users\bored\Music\My_own\params.yaml is loaded safely]
[2024-12-03 18:49:29,985 : INFO : common : Created directory artifacts]
[2024-12-03 18:49:29,986 : INFO : common : Created directory artifacts/data_clean]
[2024-12-03 18:49:30,005 : INFO : common : File saved successfully at artifacts/data_clean/test.csv]
[2024-12-03 18:49:30,008 : INFO : common : File saved successfully at artifacts/data_clean/train.csv]


In [33]:

# Stage 4: prepare the data, then search for and pickle the best model.
# Errors propagate unchanged; no handler is needed just to re-raise them.
config=ConfigurationManager()
model_training_config=config.get_model_training()
model_trainingg=ModelTraining(config=model_training_config)
model_trainingg.data_preparation()
model_trainingg.model_training()

C:\Users\bored\Music\My_own\schema.yaml
[2024-12-03 18:49:30,044 : INFO : common : YAML file C:\Users\bored\Music\My_own\config\config.yaml is loaded safely]
[2024-12-03 18:49:30,051 : INFO : common : YAML file C:\Users\bored\Music\My_own\schema.yaml is loaded safely]
[2024-12-03 18:49:30,072 : INFO : common : YAML file C:\Users\bored\Music\My_own\params.yaml is loaded safely]
[2024-12-03 18:49:30,084 : INFO : common : Created directory artifacts]
[2024-12-03 18:49:30,088 : INFO : common : Created directory artifacts/model_train]


In [47]:
# Stage 5: load the model, predict on the test split, then score and plot.
# The progress prints are kept so the recorded output still matches. Errors
# propagate unchanged; no handler is needed just to re-raise them.
config=ConfigurationManager()
model_evaluation_config=config.get_model_evaluation()
model_evaluationn=ModelValidation(config=model_evaluation_config)
model_evaluationn.model_load()
print('success1')
model_evaluationn.predict()
print('success2')
model_evaluationn.scores()
print('success3')
model_evaluationn.plots()

C:\Users\bored\Music\My_own\schema.yaml
[2024-12-03 18:52:42,275 : INFO : common : YAML file C:\Users\bored\Music\My_own\config\config.yaml is loaded safely]
[2024-12-03 18:52:42,278 : INFO : common : YAML file C:\Users\bored\Music\My_own\schema.yaml is loaded safely]
[2024-12-03 18:52:42,280 : INFO : common : YAML file C:\Users\bored\Music\My_own\params.yaml is loaded safely]
[2024-12-03 18:52:42,284 : INFO : common : Created directory artifacts]
Hitting the get model_evaluation
got model_validation
[2024-12-03 18:52:42,288 : INFO : common : Created directory artifacts/model_validation]
creatig
success1
success2
[2024-12-03 18:52:42,316 : INFO : common : File saved successfully at artifacts/model_validation/scores.csv]
scores is successful
success3


In [35]:
# Ad-hoc inspection: load the raw config file directly.
# NOTE(review): this rebinds `config`, which the pipeline cells use for a
# ConfigurationManager instance.
config=read_yaml(config_filepath)

[2024-12-03 18:49:35,558 : INFO : common : YAML file C:\Users\bored\Music\My_own\config\config.yaml is loaded safely]


In [36]:
# Pull out the model_validation section for the check in the next cell.
m=config.model_validation

In [37]:
# Confirm which model file the validation stage will load.
print(m.model)

artifacts/model_train/model.pkl


In [38]:
# Dump the whole model_validation section to check which keys it defines.
print("model_validation:", config.get("model_validation"))


model_validation: {'root_dir': 'artifacts/model_validation', 'cleaned_test_path': 'artifacts/data_clean/test.csv', 'model': 'artifacts/model_train/model.pkl', 'ResidualDistribution': 'artifacts/model_validation/ResidualDistribution.png', 'ResidualsVSPredictions': 'artifacts/model_validation/ResidualsVSPredictions.png', 'FeatureImportance': 'artifacts/model_validation/FeatureImportance.png', 'LearningCurve': 'artifacts/model_validation/LearningCurve.png', 'scores': 'artifacts/model_validation/scores.csv'}
