In [53]:
import os
# Make the project root the working directory so the relative paths used by
# the config files resolve. A raw string is required here: in a normal string
# literal "\p" is an invalid escape sequence (a SyntaxWarning on recent
# Python versions). The resulting path is unchanged.
# NOTE(review): this is a machine-specific absolute path; adjust per machine.
os.chdir(r"D:\pp\Price-Analysis-DataScience")

In [54]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""

    # Directory where the stage writes its outputs (train.csv / test.csv).
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path

In [55]:
from src.price_prediction.constants import *
from src.price_prediction.utils.common import read_yaml, create_directories



In [56]:
class Configuration:
    """Reads the project's YAML files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Load config, params and schema, then create the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        # Loaded in the same order as before: config, params, schema.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute, filepath in yaml_sources:
            setattr(self, attribute, read_yaml(filepath))

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data-transformation settings from the loaded config.

        The stage's root directory is created before the settings object is
        returned.

        Returns:
            DataTransformationConfig with `root_dir` and `data_path` taken
            from the `data_transformation` section of the config.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
        )
    
    
    

In [57]:
import os
from price_prediction import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [113]:
# Load the ingested CSV. The path is relative to the project root, which the
# first cell of this notebook sets as the working directory via os.chdir, so
# this resolves to the same file as the former absolute path without
# repeating the machine-specific prefix.
data = pd.read_csv(Path("artifacts") / "data_ingestion" / "car_prices.csv")

In [114]:
# Preview the first rows of the raw frame as loaded from the CSV.
data.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)


In [115]:
# Keep only rows whose transmission is 'automatic'. The explicit .copy()
# makes `df` an independent frame, so the column drops/assignments in the
# following cells do not raise SettingWithCopyWarning and can never write
# back into `data`.
df = data[data['transmission'] == 'automatic'].copy()

In [116]:
# Remove the `vin` column. Rebinding (instead of inplace=True) avoids
# mutating a frame derived from a filter, and errors='ignore' lets this cell
# be re-run after `vin` is already gone instead of raising KeyError.
df = df.drop(columns=['vin'], errors='ignore')

In [117]:
# Drop every row that has at least one missing value. Rebinding instead of
# inplace=True avoids an in-place mutation of a frame derived from a filter.
df = df.dropna()

In [138]:
# Preview the cleaned frame.
# NOTE(review): the saved output of this cell (execution count 138) already
# shows the `Datetime` column and no `saledate`, so it was last run after the
# date-conversion cells that appear below it; on a fresh top-to-bottom run
# the output here will still contain `saledate`.
df.head()

Unnamed: 0,year,make,model,trim,body,transmission,state,condition,odometer,color,interior,seller,mmr,sellingprice,Datetime
0,2015,Kia,Sorento,LX,SUV,automatic,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,2014-12-16
1,2015,Kia,Sorento,LX,SUV,automatic,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,2014-12-16
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,2015-01-15
3,2015,Volvo,S60,T5,Sedan,automatic,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,2015-01-29
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0,2014-12-18


In [119]:
import pandas as pd

# Split every raw `saledate` string on whitespace and keep tokens 1-5.
# For a value such as "Tue Dec 16 2014 12:30:00 GMT-0800 (PST)" that is
# month, day, year, time and the GMT offset; the leading weekday (token 0)
# and the trailing "(PST)" (token 6) are discarded.
saledate = df['saledate'].str.split(expand=True)[[1, 2, 3, 4, 5]]

# Re-join the kept tokens into one string per row, exactly as before.
saledate_text = (
    saledate[1] + ' '
    + saledate[2].astype(str) + ' '
    + saledate[3].astype(str) + ' '
    + saledate[4] + ' '
    + saledate[5]
)

# Parse as timezone-aware UTC timestamps and keep only the calendar date.
# `assign` builds a new frame in one step, replacing two successive in-place
# column assignments on a frame derived from a filter.
df = df.assign(Datetime=pd.to_datetime(saledate_text, utc=True).dt.date)

In [120]:
# The raw `saledate` text is no longer needed once `Datetime` exists.
# Rebinding avoids in-place mutation, and errors='ignore' keeps the cell
# re-runnable after the column has already been removed.
df = df.drop(columns=['saledate'], errors='ignore')

In [121]:
# One row per model year, carrying the first non-null selling price
# encountered for that year.
# NOTE(review): `.first()` depends on row order and is not an aggregate such
# as the mean - confirm this is the intended statistic.
year_df = (
    df.groupby(by="year", as_index=False)["sellingprice"]
    .first()
)

In [122]:
from plotly import express as px

In [123]:
# Bar chart of the per-year selling price taken from `year_df`.
px.bar(
    year_df,
    x='year',
    y='sellingprice',
    title='selling price vs year',
)

In [124]:
# First selling price per (seller, year) pair, ordered from the most to the
# least expensive. The sort is chained instead of being applied in place.
data_by_seller = (
    df.groupby(by=["seller", "year"], as_index=False)["sellingprice"]
    .first()
    .sort_values(by="sellingprice", ascending=False)
)

In [125]:
# Bar chart of the per-(seller, year) selling prices, plotted against year.
# NOTE(review): the title is the same as the previous chart's although the
# data is grouped by seller as well - consider a more specific title.
px.bar(
    data_by_seller,
    x='year',
    y='sellingprice',
    title='selling price vs year',
)

In [109]:
# First selling price recorded for each make, highest first.
# NOTE(review): this ranks makes by a single (first) row each, not by an
# average - confirm that is intended.
make_price = (
    df.groupby(by="make", as_index=False)["sellingprice"]
    .first()
    .sort_values(by="sellingprice", ascending=False)
)
make_price.head(10)

Unnamed: 0,make,sellingprice
12,Ferrari,154000.0
41,Rolls-Royce,149800.0
25,Lamborghini,117500.0
4,Bentley,96000.0
47,Tesla,80000.0
13,Fisker,54500.0
1,Aston Martin,51000.0
39,Porsche,46500.0
32,Mercedes-Benz,45500.0
6,Cadillac,44000.0


In [126]:
# Collapse rows that share all of these attributes into one row, keeping the
# first selling price of each group.
group_keys = ["year", "make", "transmission", "condition", "color", "odometer", "mmr"]
new_df = df.groupby(by=group_keys, as_index=False)["sellingprice"].first()

# Show the two most expensive rows of the collapsed frame.
new_df.sort_values(by="sellingprice", ascending=False).head(2)

Unnamed: 0,year,make,transmission,condition,color,odometer,mmr,sellingprice
409364,2014,Ford,automatic,43.0,green,27802.0,22800.0,230000.0
179258,2011,Ferrari,automatic,46.0,red,12116.0,182000.0,183000.0


In [127]:
# (rows, columns) of the grouped frame built in the previous cell.
new_df.shape

(455951, 8)

In [128]:
# Mean selling price per model year (computed on `new_df`), rounded to two
# decimals.
per_year_mean = new_df.groupby("year", as_index=False)["sellingprice"].mean()
yearly_mean_price = per_year_mean.round(2)

In [129]:
# Bar chart of the mean selling price for every model year.
px.bar(
    yearly_mean_price,
    x="year",
    y="sellingprice",
    title="Yearly mean price",
)

In [139]:
# Feature matrix: every column except the target and the derived sale date.
X = df.drop(columns=['sellingprice', 'Datetime'])
# Regression target.
y = df['sellingprice']

In [133]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor

In [140]:
# Column groups are picked purely by dtype: int64/float64 columns are treated
# as numeric, object columns as categorical.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (handle_unknown='ignore' is passed so categories not seen
# during fit do not raise at transform time).
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full pipeline: preprocessing followed by the gradient-boosting regressor.
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ]
)

In [141]:
from sklearn.metrics import mean_squared_error, r2_score


In [142]:
# Fit on the training split, then predict the held-out rows.
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

# Score the predictions against the held-out targets.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(y_pred)
print(f'Gradient Boosting MSE: {test_mse}')
print(f'Gradient Boosting R² score: {test_r2}')

[12674.77709045 33750.96216986 13991.52185082 ...  8821.96606105
  9960.87954789  9346.78714102]
Gradient Boosting MSE: 2253864.043296017
Gradient Boosting R² score: 0.9752197672608492


In [160]:
class DataTransformation:
    """Cleans the ingested CSV and writes train/test splits to disk."""

    def __init__(self, config):
        """Store the stage config and load the input CSV.

        Args:
            config: Object exposing `data_path` (CSV to read) and `root_dir`
                (directory the splits are written to).
        """
        self.config = config
        self.data = pd.read_csv(self.config.data_path)

    def handling_missing_values(self):
        """Return the loaded data without `vin` and without incomplete rows.

        `self.data` itself is left untouched; a new frame is returned.

        Raises:
            KeyError: If the loaded data has no `vin` column.
        """
        without_vin = self.data.drop(columns=["vin"])
        logger.info("vin column dropped")
        cleaned = without_vin.dropna()
        logger.info("Missing values handled")
        # The leftover debug print of the entire frame was removed; callers
        # that want to inspect the result can display the returned frame.
        return cleaned

    def date_time_conversion(self, df):
        """Replace the raw `saledate` text column by a `Datetime` date column.

        Whitespace-separated tokens 1-5 of `saledate` are re-joined, parsed
        as UTC timestamps, and reduced to their calendar date.

        Args:
            df: Frame containing a string column `saledate`.

        Returns:
            A new frame with `Datetime` appended and `saledate` removed. The
            frame passed in is not modified (it used to be mutated in place).
        """
        # Work on a copy so the caller's frame keeps its `saledate` column.
        converted = df.copy()
        saledate = converted['saledate'].str.split(expand=True)[[1, 2, 3, 4, 5]]
        saledate_text = (
            saledate[1] + ' '
            + saledate[2].astype(str) + ' '
            + saledate[3].astype(str) + ' '
            + saledate[4] + ' '
            + saledate[5]
        )
        # Get only the date part
        converted['Datetime'] = pd.to_datetime(saledate_text, utc=True).dt.date
        return converted.drop(columns=['saledate'])

    def train_test_split(self, df, test_size=0.25, random_state=42):
        """Split `df` and save the parts as train.csv / test.csv in root_dir.

        Args:
            df: Frame to split.
            test_size: Share of rows written to test.csv. 0.25 is what
                scikit-learn uses when no size is given, so the default split
                ratio is unchanged.
            random_state: Seed for the shuffle, so repeated runs produce the
                same files. Pass None for a different split on every run.
        """
        # Inside a method body this name resolves to the module-level
        # scikit-learn function, not to this method.
        train, test = train_test_split(
            df, test_size=test_size, random_state=random_state
        )
        train.to_csv(os.path.join(self.config.root_dir, 'train.csv'), index=False)
        test.to_csv(os.path.join(self.config.root_dir, 'test.csv'), index=False)
        logger.info("train and test data saved")

    
   

    
    

    
    

In [161]:
# Run the data-transformation stage end to end: load config, clean the data,
# convert the sale date, and write the train/test CSVs.
try:
    config = Configuration()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(data_transformation_config)
    df = data_transformation.handling_missing_values()
    # Show only a preview; printing the whole frame floods the cell output.
    print(df.head())
    d = data_transformation.date_time_conversion(df)
    data_transformation.train_test_split(d)
except Exception as e:
    # Record the failure with its traceback, then re-raise the original
    # exception unchanged (a bare `raise` adds no extra traceback frame).
    # NOTE(review): assumes `logger` is a standard logging.Logger - confirm.
    logger.exception(e)
    raise

[2024-04-11 15:35:55,492: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-11 15:35:55,496: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-11 15:35:55,503: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-04-11 15:35:55,504: INFO: common: created directory at: artificats]
[2024-04-11 15:35:55,506: INFO: common: created directory at: artifacts/data_transformation]
[2024-04-11 15:35:58,786: INFO: 595435062: vin column dropped]
[2024-04-11 15:35:59,242: INFO: 595435062: Missing values handled]
        year    make                model         trim       body  \
0       2015     Kia              Sorento           LX        SUV   
1       2015     Kia              Sorento           LX        SUV   
2       2014     BMW             3 Series   328i SULEV      Sedan   
3       2015   Volvo                  S60           T5      Sedan   
4       2014     BMW  6 Series Gran Coupe         650i      Sedan   
...      ...     ...          