In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [6]:
def load_and_explore_dataset(filePath):
    """Read a CSV file into a DataFrame and print a quick exploratory summary.

    The summary covers the shape, per-column missing-value counts, the first
    five rows, descriptive statistics, the ``DataFrame.info()`` report and the
    value counts of the ``condition`` and ``transmission`` columns.

    Parameters
    ----------
    filePath : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data, unmodified.

    Raises
    ------
    KeyError
        If the file has no ``condition`` or ``transmission`` column.
    """
    print("=" * 60)
    print("LOAD AND EXPLORE DATASET")
    print("=" * 60)

    df = pd.read_csv(filePath)

    print('Shape of the dataset:')
    print(df.shape)
    print('\nCheck for missing value')
    print(df.isnull().sum())
    print('\nFirst five rows:')
    print(df.head())
    print('\nDescriptive stats:')
    print(df.describe())
    print('\nDataset Info:')
    # info() writes its report itself and returns None; wrapping it in print()
    # would append a stray "None" line to the output.
    df.info()

    # Same heading/value_counts pattern for each categorical column of interest.
    for heading, column in (('Condition', 'condition'),
                            ('Transmission', 'transmission')):
        print(f'\n{heading} Distribution:')
        print(df[column].value_counts())

    return df

In [12]:
def preprocessing_data(df):
    """Drop incomplete rows and label-encode the categorical columns.

    Rows containing any missing value are removed. For each of ``make``,
    ``model``, ``condition`` and ``transmission`` a new ``<name>_encoded``
    integer column is added, and the label -> code mapping is printed.
    The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the four categorical columns above.

    Returns
    -------
    tuple
        ``(df_processed, label_encoder)`` where ``df_processed`` is the
        cleaned frame with the extra ``*_encoded`` columns and
        ``label_encoder`` maps each original column name to its fitted
        ``LabelEncoder``.

    Raises
    ------
    KeyError
        If any of the four categorical columns is missing from ``df``.
    """
    print('\n' + '=' * 60)
    print("PREPROCESSING LOAD DATA")
    print('=' * 60)

    df_columns = ['make', 'model', 'condition', 'transmission']
    # Fail before doing any work, with a message naming every absent column.
    missing = [col for col in df_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # dropna() leaves `df` itself alone; the explicit copy() makes the result
    # an independent frame, so the column assignments below cannot trigger
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    df_processed = df.dropna().copy()

    # NOTE(review): scikit-learn documents LabelEncoder for target labels;
    # OrdinalEncoder is the feature-oriented alternative. Kept here so the
    # returned encoders stay the same type for existing callers.
    label_encoder = {}
    for col in df_columns:
        le = LabelEncoder()
        df_processed[col + "_encoded"] = le.fit_transform(df_processed[col])
        label_encoder[col] = le
        print(f"\n{col} encoded")
        for i, label in enumerate(le.classes_):
            print(f"  {label} : {i}")
    print("Processed Dataset shape", df_processed.shape)

    return df_processed, label_encoder

In [16]:
def features_data(df_processed):
    """Split the processed frame into a feature matrix and a target frame.

    Selects ``year`` plus the four ``*_encoded`` columns as features and
    ``price`` as the target, printing the resulting shapes and the feature
    names along the way.

    Parameters
    ----------
    df_processed : pandas.DataFrame
        Frame that already contains the encoded columns and ``price``.

    Returns
    -------
    tuple
        ``(X, y, feature_columns)``: the feature DataFrame, the one-column
        target DataFrame, and the list of feature column names.
    """
    rule = '=' * 60
    print('\n' + rule)
    print("FEATURES DATA")
    print(rule)

    feature_columns = [
        'year',
        'make_encoded',
        'model_encoded',
        'condition_encoded',
        'transmission_encoded',
    ]
    target_column = ['price']

    X = df_processed[feature_columns]
    # List-based selection keeps the target two-dimensional (n_rows, 1).
    y = df_processed[target_column]

    print("\nFeatures Shape", X.shape)
    print("\nTarget Shape", y.shape)
    print("\nFeatures", feature_columns)

    return X, y, feature_columns

In [9]:
def main(filePath='cleaned_jiji_car_dataset.csv'):
    """Run the load -> preprocess -> feature-selection pipeline end to end.

    Parameters
    ----------
    filePath : str or path-like, optional
        CSV file to process. Defaults to ``'cleaned_jiji_car_dataset.csv'``,
        so calling ``main()`` with no arguments reads the same file as before.

    Returns
    -------
    tuple
        ``(X, y, feature_columns, label_encoder)`` so the results of the run
        can be used by later cells instead of being discarded.
    """
    df = load_and_explore_dataset(filePath)

    df_processed, label_encoder = preprocessing_data(df)

    X, y, feature_columns = features_data(df_processed)

    return X, y, feature_columns, label_encoder
    

In [17]:
# Entry point: run the full pipeline only when this code is executed directly
# (as a notebook cell or script), not when its definitions are imported.
if __name__ == "__main__":
    main()

LOADT AND EXPLORE DATASET
Shape of the dataset:
(1755, 7)

Check for missing value
title           0
make            0
model           0
year            0
condition       0
transmission    0
price           0
dtype: int64

First five rows:
                                               title           make  \
0                            Lexus RX 350 2009 White          Lexus   
1  Hyundai Sonata Limited w/Brown Leather 4dr Sed...        Hyundai   
2                Toyota Highlander Limited 2012 Gray         Toyota   
3  Mercedes-Benz C300 Base AWD 4Matic Sedan (2.0L...  Mercedes-Benz   
4                        Hyundai Elantra 2014 Silver        Hyundai   

                model  year     condition transmission       price  
0              RX 350  2009  Foreign used    Automatic  12850000.0  
1      Sonata Limited  2015  Foreign used    Automatic  15450000.0  
2  Highlander Limited  2012    Local used    Automatic  14500000.0  
3           C300 Base  2015    Local used    Automatic  1