In [1]:
import numpy as np
import pandas as pd
from category_encoders import MEstimateEncoder, TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import json
import math
import os

# Load data
# Paths are relative to the notebook's working directory.
train_file_path = "./data/train.csv"
test_file_path = "./data/test.csv"
train_df = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical across kernel restarts.
train, valid = train_test_split(train_df, test_size=0.2, random_state=42)

# The preview below shows the full labelled frame (train_df), not the 80% split.
print("First few rows of the dataset:")
print(train_df.head())

# Print column names to confirm dataset structure
print("Column names in dataset:", train_df.columns)

First few rows of the dataset:
   listing_id                                              title  \
0     1292132  Land Rover Range Rover Velar 3.0A Si6 R-Dynami...   
1     1294696   Mercedes-Benz C-Class C200 Sport Premium Sunroof   
2     1311717              Honda Odyssey 2.4A (COE till 09/2027)   
3     1310068       Toyota Corolla Altis 1.6A (COE till 12/2028)   
4     1325280                     Lexus GS300 (COE till 06/2026)   

            make    model                                        description  \
0     land rover    range  1 owner, no repairs needed! it looks great, in...   
1  mercedes-benz     c200  rare beautiful white c200 sport premium sunroo...   
2          honda  odyssey            comes with warranty. full service done.   
3         toyota    altis                                                  0   
4          lexus       gs  wear and tear done up. well maintained and reg...   

   manufactured original_reg_date     reg_date  type_of_vehicle  \
0        201

## Encoding

### Target Encoding

In [2]:
def target_encode_make(df, column, target, m=5.0):
    """Fit an M-estimate target encoder for one categorical column.

    Despite the name, nothing here is specific to the ``make`` column: the
    encoder is fitted on ``df[[column]]`` against ``df[target]``.

    Args:
        df: DataFrame holding both the categorical column and the target.
        column: Name of the categorical column to encode.
        target: Name of the target column.
        m: Value forwarded to ``MEstimateEncoder``'s ``m`` argument.
            Defaults to 5.0, the value previously hard-coded here.

    Returns:
        Tuple ``(encoder, default_mean)`` where ``encoder`` is the fitted
        ``MEstimateEncoder`` and ``default_mean`` is the mean of
        ``df[target]`` (intended as a fallback for values that cannot be
        encoded).
    """
    encoder = MEstimateEncoder(
        cols=[column],
        m=m,
    )
    encoder.fit(df[[column]], df[target])
    default_mean = df[target].mean()
    return encoder, default_mean

def apply_target_encoding(df, column, encoder, default_mean):
    """Add a ``<column>_target_encoded`` feature using a fitted encoder.

    The original ``column`` is left in place; the encoded values are written
    to a new column on ``df`` itself (the frame is modified in place and also
    returned, as before).

    Args:
        df: DataFrame containing ``column``.
        column: Name of the categorical column to encode.
        encoder: Fitted encoder whose ``transform`` accepts a one-column
            DataFrame and returns a DataFrame with the same column name.
        default_mean: Fallback value used wherever the encoder yields NaN.
            Pass ``None`` to leave NaN values untouched.

    Returns:
        ``df`` with the extra ``<column>_target_encoded`` column.
    """
    # Transform a copy so the encoder can never alter the caller's column.
    encoded_values = encoder.transform(df[[column]].copy())[column]
    # Previously ``default_mean`` was accepted but ignored, so any NaN the
    # encoder produced flowed straight into the feature matrix.
    if default_mean is not None:
        encoded_values = encoded_values.fillna(default_mean)
    df[f"{column}_target_encoded"] = encoded_values
    return df

### Multi-label Encoding

In [3]:
def encode_categories_train(df, column_name):
    """Fit a MultiLabelBinarizer on a comma-separated label column.

    Each cell of ``df[column_name]`` is split on ``', '`` into a list of
    labels. Non-string cells (e.g. NaN) are treated as having no labels
    instead of raising on ``.split``.

    The input frame is not modified: the previous implementation left a
    ``<column_name>_list`` helper column on the caller's DataFrame.

    Args:
        df: Training DataFrame containing ``column_name``.
        column_name: Name of the column holding ``', '``-joined labels.

    Returns:
        The fitted ``MultiLabelBinarizer``.
    """
    label_lists = df[column_name].apply(
        lambda value: value.split(', ') if isinstance(value, str) else []
    )
    mlb = MultiLabelBinarizer()
    # Only the fitted vocabulary is needed here; the transformed matrix that
    # fit_transform would build was being thrown away.
    mlb.fit(label_lists)
    return mlb

def apply_categories_encoding(df, column_name, mlb):
    """Replace a comma-separated label column with multi-hot indicator columns.

    Each cell of ``df[column_name]`` is split on ``', '``; non-string cells
    (e.g. NaN) are treated as having no labels rather than raising. The
    indicator columns are named after ``mlb.classes_`` and appended after the
    remaining original columns.

    The caller's frame is left untouched; a new DataFrame is returned.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Name of the column holding ``', '``-joined labels.
        mlb: Fitted binarizer exposing ``transform`` and ``classes_``.

    Returns:
        New DataFrame without ``column_name`` (and without any
        ``<column_name>_list`` helper column), plus one indicator column per
        class, sharing ``df``'s index.
    """
    label_lists = df[column_name].apply(
        lambda value: value.split(', ') if isinstance(value, str) else []
    )
    indicators = pd.DataFrame(
        mlb.transform(label_lists),
        columns=mlb.classes_,
        index=df.index,
    )
    # The helper ``_list`` column is dropped when present so the output schema
    # matches the earlier implementation, which always removed it.
    remaining = df.drop(
        columns=[column_name, f"{column_name}_list"], errors='ignore'
    )
    return pd.concat([remaining, indicators], axis=1)

### One-hot Encoding

In [4]:
def onehot_encode_columns_train(df, columns):
    """Fit one OneHotEncoder per column on the training data.

    Args:
        df: Training DataFrame containing every name in ``columns``.
        columns: Iterable of categorical column names to encode.

    Returns:
        Dict mapping each column name to its fitted ``OneHotEncoder``.
    """
    encoders = {}
    for column in columns:
        # handle_unknown='ignore': a category that appears only in the
        # validation/test data is encoded as all zeros instead of raising
        # during transform. Output for categories seen in training is the same.
        # ``fit`` is enough here; the transformed matrix was never used.
        encoders[column] = OneHotEncoder(handle_unknown='ignore').fit(df[[column]])
    return encoders

def apply_onehot_encoding(df, columns, encoders):
    """Swap each listed column for the indicator columns of its fitted encoder.

    For every name in ``columns`` the matching encoder from ``encoders``
    transforms that single column, the (sparse) result is densified with
    ``toarray()``, appended to the frame under the names given by
    ``get_feature_names_out``, and the source column is dropped.

    Returns the resulting DataFrame; the row index is carried over.
    """
    result = df
    for name in columns:
        fitted = encoders[name]
        dense = fitted.transform(result[[name]]).toarray()
        indicator_frame = pd.DataFrame(
            dense,
            columns=fitted.get_feature_names_out([name]),
            index=result.index,
        )
        result = pd.concat([result, indicator_frame], axis=1).drop(columns=name)
    return result

## Data Processing

In [5]:
import json

# Columns removed outright before any encoding.
del_cols = [
    'listing_id',
    'original_reg_date',
    'opc_scheme',
    'lifespan',
    'eco_category',
    'indicative_price',
]

# Free-text columns (dropped by preprocess_data_cat).
text_cols = ['title', 'description', 'features', 'accessories']

# Date columns.
date_cols = ['reg_date']

# Numeric features, including the derived year/month and the GPT text scores.
numeric_cols = [
    'manufactured',
    'curb_weight',
    'power',
    'engine_cap',
    'depreciation',
    'coe',
    'road_tax',
    'dereg_value',
    'mileage',
    'omv',
    'arf',
    'year',
    'month',
    'text_brand_popularity_score',
    'text_model_value_score',
    'text_condition_score',
    'text_feature_rarity_score',
    'text_performance_score',
    'text_sentiment_score',
]

# Numeric feature set using the ``_log`` variants of the long-tailed columns.
log_cols = [
    'manufactured',
    'curb_weight',
    'power_log',
    'engine_cap_log',
    'depreciation_log',
    'coe',
    'road_tax_log',
    'dereg_value_log',
    'mileage_log',
    'omv_log',
    'arf_log',
    'year',
    'month',
]

# Numeric feature set using the ``_root`` variants of the long-tailed columns.
root_cols = [
    'manufactured',
    'curb_weight',
    'power_root',
    'engine_cap_root',
    'depreciation_root',
    'coe',
    'road_tax_root',
    'dereg_value_root',
    'mileage_root',
    'omv_root',
    'arf_root',
    'year',
    'month',
]

# Categorical columns.
categorical_cols = [
    'make',
    'model',
    'type_of_vehicle',
    'category',
    'transmission',
    'fuel_type',
    'no_of_owners',
]

def get_maxmin_dict(data, numeric_cols):
    """Collect the per-column maximum and minimum of the given columns.

    Returns a pair ``(max_dict, min_dict)``, each keyed by column name in the
    order the names were supplied.
    """
    bounds = [(name, data[name].max(), data[name].min()) for name in numeric_cols]
    max_dict = {name: upper for name, upper, _ in bounds}
    min_dict = {name: lower for name, _, lower in bounds}
    return max_dict, min_dict

In [6]:
def preprocess_data_cat(data, del_cols, text_cols, target_encoder, default_mean, mlb_encoder, onehot_encoders, data_type='train'):
    """Merge GPT text scores, drop unused columns and encode categoricals.

    Steps, in order:
      1. If the GPT feature CSV for ``data_type`` exists, left-join its six
         ``text_*_score`` columns on ``listing_id`` and fill gaps with 0.5.
      2. Drop ``del_cols`` and ``text_cols``.
      3. Target-encode ``make``, multi-hot encode ``category`` and one-hot
         encode ``type_of_vehicle``, ``fuel_type`` and ``transmission``.
      4. Parse ``reg_date`` (``%d-%b-%Y``) into ``year`` / ``month`` and drop it.
      5. Fill missing ``no_of_owners`` with 2.

    Args:
        data: Raw feature DataFrame (must still contain ``listing_id``).
        del_cols: Column names to drop outright.
        text_cols: Free-text column names to drop.
        target_encoder: Fitted encoder passed to ``apply_target_encoding``.
        default_mean: Fallback value passed to ``apply_target_encoding``.
        mlb_encoder: Fitted binarizer passed to ``apply_categories_encoding``.
        onehot_encoders: Dict of fitted encoders passed to
            ``apply_onehot_encoding``.
        data_type: ``'train'`` reads the training GPT file; any other value
            reads the test GPT file.

    Returns:
        The processed DataFrame, carrying the same row index and row count
        as ``data``.
    """
    # Select GPT features file based on data type
    if data_type == 'train':
        gpt_features_file = 'data/with_text_features_train.csv'
    else:
        gpt_features_file = 'data/with_text_features_test.csv'

    if os.path.exists(gpt_features_file):
        print(f"Found GPT features file: {gpt_features_file}")
        gpt_cols = ['listing_id', 'text_brand_popularity_score', 'text_model_value_score',
                    'text_condition_score', 'text_feature_rarity_score',
                    'text_performance_score', 'text_sentiment_score']
        gpt_features = pd.read_csv(gpt_features_file)[gpt_cols]
        # A repeated listing_id on the right side would multiply rows in the
        # left join and silently misalign the features with their targets.
        gpt_features = gpt_features.drop_duplicates(subset='listing_id')

        # merge() returns a fresh RangeIndex; restore the caller's index so
        # later index-based alignment with the target Series (e.g. applying
        # an outlier mask to y) keeps working. A left join against unique
        # keys preserves row count and order, so the assignment is safe.
        original_index = data.index
        data = data.merge(gpt_features, on='listing_id', how='left')
        data.index = original_index

        gpt_feature_cols = [col for col in gpt_cols if col != 'listing_id']
        # 0.5 is used as the fallback score for listings absent from the file.
        data[gpt_feature_cols] = data[gpt_feature_cols].fillna(0.5)
    else:
        print(f"GPT features file not found: {gpt_features_file}, skipping GPT features merge")

    data = data.drop(columns=del_cols)
    data = data.drop(columns=text_cols)

    data = apply_target_encoding(data, 'make', target_encoder, default_mean)
    data = apply_categories_encoding(data, 'category', mlb_encoder)
    data = apply_onehot_encoding(data, ['type_of_vehicle', 'fuel_type', 'transmission'], onehot_encoders)

    # Replace the registration date with numeric year / month features.
    data['reg_date'] = pd.to_datetime(data['reg_date'], format='%d-%b-%Y')
    data['year'] = data['reg_date'].dt.year
    data['month'] = data['reg_date'].dt.month
    data = data.drop(columns='reg_date')
    data['no_of_owners'] = data['no_of_owners'].fillna(2)

    return data


def preprocess_data_num(data, max_dict, min_dict, 
                       remove_outliers=False,
                       do_normalize=False, 
                       normalize_method='standard',
                       numeric_features=None,
                       long_tail_features=None):
    """Impute, optionally filter outliers, derive log/root features and scale.

    Steps, in order:
      1. Fill NaN in every numeric feature with that column's median.
      2. If ``remove_outliers``, drop rows where any numeric feature lies more
         than 3 standard deviations from its mean.
      3. Add ``<f>_log`` (``log1p``) and ``<f>_root`` (``sqrt``) columns for
         each long-tailed feature, computed from the unscaled values.
      4. If ``do_normalize``, scale the numeric features and the derived
         log/root columns.

    Scaling statistics live in ``max_dict`` / ``min_dict``. A statistic that
    is already present is reused; a missing one is computed from ``data`` and
    stored. Calling this first on the training frame and then on
    validation/test frames with the same two dicts therefore scales all of
    them with the training statistics. Previously every call overwrote the
    dicts with the statistics of whatever frame it was given.

    Key layout (unchanged): for ``'standard'`` the mean is stored in
    ``max_dict['<col>_mean']`` and the std in ``min_dict['<col>_std']``; for
    any other method the max/min are stored under ``'<col>'``.

    Args:
        data: DataFrame with the numeric features. It is not modified.
        max_dict: Mutable dict of fitted statistics (see key layout above).
        min_dict: Mutable dict of fitted statistics (see key layout above).
        remove_outliers: Whether to drop 3-sigma outlier rows.
        do_normalize: Whether to scale the columns.
        normalize_method: ``'standard'`` for z-score; anything else means
            min-max scaling.
        numeric_features: Columns to impute/scale. Defaults to the
            module-level ``numeric_cols``.
        long_tail_features: Columns that get log/root variants. Defaults to
            the eight long-tailed columns used by this notebook.

    Returns:
        Tuple ``(processed, mask)``. ``mask`` is the boolean keep-mask indexed
        like the input when ``remove_outliers`` is true, otherwise ``None``.
    """
    if numeric_features is None:
        numeric_features = numeric_cols
    numeric_features = list(numeric_features)
    if long_tail_features is None:
        long_tail_features = ['omv', 'arf', 'depreciation', 'dereg_value', 'power', 'engine_cap', 'road_tax', 'mileage']

    # Work on a copy so the caller's frame is never altered and later column
    # assignments cannot hit a view of another frame.
    data = data.copy()

    # NOTE(review): the median comes from the frame being processed, so
    # validation/test rows are imputed with their own medians.
    for feature in numeric_features:
        data[feature] = data[feature].fillna(data[feature].median())

    mask = None
    if remove_outliers:
        numeric = data[numeric_features]
        mask = ~((numeric - numeric.mean()).abs() > 3 * numeric.std()).any(axis=1)
        data = data[mask].copy()

    for feature in long_tail_features:
        data[f'{feature}_log'] = np.log1p(data[feature])
        data[f'{feature}_root'] = np.sqrt(data[feature])

    if do_normalize:
        derived = [f'{feature}{suffix}'
                   for feature in long_tail_features
                   for suffix in ('_log', '_root')]
        for column in numeric_features + derived:
            data[column] = _scale_column(data[column], column, max_dict, min_dict, normalize_method)

    return data, mask


def _scale_column(values, name, max_dict, min_dict, normalize_method):
    """Scale one column, fitting its statistics only if they are not stored yet."""
    if normalize_method == 'standard':
        mean_key, std_key = f"{name}_mean", f"{name}_std"
        if mean_key not in max_dict:
            max_dict[mean_key] = values.mean()
        if std_key not in min_dict:
            min_dict[std_key] = values.std()
        offset = max_dict[mean_key]
        spread = min_dict[std_key]
    else:
        if name not in max_dict:
            max_dict[name] = values.max()
        if name not in min_dict:
            min_dict[name] = values.min()
        offset = min_dict[name]
        spread = max_dict[name] - min_dict[name]

    # A constant (or single-row) column has zero/NaN spread; dividing by it
    # would fill the feature with NaN/inf, so fall back to a unit divisor.
    if pd.isna(spread) or spread == 0:
        spread = 1.0
    return (values - offset) / spread

In [7]:
# Scaling switches forwarded to preprocess_data_num. With do_normalize=False
# the scaling branch is skipped, so normalize_method has no effect on this run.
do_normalize = False
normalize_method='minmax'

# Part 1: Process training data
# Encoders are fitted on the 80% split only and then applied to both the
# split and the hold-out, producing the files used for model validation.
print("Part 1: Processing training data...")
X_train, y_train = train.drop(columns=['price']), train['price']
X_valid, y_valid = valid.drop(columns=['price']), valid['price']

# Generate encoders using training data
# NOTE(review): the target encoder is fitted on the same rows it is applied to
# below (X_train comes from `train`), so that feature has seen those rows'
# prices - consider out-of-fold encoding if validation scores look optimistic.
target_encoder_train, default_mean_train = target_encode_make(train, 'make', 'price')
mlb_encoder_train = encode_categories_train(train, 'category')
onehot_encoders_train = onehot_encode_columns_train(train, ['type_of_vehicle', 'fuel_type', 'transmission'])

# Process categorical features
# data_type is left at its default ('train') for the hold-out as well, because
# `valid` is carved out of train.csv and its GPT scores live in the train file.
X_train = preprocess_data_cat(X_train, del_cols, text_cols, target_encoder_train, default_mean_train, mlb_encoder_train, onehot_encoders_train)
X_valid = preprocess_data_cat(X_valid, del_cols, text_cols, target_encoder_train, default_mean_train, mlb_encoder_train, onehot_encoders_train)

# Process numerical features
max_dict_train, min_dict_train = get_maxmin_dict(X_train, numeric_cols)
X_train, mask = preprocess_data_num(X_train, max_dict_train, min_dict_train, do_normalize=do_normalize, normalize_method=normalize_method, remove_outliers=False)
# mask is None here because remove_outliers=False; the branch only matters if
# outlier removal is switched on, in which case y must drop the same rows.
if mask is not None:
    y_train = y_train[mask]
X_valid, _ = preprocess_data_num(X_valid, max_dict_train, min_dict_train, do_normalize=do_normalize, normalize_method=normalize_method, remove_outliers=False)

# Save training data
# (os is already imported in the first cell; this import is redundant.)
import os
os.makedirs('data/processed', exist_ok=True)
# index=False: features and targets are matched purely by row position on disk.
X_train.to_csv('data/processed/X_train.csv', index=False)
y_train.to_csv('data/processed/y_train.csv', index=False)
X_valid.to_csv('data/processed/X_valid.csv', index=False)
y_valid.to_csv('data/processed/y_valid.csv', index=False)
print('Training data saved to data/processed/ directory')

# Part 2: Process full dataset
# Same steps as Part 1, but encoders are re-fitted on all labelled rows and
# applied to the unlabelled test set for the final model.
print("Part 2: Processing full dataset...")
# X_test is an alias of `test`, not a copy.
X_test = test

# 1. Process full training data first
X_train_full = train_df.drop(columns=['price'])
y_train_full = train_df['price']

# 2. Generate encoders using full training data
target_encoder_full, default_mean_full = target_encode_make(train_df, 'make', 'price')
mlb_encoder_full = encode_categories_train(train_df, 'category')
onehot_encoders_full = onehot_encode_columns_train(train_df, ['type_of_vehicle', 'fuel_type', 'transmission'])

# 3. Process categorical features
X_train_full = preprocess_data_cat(X_train_full, del_cols, text_cols, target_encoder_full, default_mean_full, mlb_encoder_full, onehot_encoders_full)
X_test = preprocess_data_cat(X_test, del_cols, text_cols, target_encoder_full, default_mean_full, mlb_encoder_full, onehot_encoders_full, data_type='test')

# 4. Get max/min dictionary using processed full training data
max_dict_full, min_dict_full = get_maxmin_dict(X_train_full, numeric_cols)

# 5. Process numerical features
X_train_full, _ = preprocess_data_num(X_train_full, max_dict_full, min_dict_full, do_normalize=do_normalize, normalize_method=normalize_method, remove_outliers=False)
X_test, _ = preprocess_data_num(X_test, max_dict_full, min_dict_full, do_normalize=do_normalize, normalize_method=normalize_method, remove_outliers=False)

# Save processed full training data and test data
X_train_full.to_csv('data/processed/X_train_full.csv', index=False)
y_train_full.to_csv('data/processed/y_train_full.csv', index=False)
X_test.to_csv('data/processed/X_test.csv', index=False)
print('Full training data and test data saved to data/processed/ directory')


Part 1: Processing training data...
Found GPT features file: data/with_text_features_train.csv


Found GPT features file: data/with_text_features_train.csv
Training data saved to data/processed/ directory
Part 2: Processing full dataset...
Found GPT features file: data/with_text_features_train.csv
Found GPT features file: data/with_text_features_test.csv
Full training data and test data saved to data/processed/ directory
