In [18]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import math

# Load processed datasets
X_train = pd.read_csv('data/processed/X_train.csv')
y_train = pd.read_csv('data/processed/y_train.csv')
X_valid = pd.read_csv('data/processed/X_valid.csv')
y_valid = pd.read_csv('data/processed/y_valid.csv') 
X_train_full = pd.read_csv('data/processed/X_train_full.csv') 
y_train_full = pd.read_csv('data/processed/y_train_full.csv') 
X_test = pd.read_csv('data/processed/X_test.csv')

print('Dataset loading completed')
print(f'Training set size: {X_train.shape}')
print(f'Validation set size: {X_valid.shape}')
print(f'Full training set size: {X_train_full.shape}')
print(f'Test set size: {X_test.shape}')


Dataset loading completed
Training set size: (20000, 74)
Validation set size: (5000, 74)
Full training set size: (25000, 74)
Test set size: (10000, 74)


In [19]:
cat_nu_cols = ["manufactured", "curb_weight", "power", "engine_cap", "no_of_owners", "depreciation", "coe", "road_tax", "dereg_value", "mileage", "omv", "arf", "make_target_encoded", "text_brand_popularity_score", "text_model_value_score", "text_condition_score", "text_feature_rarity_score", "text_performance_score", "text_sentiment_score", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]

cat_log_cols = ["manufactured", "curb_weight", "power_log", "engine_cap_log", "depreciation_log", "coe", "road_tax_log", "dereg_value_log", "mileage_log", "omv_log", "arf_log", "make_target_encoded", "text_brand_popularity_score", "text_model_value_score", "text_condition_score", "text_feature_rarity_score", "text_performance_score", "text_sentiment_score", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]

cat_root_cols = ["manufactured", "curb_weight", "power_root", "engine_cap_root", "depreciation_root", "coe", "road_tax_root", "dereg_value_root", "mileage_root", "omv_root", "arf_root", "make_target_encoded", "text_brand_popularity_score", "text_model_value_score", "text_condition_score", "text_feature_rarity_score", "text_performance_score", "text_sentiment_score", "-", "almost new car", "coe car", "consignment car", "direct owner sale", "electric cars", "hybrid cars", "imported used vehicle", "low mileage car", "opc car", "parf car", "premium ad car", "rare & exotic", "sgcarmart warranty cars", "sta evaluated car", "vintage cars", "type_of_vehicle_bus/mini bus", "type_of_vehicle_hatchback", "type_of_vehicle_luxury sedan", "type_of_vehicle_mid-sized sedan", "type_of_vehicle_mpv", "type_of_vehicle_others", "type_of_vehicle_sports car", "type_of_vehicle_stationwagon", "type_of_vehicle_suv", "type_of_vehicle_truck", "type_of_vehicle_van", "fuel_type_diesel", "fuel_type_diesel-electric", "fuel_type_electric", "fuel_type_petrol", "fuel_type_petrol-electric", "fuel_type_nan", "transmission_manual", "year", "month"]

# Create a list without GPT features (text_ columns)
cat_nu_cols_wo_gpt = [col for col in cat_nu_cols if not col.startswith('text_')]

# Set whether to use GPT features
use_gpt = True

if use_gpt:
    try:
        # Use full feature set including GPT features
        X_train = X_train[cat_nu_cols]
        X_valid = X_valid[cat_nu_cols]
        X_test = X_test[cat_nu_cols]
        X_train_full = X_train_full[cat_nu_cols]
    except KeyError:
        print("GPT features not found, using features without GPT")
        # If error occurs, use feature set without GPT features
        X_train = X_train[cat_nu_cols_wo_gpt]
        X_valid = X_valid[cat_nu_cols_wo_gpt]
        X_test = X_test[cat_nu_cols_wo_gpt]
        X_train_full = X_train_full[cat_nu_cols_wo_gpt]
else:
    # Directly use feature set without GPT features
    X_train = X_train[cat_nu_cols_wo_gpt]
    X_valid = X_valid[cat_nu_cols_wo_gpt]
    X_test = X_test[cat_nu_cols_wo_gpt]
    X_train_full = X_train_full[cat_nu_cols_wo_gpt]

print(f'Training set: {X_train.shape}')
print(f'Full set: {X_train_full.shape}')
print(f'Test set: {X_test.shape}')

Training set: (20000, 55)
Full set: (25000, 55)
Test set: (10000, 55)


## Concat BERT Vectors

In [20]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from umap.umap_ import UMAP  # Correct import method
import numpy as np

# Load BERT vectors
bert_train_vectors = np.load('data/processed/train_vectors.npy')
bert_valid_vectors = np.load('data/processed/valid_vectors.npy')
bert_train_full_vectors = np.load('data/processed/train_full_vectors.npy')
bert_test_vectors = np.load('data/processed/test_vectors.npy')

# BERT dimensionality reduction
scaler = StandardScaler()

# UMAP settings
umap = UMAP(
    n_components=8,
    n_neighbors=20,
    min_dist=0.1,
    metric='cosine',
    random_state=42
)

# umap = UMAP(
#     n_components=16,
#     n_neighbors=30,
#     min_dist=0.3,
#     metric='cosine',
#     random_state=42
# )

# umap = UMAP(
#     n_components=32,
#     n_neighbors=50,
#     min_dist=0.5,
#     metric='cosine',
#     random_state=42
# )


# PCA settings
# pca = PCA(
#     n_components=8,
#     random_state=42
# )

# pca = PCA(
#     n_components=16,
#     random_state=42
# )

# pca = PCA(
#     n_components=32,
#     random_state=42
# )

dim_reduction = umap

# Perform dimensionality reduction on BERT vectors
bert_train_scaled = scaler.fit_transform(bert_train_vectors)
bert_train_reduced = dim_reduction.fit_transform(bert_train_scaled)

# Apply same transformation to validation and test sets
bert_valid_scaled = scaler.transform(bert_valid_vectors)
bert_valid_reduced = dim_reduction.transform(bert_valid_scaled)

bert_test_scaled = scaler.transform(bert_test_vectors)
bert_test_reduced = dim_reduction.transform(bert_test_scaled)

# Transform full training set
bert_train_full_scaled = scaler.transform(bert_train_full_vectors)
bert_train_full_reduced = dim_reduction.transform(bert_train_full_scaled)

# Concatenate features
X_train_combined = np.hstack((X_train.values, bert_train_reduced))
X_valid_combined = np.hstack((X_valid.values, bert_valid_reduced))
X_test_combined = np.hstack((X_test.values, bert_test_reduced))
X_train_full_combined = np.hstack((X_train_full.values, bert_train_full_reduced))


# Print dimension information
print("Feature dimensions:")
print(f"Original features: {X_train.shape[1]}")
print(f"Reduced features: {bert_train_reduced.shape[1]}")
print(f"Combined features: {X_train_combined.shape[1]}")

  warn(


Feature dimensions:
Original features: 55
Reduced features: 8
Combined features: 63


## Baseline

In [21]:
X = X_train  # Features
y = y_train       # Target variable

# Creating the linear regression model
model = LinearRegression()

# Fitting the model
model.fit(X, y)

# Making predictions
y_valid_pred = model.predict(X_valid)

# Calculating the performance metrics
mse = mean_squared_error(y_valid_pred, y_valid)
r2 = r2_score(y_valid_pred, y_valid)
# Calculating the RMSE
rmse = math.sqrt(mse)

# Printing the MSE, RMSE, and R² Score
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Mean Squared Error: 1992363754.8481019
Root Mean Squared Error: 44635.902083951456
R² Score: 0.9000067197210553


In [22]:
X = X_train_combined 
y = y_train

# Creating the linear regression model
model = LinearRegression()

# Fitting the model
model.fit(X, y)

# Making predictions
y_valid_pred = model.predict(X_valid_combined)

# Calculating the performance metrics
mse = mean_squared_error(y_valid_pred, y_valid)
r2 = r2_score(y_valid_pred, y_valid)
# Calculating the RMSE
rmse = math.sqrt(mse)

# Printing the MSE, RMSE, and R² Score
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')

Mean Squared Error: 1986163351.6898425
Root Mean Squared Error: 44566.39262594452
R² Score: 0.9003002680784926


## Tree-based Models

In [23]:
from xgboost import XGBRegressor

# Create XGBoost model
model = XGBRegressor(
    # Basic parameters
    n_estimators=1000,        # Number of trees
    max_depth=6,              # Maximum tree depth to avoid overfitting
    learning_rate=0.01,       # Small learning rate for model stability
    
    # Parameters to prevent overfitting
    min_child_weight=5,       # Controls overfitting
    gamma=0.1,               # Minimum loss reduction required for node split
    subsample=0.8,           # Ratio of training samples to randomly sample
    colsample_bytree=0.8,    # Ratio of features to randomly sample
    
    # Regularization parameters
    reg_alpha=0.1,           # L1 regularization
    reg_lambda=1,            # L2 regularization
    
    # Other parameters
    objective='reg:squarederror',  # Regression task
    random_state=42,
    n_jobs=-1,               # Use all CPU cores
    verbosity=0
)


# Use original features without BERT embeddings
print("Using original features without BERT embeddings")

# Define features and target

X = X_train  # Features
y = y_train       # Target variable

print(X.shape)

# Add early stopping to prevent overfitting
eval_set = [(X_valid, y_valid)]

model.fit(
    X, 
    y,
    eval_set=eval_set
)

# Making predictions
y_valid_pred = model.predict(X_valid)

# Calculating the performance metrics
mse = mean_squared_error(y_valid_pred, y_valid)
r2 = r2_score(y_valid_pred, y_valid)
# Calculating the RMSE
rmse = math.sqrt(mse)

# Printing the MSE, RMSE, and R² Score
# print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
# print(f'R² Score: {r2}')

Using original features without BERT embeddings
(20000, 55)
[0]	validation_0-rmse:142302.58503
[1]	validation_0-rmse:141059.21290
[2]	validation_0-rmse:139814.90232
[3]	validation_0-rmse:138596.87475
[4]	validation_0-rmse:137383.82899
[5]	validation_0-rmse:136174.87417
[6]	validation_0-rmse:134990.07053
[7]	validation_0-rmse:133801.25630
[8]	validation_0-rmse:132657.01807
[9]	validation_0-rmse:131527.25182
[10]	validation_0-rmse:130388.18631
[11]	validation_0-rmse:129256.71292
[12]	validation_0-rmse:128152.58535
[13]	validation_0-rmse:127038.98374
[14]	validation_0-rmse:125937.67423
[15]	validation_0-rmse:124875.60048
[16]	validation_0-rmse:123788.04927
[17]	validation_0-rmse:122715.07888
[18]	validation_0-rmse:121695.51921
[19]	validation_0-rmse:120644.18487
[20]	validation_0-rmse:119639.02184
[21]	validation_0-rmse:118635.35593
[22]	validation_0-rmse:117621.28938
[23]	validation_0-rmse:116587.83682
[24]	validation_0-rmse:115607.78485
[25]	validation_0-rmse:114628.93368
[26]	validatio

In [25]:
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor

# 1. LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=6,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

# 2. CatBoost
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False
)

# 3. Stacking
base_models = [
    ('xgb', XGBRegressor(verbosity=0, random_state=42)),
    ('lgb', lgb.LGBMRegressor(verbose=-1, random_state=42)),
    ('cat', CatBoostRegressor(verbose=False, random_seed=42))
]
stacking = StackingRegressor(
    estimators=base_models,
    final_estimator=lgb.LGBMRegressor(verbose=-1),
    cv=5
)

# 4. RandomForest
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)

# 5. GradientBoosting
gb_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    min_samples_split=5,
    random_state=42
)

# Comparison function
def compare_models(models, X_train, y_train, X_valid, y_valid):
    results = []
    for name, model in models.items():
        print(f"Training {name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_valid)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        r2 = r2_score(y_valid, y_pred)
        results.append({
            'Model': name,
            'RMSE': rmse,
            'R2': r2
        })
    return pd.DataFrame(results).sort_values('RMSE')

# Compare all models
models = {
    'LightGBM': lgb_model,
    'CatBoost': cat_model,
    'Stacking': stacking,
    'RandomForest': rf_model,
    'GradientBoosting': gb_model,
    'XGBoost': model  # Previously defined XGBoost model
}

results = compare_models(models, X_train, y_train, 
                       X_valid, y_valid)
print("\nModel Performance Comparison:")
print(results)

Training LightGBM
Training CatBoost
Training Stacking


  y = column_or_1d(y, warn=True)


Training RandomForest


  return fit_method(estimator, *args, **kwargs)


Training GradientBoosting


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


Training XGBoost

Model Performance Comparison:
              Model          RMSE        R2
5           XGBoost  23043.774120  0.974222
0          LightGBM  23288.311453  0.973672
1          CatBoost  27009.841126  0.964585
2          Stacking  27316.237260  0.963777
3      RandomForest  28696.351604  0.960024
4  GradientBoosting  29672.087093  0.957259


In [26]:
from xgboost import XGBRegressor

# Create XGBoost model
model = XGBRegressor(
    # Basic parameters
    n_estimators=1000,        # Number of trees
    max_depth=6,              # Maximum tree depth to avoid overfitting
    learning_rate=0.01,       # Small learning rate for model stability
    
    # Parameters to prevent overfitting
    min_child_weight=5,       # Controls overfitting
    gamma=0.1,               # Minimum loss reduction required for node split
    subsample=0.8,           # Ratio of training samples to randomly sample
    colsample_bytree=0.8,    # Ratio of features to randomly sample
    
    # Regularization parameters
    reg_alpha=0.1,           # L1 regularization
    reg_lambda=1,            # L2 regularization
    
    # Other parameters
    objective='reg:squarederror',  # Regression task
    random_state=42,
    n_jobs=-1,               # Use all CPU cores
    verbosity=0
)


# Use original features with BERT embeddings
print("Using original features with BERT embeddings")

X = X_train_combined  # Features
y = y_train       # Target variable

# Add early stopping to prevent overfitting
eval_set = [(X_valid_combined, y_valid)]

model.fit(
    X, 
    y,
    eval_set=eval_set
)

# Making predictions
y_valid_pred = model.predict(X_valid_combined)

# Calculating the performance metrics
mse = mean_squared_error(y_valid_pred, y_valid)
r2 = r2_score(y_valid_pred, y_valid)
# Calculating the RMSE
rmse = math.sqrt(mse)

# Printing the MSE, RMSE, and R² Score
# print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
# print(f'R² Score: {r2}')

Using original features with BERT embeddings
[0]	validation_0-rmse:142308.20010
[1]	validation_0-rmse:141066.81316
[2]	validation_0-rmse:139825.76145
[3]	validation_0-rmse:138569.25814
[4]	validation_0-rmse:137398.23651
[5]	validation_0-rmse:136214.05422
[6]	validation_0-rmse:135011.12224
[7]	validation_0-rmse:133807.26726
[8]	validation_0-rmse:132641.79629
[9]	validation_0-rmse:131474.29748
[10]	validation_0-rmse:130364.61580
[11]	validation_0-rmse:129224.62829
[12]	validation_0-rmse:128134.81280
[13]	validation_0-rmse:127031.73400
[14]	validation_0-rmse:125968.01218
[15]	validation_0-rmse:124898.26006
[16]	validation_0-rmse:123815.28734
[17]	validation_0-rmse:122743.82624
[18]	validation_0-rmse:121693.46088
[19]	validation_0-rmse:120666.11756
[20]	validation_0-rmse:119664.58005
[21]	validation_0-rmse:118660.23255
[22]	validation_0-rmse:117642.88000
[23]	validation_0-rmse:116655.81372
[24]	validation_0-rmse:115687.45828
[25]	validation_0-rmse:114697.81665
[26]	validation_0-rmse:113755

## Fine-tuning on XGBoost

In [None]:
import optuna
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial):
    """Objective function for Optuna to optimize XGBoost parameters"""
    param = {
        # Basic parameters
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        
        # Parameters to prevent overfitting
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 1e-4, 2.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        
        # Regularization parameters
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 20.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 20.0, log=True),
        
        # Fixed parameters
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }
    
    # Create model
    model = XGBRegressor(**param)
    
    try:
        # Train model
        model.fit(X_train_combined, y_train.values.ravel())
        
        # Predict validation set
        y_pred = model.predict(X_valid_combined)
        
        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        return rmse
        
    except Exception as e:
        print(f"Training error: {str(e)}")
        return float('inf')

# Create Optuna study
study = optuna.create_study(direction='minimize')

# Run optimization
print("Starting Optuna parameter optimization...")
study.optimize(objective, n_trials=200, show_progress_bar=True)

# Print best parameters
print("\nBest parameters:")
for key, value in study.best_params.items():
    print(f"{key}: {value}")
print(f"\nBest RMSE: {study.best_value:.4f}")

# Create final model with best parameters
best_params = study.best_params
best_params.update({
    'objective': 'reg:squarederror',
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': 0
})

# Create and train final model
final_model = XGBRegressor(**best_params)
final_model.fit(X_train_combined, y_train.values.ravel())

# Evaluate final model
y_pred = final_model.predict(X_valid_combined)
final_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"\nFinal model RMSE on validation set: {final_rmse:.4f}")

# Update model dictionary
if 'models' not in locals():
    models = {}
models['XGBoost (Optimized)'] = final_model

In [27]:
# Create final model with best parameters
best_model = XGBRegressor(
    n_estimators=4163,
    max_depth=6, 
    learning_rate=0.0110604556318349,
    min_child_weight=3,
    gamma=0.0036210695281719447,
    subsample=0.6453444676549167,
    colsample_bytree=0.6761141085642669,
    reg_alpha=0.34592583966227625,
    reg_lambda=0.01554727012065511,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    verbosity=0
)


# Create and train the final model
best_model.fit(X_train_combined, y_train.values.ravel())

# Evaluate the final model
y_pred = best_model.predict(X_valid_combined)
final_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"\nFinal model RMSE on validation set: {final_rmse:.4f}")


Final model RMSE on validation set: 19481.5527


## Prediction on test set

In [28]:
print("Generating validation set predictions...")
valid_predictions = best_model.predict(X_valid_combined)
valid_predictions_df = pd.DataFrame({
    'Id': range(len(valid_predictions)),
    'Predicted': valid_predictions
})
valid_predictions_df.to_csv('data/ml_valid.csv', index=False)
print("Validation set predictions saved to data/ml_valid.csv")

print("Generating test set predictions...")
test_predictions = best_model.predict(X_test_combined)
test_predictions_df = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'Predicted': test_predictions
})
test_predictions_df.to_csv('data/ml_test.csv', index=False)
print("Test set predictions saved to data/ml_test.csv")

Generating validation set predictions...
Validation set predictions saved to data/ml_valid.csv
Generating test set predictions...
Test set predictions saved to data/ml_test.csv


In [29]:
print("Training final model with full dataset...")
best_model.fit(X_train_full_combined, y_train_full)

print("Generating test set predictions...")
test_predictions = best_model.predict(X_test_combined)

# Create predictions DataFrame
predictions_df = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'Predicted': test_predictions
})

# Save predictions
predictions_df.to_csv('data/predictions.csv', index=False)
print("Predictions saved to data/predictions.csv")

print("Generating validation set predictions...")
valid_predictions = best_model.predict(X_valid_combined)
valid_predictions_df = pd.DataFrame({
    'Id': range(len(valid_predictions)),
    'Predicted': valid_predictions
})
valid_predictions_df.to_csv('data/full_ml_valid.csv', index=False)
print("Validation set predictions saved to data/full_ml_valid.csv")

print("Generating test set predictions...")
test_predictions = best_model.predict(X_test_combined)
test_predictions_df = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'Predicted': test_predictions
})
test_predictions_df.to_csv('data/full_ml_test.csv', index=False)
print("Test set predictions saved to data/full_ml_test.csv")


Training final model with full dataset...
Generating test set predictions...
Predictions saved to data/predictions.csv
Generating validation set predictions...
Validation set predictions saved to data/full_ml_valid.csv
Generating test set predictions...
Test set predictions saved to data/full_ml_test.csv
