In [1]:
# Importing essential libraries
import numpy as np  # For numerical computations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization
import warnings  # For controlling warning messages

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna


# Suppressing warnings to avoid clutter in output
warnings.filterwarnings('ignore')



In [2]:
train=pd.read_csv('/kaggle/input/fsajhd/df_train_clean.csv')
test=pd.read_csv('/kaggle/input/fsajhd/df_test_clean.csv')

In [3]:
tarain=pd.read_csv('/kaggle/input/fsajhd/train.csv')

In [4]:
tarain.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [5]:
train.head(5)

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,Horsepower,Displacement,Cylinder Count,model_age
0,31,213000,2,0,312,71,2,1,4200,172.0,1.6,4.0,18
1,28,143250,2,0,263,10,0,1,4999,252.0,3.9,8.0,23
2,9,136731,1,0,38,71,2,1,13900,320.0,5.3,8.0,23
3,16,19500,2,2,29,14,2,1,45000,420.0,5.0,8.0,8
4,36,7388,2,0,29,10,2,1,97500,208.0,2.0,4.0,4


In [6]:
test.sample(5)

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,Horsepower,Displacement,Cylinder Count,model_age
27822,22,95200,2,1,29,14,2,1,150.0,2.4,4.0,25
78750,24,110000,2,0,122,10,0,1,185.0,2.4,4.0,28
35272,36,13250,2,0,304,14,2,1,385.0,3.0,6.0,7
6836,4,50000,2,1,304,106,2,1,425.0,3.0,6.0,10
60246,3,87305,2,0,128,118,2,1,211.0,2.0,4.0,10


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   brand           188533 non-null  int64  
 1   milage          188533 non-null  int64  
 2   fuel_type       188533 non-null  int64  
 3   transmission    188533 non-null  int64  
 4   ext_col         188533 non-null  int64  
 5   int_col         188533 non-null  int64  
 6   accident        188533 non-null  int64  
 7   clean_title     188533 non-null  int64  
 8   price           188533 non-null  int64  
 9   Horsepower      188533 non-null  float64
 10  Displacement    188533 non-null  float64
 11  Cylinder Count  188533 non-null  float64
 12  model_age       188533 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 18.7 MB


In [8]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Assuming 'train' is your DataFrame with the dataset loaded
X = train.drop(columns=['price'])
y = train['price']

# Define a preprocessor that scales all features
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), X.columns)
    ]
)

# Tuned hyperparameters for LightGBM
tuned_params = {
    'n_estimators': 200,
    'learning_rate': 0.026075961680376267,
    'max_depth': 7,
    'min_child_samples': 8,  # equivalent to XGBoost's min_child_weight
    'subsample': 0.6403035387419401,
    'colsample_bytree': 0.5532517223827789,
    'min_split_gain': 0.143179155034863,  # similar to XGBoost's gamma
    'reg_alpha': 2.1450990765269337e-07,
    'reg_lambda': 0.05942705889240155,
    'random_state': 42,
    'device': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

# Setup 5-Fold cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Prepare arrays/dataframe for out-of-fold (OOF) predictions
oof_preds = np.zeros(len(X))
oof_df = pd.DataFrame(columns=['ID', 'Actual', 'OOF_Pred_LightGBM', 'Fold'])

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\nTraining Fold {fold}/{n_folds}...")
    
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create a pipeline with the preprocessor and LightGBM model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LGBMRegressor(**tuned_params))
    ])
    
    # Train the model on the current fold
    pipeline.fit(X_tr, y_tr)
    
    # Predict on the validation fold
    y_val_pred = pipeline.predict(X_val)
    oof_preds[val_idx] = y_val_pred
    
    # Compute and print the fold RMSE
    fold_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    print(f"Fold {fold} RMSE: {fold_rmse:.4f}")
    
    # Record fold results
    fold_df = pd.DataFrame({
        'ID': X.index[val_idx],
        'Actual': y_val.values,
        'OOF_Pred_LightGBM': y_val_pred,
        'Fold': fold
    })
    oof_df = pd.concat([oof_df, fold_df], ignore_index=True)

# Compute overall OOF RMSE
oof_rmse = mean_squared_error(y, oof_preds, squared=False)
print(f"\nOverall OOF RMSE: {oof_rmse:.4f}")

# Save the out-of-fold predictions to a CSV file
oof_df.to_csv('oof_predictions_lightgbm.csv', index=False)
print("OOF predictions saved to 'oof_predictions_lightgbm.csv'.")

# Save the final pipeline (from the last fold) as the trained model
joblib.dump(pipeline, 'lightgbm_model.pkl')
print("Model saved as 'lightgbm_model.pkl'.")



Training Fold 1/5...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 150826, number of used features: 12
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 9 dense feature groups (1.73 MB) transferred to GPU in 0.002486 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 43890.785316
Fold 1 RMSE: 68074.9114

Training Fold 2/5...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1078
[LightGBM] [Info] Number of data points in the train set: 150826, number of used features: 12
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB,

In [10]:
final_test_predictions = pipeline.predict(test)

# Save submission file
sub = pd.read_csv('/kaggle/input/fsajhd/sample_submission.csv')
sub['price'] = final_test_predictions
sub.to_csv('LGBMBoost_submission.csv', index=False)
print("Submission file saved as 'LGBMsubmission.csv'.")
sub.head()


Submission file saved as 'LGBMsubmission.csv'.


Unnamed: 0,id,price
0,188533,17256.937761
1,188534,79879.026805
2,188535,51266.340837
3,188536,25590.203005
4,188537,30815.568945
