In [1]:
# Importing essential libraries
import numpy as np  # For numerical computations
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For creating static, animated, and interactive visualizations
import seaborn as sns  # For statistical data visualization
import warnings  # For controlling warning messages

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna


# Suppressing warnings to avoid clutter in output
warnings.filterwarnings('ignore')



In [2]:
train=pd.read_csv('/kaggle/input/fsajhd/df_train_clean.csv')
test=pd.read_csv('/kaggle/input/fsajhd/df_test_clean.csv')

In [3]:
tarain=pd.read_csv('/kaggle/input/fsajhd/train.csv')

In [4]:
tarain.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [5]:
train.head(5)

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,Horsepower,Displacement,Cylinder Count,model_age
0,31,213000,2,0,312,71,2,1,4200,172.0,1.6,4.0,18
1,28,143250,2,0,263,10,0,1,4999,252.0,3.9,8.0,23
2,9,136731,1,0,38,71,2,1,13900,320.0,5.3,8.0,23
3,16,19500,2,2,29,14,2,1,45000,420.0,5.0,8.0,8
4,36,7388,2,0,29,10,2,1,97500,208.0,2.0,4.0,4


In [6]:
test.sample(5)

Unnamed: 0,brand,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,Horsepower,Displacement,Cylinder Count,model_age
61025,26,57785,3,2,29,14,2,1,395.0,3.0,6.0,5
81932,44,16554,2,0,44,14,2,0,440.769971,5.7,6.374268,4
45645,39,127500,2,0,234,152,0,1,290.0,3.5,6.0,11
71382,11,8307,2,0,304,14,2,0,329.403725,3.6,6.374268,3
104478,55,130000,2,0,263,71,0,1,305.0,3.7,6.0,14


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   brand           188533 non-null  int64  
 1   milage          188533 non-null  int64  
 2   fuel_type       188533 non-null  int64  
 3   transmission    188533 non-null  int64  
 4   ext_col         188533 non-null  int64  
 5   int_col         188533 non-null  int64  
 6   accident        188533 non-null  int64  
 7   clean_title     188533 non-null  int64  
 8   price           188533 non-null  int64  
 9   Horsepower      188533 non-null  float64
 10  Displacement    188533 non-null  float64
 11  Cylinder Count  188533 non-null  float64
 12  model_age       188533 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 18.7 MB


In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

X = train.drop(columns=['price'])  # Features
y = train['price']  # Target variable

# Define hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize Decision Tree model
dt_model = DecisionTreeRegressor(random_state=42)

# Perform Grid Search with 5-fold Cross Validation
grid_search = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

# Fit GridSearchCV to find best parameters
grid_search.fit(X, y)

# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE:", (-grid_search.best_score_) ** 0.5)


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best Parameters: {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 8, 'min_samples_split': 2}
Best RMSE: 73946.36998622252


In [11]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load dataset
X = train.drop(columns=['price'])
y = train['price']

# Number of folds for cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
oof_df = pd.DataFrame(columns=['ID', 'Actual', 'OOF_Pred_DecisionTree', 'Fold'])

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\nTraining Fold {fold}/{n_folds}...")
    
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Define preprocessing pipeline
    num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num_pipeline', num_pipeline, ['milage', 'Horsepower', 'Displacement', 'Cylinder Count', 'model_age'])
        ],
        remainder='passthrough'
    )

    # Create pipeline with preprocessor and Decision Tree model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', DecisionTreeRegressor(max_depth=5, max_features=None, min_samples_leaf=8,
                                        min_samples_split=2, random_state=42))
    ])
    
    # Train model
    pipeline.fit(X_tr, y_tr)
    
    # Predict on validation set
    y_val_pred = pipeline.predict(X_val)
    oof_preds[val_idx] = y_val_pred
    
    # Compute fold RMSE
    fold_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
    print(f"Fold {fold} RMSE: {fold_rmse:.4f}")
    
    # Store fold results in DataFrame
    fold_df = pd.DataFrame({
        'ID': X.index[val_idx],
        'Actual': y_val.values,
        'OOF_Pred_DecisionTree': y_val_pred,
        'Fold': fold
    })
    
    oof_df = pd.concat([oof_df, fold_df], ignore_index=True)

# Compute overall RMSE
oof_rmse = mean_squared_error(y, oof_preds, squared=False)
print(f"\nOverall OOF RMSE: {oof_rmse:.4f}")

# Save OOF predictions
oof_df.to_csv('oof_predictions_decisiontree.csv', index=False)
print("OOF predictions saved to 'oof_predictions_decisiontree.csv'.")

# 🔥 Train final model on full dataset before saving
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=5, max_features=None, min_samples_leaf=8,
                                    min_samples_split=2, random_state=42))
])

final_pipeline.fit(X, y)  # Train on full dataset
joblib.dump(final_pipeline, 'decisiontree_model.pkl')  # Save final trained model
print("Final Decision Tree model trained and saved as 'decisiontree_model.pkl'.")



Training Fold 1/5...
Fold 1 RMSE: 69444.3479

Training Fold 2/5...
Fold 2 RMSE: 69665.4501

Training Fold 3/5...
Fold 3 RMSE: 75133.6568

Training Fold 4/5...
Fold 4 RMSE: 77699.9043

Training Fold 5/5...
Fold 5 RMSE: 77683.8111

Overall OOF RMSE: 74017.4018
OOF predictions saved to 'oof_predictions_decisiontree.csv'.
Final Decision Tree model trained and saved as 'decisiontree_model.pkl'.


In [13]:
final_test_predictions = pipeline.predict(test)

# Save submission file
sub = pd.read_csv('/kaggle/input/fsajhd/sample_submission.csv')
sub['price'] = final_test_predictions
sub.to_csv('Decisiontree_submission.csv', index=False)
print("Submission file saved as 'DecisionTree.csv'.")
sub.head()


Submission file saved as 'DecisionTree.csv'.


Unnamed: 0,id,price
0,188533,18220.49892
1,188534,82857.711897
2,188535,56128.613085
3,188536,23103.655389
4,188537,27653.990005
