In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file in it
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

##  Goal: The goal of this competition is to predict the price of used cars based on various attributes.

In [None]:
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# LOAD, READ AND UNDERSTAND DATA

In [None]:
# Read the competition training set into a DataFrame
train_csv_path = '/kaggle/input/playground-series-s4e9/train.csv'
df = pd.read_csv(train_csv_path)

# Preview the first five rows
df.head()


In [None]:
# Print a concise summary of the training frame: column dtypes and non-null counts
# (a non-null count below the row count reveals missing values)
df.info()


In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns
df.describe(include='all')


In [None]:
# Flag rows that are exact copies of an earlier row, then count them
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {duplicates}")

In [None]:
# Number of missing entries per column in the training frame
df.isna().sum()

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows
missing_percentage = df.isna().sum().div(len(df)).mul(100)
missing_percentage

In [None]:
# Names of all object-dtype (string) columns
categorical_features = df.select_dtypes(include=['object']).columns

# Print the distinct values of each of those columns, one column at a time
for column in categorical_features:
    print(f"Unique values in column '{column}':")
    print(df[column].unique())
    print("\n")

## HANDLING NAN VALUES

In [None]:
# Collapse missing values and the placeholder strings '–' / 'not supported'
# into a single 'none' category
df['fuel_type'] = (
    df['fuel_type']
    .fillna('none')
    .replace({'–': 'none', 'not supported': 'none'})
)

# Inspect the resulting category counts
df['fuel_type'].value_counts()

In [None]:
# Missing 'accident' entries are filled with the label 'none reported'
accident_filled = df['accident'].fillna('none reported')
df['accident'] = accident_filled

# Inspect the resulting category counts
df['accident'].value_counts()


In [None]:
# Missing 'clean_title' entries are filled with the label 'no'
clean_title_filled = df['clean_title'].fillna('no')
df['clean_title'] = clean_title_filled

# Inspect the resulting category counts
df['clean_title'].value_counts()


In [None]:
# Re-count missing entries per column after the fills above
df.isna().sum()

## Test df: repeat data cleaning steps

In [None]:
# Read the competition test set and show its (rows, columns) shape
test_csv_path = '/kaggle/input/playground-series-s4e9/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.shape

In [None]:
# Number of missing entries per column in the test frame
missing_values_test = test_df.isna().sum()
print(missing_values_test)


In [None]:
# Placeholder label used for missing entries, per column
# (the same labels that were applied to the training frame)
fill_values = {
    'fuel_type': 'none',
    'accident': 'none reported',
    'clean_title': 'no',
}
for col, placeholder in fill_values.items():
    test_df[col] = test_df[col].fillna(placeholder)

# Fold the placeholder strings '–' / 'not supported' into the 'none' fuel category
test_df['fuel_type'] = test_df['fuel_type'].replace({'–': 'none', 'not supported': 'none'})

# Re-count missing entries per column after the fills
missing_values_test_after = test_df.isnull().sum()
print(missing_values_test_after)

In [None]:
# Remove the 'id' column from the training frame
df = df.drop('id', axis=1)

# Show the remaining columns to confirm 'id' is gone
print(df.columns)


In [None]:
# Remove the 'id' column from the test frame so it has the same feature columns as the training frame
test_df = test_df.drop(columns='id')

# Verify the removal on the test frame itself (the original printed df.columns,
# which only re-checked the training frame)
print(test_df.columns)


## HANDLING OUTLIERS

In [None]:
def handle_outliers(df, feature, factor=1.5):
    """Return the rows of ``df`` whose ``feature`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``df[feature]``. Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    feature : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` inside the fences, with their original index labels.
        Rows whose ``feature`` value is NaN are not kept, because NaN never
        satisfies the range check.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the fences would be inverted).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    # Interquartile range of the feature
    q1 = df[feature].quantile(0.25)
    q3 = df[feature].quantile(0.75)
    iqr = q3 - q1

    # Inclusive bounds outside of which a value counts as an outlier
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # between() is inclusive on both ends by default, matching >= / <=
    within_fences = df[feature].between(lower_bound, upper_bound)
    return df[within_fences]

## outliers in train dataset

In [None]:
# Columns whose IQR outliers are removed from the training frame
features = ['milage', 'model_year', 'price']

# The filters are applied one after another, so each column's bounds are computed
# on the rows that survived the previous columns' filters
for outlier_column in features:
    df = handle_outliers(df, outlier_column)

# Report how many rows and columns remain
print(f"DataFrame shape after removing outliers in selected features: {df.shape}")


## TRAIN TEST SPLITS

In [None]:
# Target: the sale price; features: every remaining column
y = df['price']
X = df.drop(columns=['price'])


In [None]:
# Hold out 20% of the rows as a validation split; random_state=42 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Columns fed to the model, grouped by how they are encoded.
# Any column of X that is listed in neither group is not passed to a transformer.
numerical_features = ['milage', 'model_year']
categorical_features = ['fuel_type', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

# Numeric columns are rescaled with MinMaxScaler; categorical columns are one-hot encoded.
# handle_unknown='ignore' lets transform() accept categories that were not seen during fit.
numerical_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

column_steps = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit on the training split only, then reuse the fitted preprocessor on the validation split
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)


## MODEL BUILDING

In [None]:
# Candidate regressors to compare, keyed by display name.
# Every estimator with a random component gets random_state=42 (the seed already used
# for the train/validation split) so that repeated runs give the same scores.
# NOTE(review): SVR may be very slow on a training set of this size — confirm it is worth keeping.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Support Vector Regression': SVR(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'XGBoost': XGBRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42),
    'LightGBM': LGBMRegressor(random_state=42)
}


In [None]:
import pickle
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# One record of validation metrics per model
results = []

for model_name, model in models.items():
    # Fit on the transformed training split and predict the validation split
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)

    # MSE is computed once and reused for the RMSE
    mse = mean_squared_error(y_test, y_pred)
    results.append({
        'Model': model_name,
        'MSE': mse,
        'R^2': r2_score(y_test, y_pred),
        'RMSE': np.sqrt(mse),
    })

    # Persist the fitted model; spaces in the display name become underscores in the file name
    model_filename = f"{model_name.replace(' ', '_')}.pkl"
    with open(model_filename, 'wb') as handle:
        pickle.dump(model, handle)
    print(f"Saved {model_name} to {model_filename}")

# Tabulate the metrics for side-by-side comparison
results_df = pd.DataFrame(results)
results_df


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter values from which the randomized search samples its candidates
catboost_param_grid = {
    'iterations': [500, 1000, 1500],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7]
}

# Base estimator; seeded so that each candidate trains deterministically
catboost_model = CatBoostRegressor(verbose=0, random_state=42)

# Randomized search: 20 sampled candidates, 5-fold CV, scored by negative MSE.
# random_state fixes which candidates are sampled, so the search is repeatable.
catboost_search = RandomizedSearchCV(
    catboost_model,
    param_distributions=catboost_param_grid,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    random_state=42,
)

catboost_search.fit(X_train_transformed, y_train)

# Best estimator found by the search, and the parameters that produced it
best_catboost = catboost_search.best_estimator_
print("Best CatBoost Parameters: ", catboost_search.best_params_)


In [None]:
# Hard-coded CatBoost parameters for the final model
# NOTE(review): presumably copied from an earlier run of the randomized search — confirm
# they still match catboost_search.best_params_
best_params = dict(
    learning_rate=0.05,
    l2_leaf_reg=5,
    iterations=1500,
    depth=8,
    random_state=42,  # fixed seed for reproducibility
)

# Build the final model from those parameters and fit it on the training split
catboost_model = CatBoostRegressor(verbose=0, **best_params)
catboost_model.fit(X_train_transformed, y_train)

In [None]:
# Predict the held-out validation split with the final CatBoost model
y_val_pred = catboost_model.predict(X_test_transformed)

# Validation metrics, computed in the order they are reported
r2 = r2_score(y_test, y_val_pred)
mae = mean_absolute_error(y_test, y_val_pred)
mse = mean_squared_error(y_test, y_val_pred)
rmse = np.sqrt(mse)

print(f'R² Score: {r2:.4f}')
print(f'Mean Absolute Error (MAE): {mae:.4f}')
print(f'Mean Squared Error (MSE): {mse:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.4f}')

In [None]:
# Transform the competition test data with the preprocessor fitted on the training split
test_df_transformed = preprocessor.transform(test_df)


In [None]:
# Predict prices for the transformed competition test data with the final CatBoost model
y_test_df_pred = catboost_model.predict(test_df_transformed)


In [None]:
# Prepare the submission DataFrame.
# The 'id' column was dropped from test_df before preprocessing, so test_df['id'] would
# raise KeyError here. Use the column if it is still present; otherwise re-read only the
# ids from the test CSV (test_df rows were never filtered or reordered, so the order matches).
if 'id' in test_df.columns:
    submission_ids = test_df['id']
else:
    submission_ids = pd.read_csv(
        '/kaggle/input/playground-series-s4e9/test.csv', usecols=['id']
    )['id']

# Every prediction must be paired with exactly one id
assert len(submission_ids) == len(y_test_df_pred), "id / prediction length mismatch"

submission_df = pd.DataFrame({
    'id': submission_ids,
    'price': y_test_df_pred  # predicted prices, in test-row order
})

In [None]:
# Display the submission frame for a final visual check
submission_df

In [None]:

# Write the submission to 'submission.csv'; index=False keeps the row index out of the file
submission_df.to_csv('submission.csv', index=False)