<a href="https://colab.research.google.com/github/OleksiiLatypov/Regression_of_Used_Car_Prices/blob/main/used_car_price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'playground-series-s4e9:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F76728%2F9057646%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240914%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240914T183307Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D94a6da211f95857ee004528fa741db3be1cff436ebdbbf1909564d1ab02f496731f95127a2a962e5bfc2cb9777527b276f0ba8dae16840fff51113fe7f7fd17946a78a007492a700d2aabca5e63114a495bfe6ff60373c922f1a804741e85dfa50e909a0626d49aed58b9d4f008cbad9183b5548595c0ef3874737850e852950c7c85d7eb438d9fe483f13947f01f039284d505bcd131f1b8249c6ed7196a71c5c3e4948df203755423f9afcbbfe6d833b7c5e1a24e97a2e466d9d13d29ea5606fb8c4530d6e409fdd03a86d13d53bed0c4b22861a248bbd3e524ac831aa03080d4cfa31ddc7915d405dbd40472ea811ae1c9c17b1399b7d763ac9ef960eeafa'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# into KAGGLE_INPUT_PATH/<directory> and unpack it (zip or tar archive).
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be absent; then no progress bar can be drawn,
            # but the download itself must still go through.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes
            # have to reach the disk first.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` would
                # shadow the module and break the next tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Checked before OSError because HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import matplotlib.pyplot as plt
import re
from datetime import datetime
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.linear_model import Lasso, Ridge, SGDRegressor, LinearRegression
import category_encoders as ce

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition train and test tables.
train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')

In [None]:
# The 'id' column is dropped from both frames before any feature work.
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [None]:
train.shape

In [None]:
train.info()

In [None]:
test.shape

In [None]:
# Sample of raw 'engine' strings (rows 35-94); these are parsed with regexes further down.
train['engine'][35:95]

In [None]:
train.describe()

In [None]:
train.describe(include='object')

In [None]:
# Percentage of missing values per column, largest first.
100* (train.isna().sum() / len(train)).sort_values(ascending=False)

In [None]:
# Distribution of log(price) with a KDE overlay.
sns.displot(np.log(train['price']), kde=True)

In [None]:
train['clean_title'].value_counts()

In [None]:
# Stack train on top of test so both get the same feature engineering.
# The test rows carry no 'price'; the frame is split back by row position later.
df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
df['brand'].unique()

In [None]:
# Mean and median price per brand (train rows only), ordered by median.
avarage_price_by_brand = train.groupby('brand')['price'].agg(['mean', 'median'])
avarage_price_by_brand.sort_values(by='median', ascending=False).astype('int')

In [None]:
# Brand names flagged as premium; used to build the binary 'premium_auto' feature.
lux_auto = [
    "Mercedes-Benz",
    "Audi",
    "BMW",
    "Tesla",
    "Cadillac",
    "Land Rover",
    "Porsche",
    "Lexus",
    "INFINITI",
    "Jaguar",
    "Rolls-Royce",
    "Bentley",
    "Ferrari",
    "Aston Martin",
    "Lamborghini",
    "Bugatti",
    "Maserati",
    "Lucid",
    "Polestar",
    "Maybach",
    "Lincoln"
]

In [None]:
# 1 when the brand is in the premium list, 0 otherwise.
is_premium_brand = df['brand'].isin(lux_auto)
df['premium_auto'] = is_premium_brand.astype('int64')
df.head()

In [None]:
# Patterns for the three numbers embedded in the free-text engine description.
hp_pattern = re.compile(r'(\d+\.?\d*)HP')
volume_pattern = re.compile(r'(\d+\.?\d*)L')
cylinder_pattern = re.compile(r'(\d+)\s*Cylinder|V(\d+)', re.IGNORECASE)

horsepower, engine_volume, num_of_cylinders = [], [], []
for engine_text in df['engine']:
    # A value that cannot be found in the text is recorded as 0.
    hp_match = hp_pattern.search(engine_text)
    horsepower.append(float(hp_match.group(1)) if hp_match else 0)

    volume_match = volume_pattern.search(engine_text)
    engine_volume.append(float(volume_match.group(1)) if volume_match else 0)

    cylinder_match = cylinder_pattern.search(engine_text)
    if cylinder_match is None:
        num_of_cylinders.append(0)
    else:
        # Exactly one alternative matched: "<n> Cylinder" fills group 1,
        # the "V<n>" form fills group 2.
        cylinder_count = cylinder_match.group(1) or cylinder_match.group(2)
        num_of_cylinders.append(float(cylinder_count))

df['horsepower'] = horsepower
df['engine_volume'] = engine_volume
df['num_of_cylinders'] = num_of_cylinders


In [None]:
#df[35:62]

In [None]:
# Number of distinct cylinder counts produced by the engine parsing.
df['num_of_cylinders'].nunique()

In [None]:
# Rows whose fuel type is the literal placeholder 'not supported'.
df[df['fuel_type'] == 'not supported']

In [None]:
# For rows whose fuel type is 'not supported', recover the fuel from the
# engine description. `to_replace` gets exactly one entry per such row, in
# row order, so it can be assigned back positionally.
fuel_pattern = re.compile(r'(Gasoline|Diesel|Hybrid|Flex Fuel|Electric|Plug-In Hybrid)')
to_replace = []
for engine_text in df.loc[df['fuel_type'] == 'not supported', 'engine']:
    # Missing (non-string) descriptions cannot be searched.
    fuel = fuel_pattern.search(engine_text) if isinstance(engine_text, str) else None
    if fuel is None:
        # No fuel keyword found: keep the placeholder instead of failing on
        # `None.group`, so the list stays aligned with the selected rows.
        to_replace.append('not supported')
    elif fuel.group(1) == 'Flex Fuel':
        # Use the spelling that the fuel_type column already uses.
        to_replace.append('E85 Flex Fuel')
    else:
        to_replace.append(fuel.group(1))
print(to_replace)

In [None]:
# Write the recovered values back; to_replace holds one entry per selected row.
df.loc[df['fuel_type'] == 'not supported', 'fuel_type'] = to_replace

In [None]:
df['fuel_type'].value_counts()

In [None]:
# Check which rows, if any, still carry the placeholder.
df.loc[df['fuel_type'] == 'not supported', 'fuel_type']

In [None]:
# Count of rows whose fuel type is still missing (NaN).
df[pd.isna(df['fuel_type'])].shape

In [None]:
df['fuel_type'].unique()

In [None]:
# Rows with no fuel type: infer it from keywords in the engine description.
fuel = df[pd.isna(df['fuel_type'])]

# Checked in this order; the first keyword found decides the label.
keyword_to_fuel = (
    ('Hybrid', 'Hybrid'),
    ('Gasoline', 'Gasoline'),
    ('Flex Fuel', 'E85 Flex Fuel'),
    ('Diesel', 'Diesel'),
)
# Descriptions without any of the keywords are labelled 'Electric'.
type_of_fuel = [
    next((label for keyword, label in keyword_to_fuel if keyword in engine_text), 'Electric')
    for engine_text in fuel['engine']
]

print(len(type_of_fuel))

nan_indices = fuel.index
df.loc[nan_indices, 'fuel_type'] = type_of_fuel

In [None]:
# Vehicle age in years, measured against the calendar year at execution time,
# so re-running the notebook in a later year shifts every value.
df['age'] = datetime.now().year - df['model_year']

In [None]:
df['transmission'].value_counts()

In [None]:
df[df['transmission'] == 'Single-Speed Fixed Gear']

In [None]:
# Substrings that mark a conventional automatic gearbox.
_AUTOMATIC_MARKERS = ('Automatic', 'A/T', 'AT', 'A/t', 'Transmission Overdrive Switch')


def _transmission_type(description):
    """Map a raw transmission description to a coarse category label.

    The checks run in order and the first match wins.

    Args:
        description: Raw text from the 'transmission' column.

    Returns:
        One of 'A', 'DCT', 'CVT', 'Variable', 'Single-Speed Fixed Gear',
        'F', 'At/Mt', 'Unknown' or 'M' (the fallback).
    """
    if any(marker in description for marker in _AUTOMATIC_MARKERS):
        return 'A'
    if 'DCT' in description or 'Transmission w/Dual Shift Mode' in description:
        return 'DCT'
    # The old test `'CVT' in row or 'CVT-F'` was always true, because a
    # non-empty string literal is truthy; every branch below was unreachable.
    # 'CVT-F' contains 'CVT', so one membership test covers both.
    if 'CVT' in description:
        return 'CVT'
    if 'Variable' in description:
        return 'Variable'
    if 'Single-Speed Fixed Gear' in description:
        return 'Single-Speed Fixed Gear'
    if 'F' in description:
        return 'F'
    if 'At/Mt' in description:
        return 'At/Mt'
    # 'â€“' is how an en dash reads when UTF-8 bytes are decoded as cp1252.
    # Both spellings are accepted.
    # NOTE(review): presumably the dataset's "no value" marker; confirm.
    if '–' in description or 'â€“' in description:
        return 'Unknown'
    return 'M'


# Speed extraction is disabled, so this list stays empty; its length is still
# printed below.
num_of_speed = []
#     speed = re.search(r'(\d+)\s*[-/]*\s*Speed', row, re.IGNORECASE)
#     if speed:
#         num_of_speed.append(float(speed.group(1)))
#     else:
#         num_of_speed.append(0)
type_trans = [_transmission_type(row) for row in df['transmission']]


print(len(num_of_speed))
print(len(type_trans))


In [None]:
# Replace the raw transmission text with the coarse category labels.
df['transmission'] = type_trans
#df['num_of_speed'] = num_of_speed

In [None]:
#df[df['transmission'] == 'Transmission w/Dual Shift Mode']

In [None]:
# Missing accident reports are filled with the most frequent value.
df['accident'] = df['accident'].fillna(df['accident'].mode()[0])

In [None]:
df.isna().sum()

In [None]:
df = df.drop(['clean_title'], axis=1)

In [None]:
df.head()

In [None]:
# df['speed'] = df['transmission'].str.extract('(\d+)-Speed', expand=False)
# df['speed'] = df['speed'].fillna(df['transmission'].str.extract('(\d+)-Speed', expand=False))
# df['speed'] = df['speed'].fillna(df['transmission'].str.extract('(\d+) Speed', expand=False))
# df['speed'] = df['speed'].fillna(df['transmission'].str.extract('(\d+) ', expand=False))
# df['speed'] = df['speed'].fillna(df['transmission'].str.extract('^(\d+)', expand=False))

In [None]:
# cat_columns = df.select_dtypes(include=['object'])
# missing_cols = [col for col in cat_columns.columns if cat_columns[col].isna().any()]
#missing_cols = ['fuel_type', 'accident']

In [None]:
# for col in missing_cols:
#     df.loc[:, col] = df.loc[:, col].fillna(df.loc[:, col].mode()[0])

In [None]:
#df['clean_title'] = df['clean_title'].fillna('No')
#df = df.drop('clean_title', axis=1)

In [None]:
# X_columns = df[['brand', 'model']]
# y_target = df['price']

# encoder = ce.TargetEncoder(cols=['brand', 'model'], smoothing=0.3)
# X_encoded = encoder.fit_transform(X_columns, y_target)
# df_encoded = pd.concat([X_encoded, y], axis=1)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encoder configured to emit -1 for categories it was not fitted on.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# The shared encoder is re-fitted on each colour column in turn, on the
# combined train+test frame, and the column is overwritten with its codes.
for colour_column in ('ext_col', 'int_col'):
    df[colour_column] = ordinal_encoder.fit_transform(df[[colour_column]])

In [None]:
#df = df.drop('model', axis=1)

In [None]:
# df['brand'] = df_encoded['brand']
# df['model'] = df_encoded['model']

In [None]:
df['int_col'].unique()

In [None]:
df.head()

In [None]:
# encoder=LabelEncoder()
# # df['brand']=encoder.fit_transform(df['brand'])
# # df['model']=encoder.fit_transform(df['model'])
# #df['fuel_type']=encoder.fit_transform(df['fuel_type'])
# df['ext_col']=encoder.fit_transform(df['ext_col'])
# df['int_col']=encoder.fit_transform(df['int_col'])
# #train['accident']=encoder.fit_transform(train['accident'])
# #df['clean_title']=encoder.fit_transform(df['clean_title'])

import category_encoders as ce

encoder = ce.CountEncoder()

# Re-fit the same CountEncoder on one column at a time and overwrite the
# column with the encoder's output.
# NOTE(review): ext_col / int_col are already numeric after the ordinal step;
# confirm that CountEncoder still transforms them.
count_encoded_columns = (
    'brand',
    'model',
    'transmission',
    'engine',
    'fuel_type',
    'ext_col',
    'int_col',
)
for column_name in count_encoded_columns:
    df[column_name] = encoder.fit_transform(df[column_name])

df.head()

In [None]:
# One-hot encode the remaining columns that pandas treats as categorical, using integer 0/1 indicators.
df = pd.get_dummies(df, dtype='int')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Split the combined frame back at the train/test boundary. df was built as
# concat([train, test]), so the first len(train) rows are the training rows.
# Deriving the boundary from `train` avoids a hard-coded row count.
n_train_rows = len(train)
train_df = df.iloc[:n_train_rows, :]
test_df = df.iloc[n_train_rows:, :]

In [None]:
X = train_df.copy()

In [None]:
# The models are trained on log(price); predictions must be passed through np.exp.
y = np.log(X['price'])
X = X.drop('price', axis=1)

In [None]:
y

In [None]:
# CatBoost target encoder fitted on the training features against log(price).
# NOTE(review): after the count and one-hot steps X may contain no object columns — confirm this step still changes anything.
cbe_encoder = ce.cat_boost.CatBoostEncoder(a=1)
X_encoded = cbe_encoder.fit_transform(X, y)

In [None]:
# #for test split 0.3 is the best !!!
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from catboost import CatBoostRegressor


# CatBoost regressor; verbose=0 keeps the cross-validation output quiet.
clf_catboost = CatBoostRegressor(
    iterations=275,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=3,
    border_count=254,
    verbose=0,
)

# Shuffled 7-fold split with a fixed seed.
n_splits = 7
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# The scorer returns negated RMSE values, so flip the sign back.
negated_rmse = cross_val_score(
    clf_catboost, X_encoded, y, scoring='neg_root_mean_squared_error', cv=kf
)
rmse_scores = -negated_rmse

# Summary of the per-fold errors (on the log-price scale).
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Standard Deviation of RMSE: {std_rmse:.4f}")

# Final model trained on every training row.
clf_catboost.fit(X_encoded, y)
#72823 = 700iter
#72718 = 300iter
#72823 = 700iter
#72718 = 300iter

In [None]:
# Remove the 'price' column from the test rows; it is not a feature.
test_df = test_df.drop('price', axis=1)

In [None]:
test_df.head()

In [None]:
# Apply the encoder fitted on the training rows to the test rows.
test_X_encoded = cbe_encoder.transform(test_df)

In [None]:
test_X_encoded.head()

In [None]:
# Write the submission: the model predicts log(price), so np.exp converts back to prices.
test_id = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
predictions = clf_catboost.predict(test_X_encoded)
test_id['price'] = np.exp(predictions)
test_id.to_csv('submission_FE_catboost_1_log.csv', index=False)

In [None]:
test_id

In [None]:
# Spot check: first ten training predictions converted back to prices.
y_hat = clf_catboost.predict(X_encoded)[:10]
np.exp(y_hat)

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

# XGBoost regressor. The scikit-learn wrapper's logging parameter is
# `verbosity`; `verbose` is not one of its parameters, so the old `verbose=0`
# did not silence anything.
clf_xgboost = XGBRegressor(n_estimators=700,
                           learning_rate=0.01,
                           max_depth=6,
                           reg_alpha=3,
                           n_jobs=-1,  # Use all available cores
                           verbosity=0)  # Suppress output during cross-validation

# Define k-fold cross-validation
n_splits = 5  # number of folds
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform cross-validation and evaluate RMSE (the scorer is negated, hence the minus)
rmse_scores = -cross_val_score(clf_xgboost, X_encoded, y, scoring='neg_root_mean_squared_error', cv=kf)

# Calculate mean and standard deviation of RMSE (on the log-price scale)
mean_rmse = np.mean(rmse_scores)
std_rmse = np.std(rmse_scores)

print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Standard Deviation of RMSE: {std_rmse:.4f}")

# Fit the model on the entire dataset
clf_xgboost.fit(X_encoded, y)


In [None]:
# Spot check: first ten XGBoost training predictions converted back from log(price).
y_xgb = clf_xgboost.predict(X_encoded)
np.exp(y_xgb)[:10]

In [None]:
# Actual prices of the same ten rows, for comparison.
train_df['price'].head(10)

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from lightgbm import LGBMRegressor
import numpy as np

# LightGBM regressor; max_depth, reg_alpha and num_leaves are not set here.
# verbose=-1 keeps the cross-validation output quiet.
clf_lightgbm = LGBMRegressor(
    n_estimators=700,
    learning_rate=0.01,
    verbose=-1,
)

# Shuffled 5-fold split with a fixed seed.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# The scorer returns negated RMSE values, so flip the sign back.
negated_rmse = cross_val_score(
    clf_lightgbm, X_encoded, y, scoring='neg_root_mean_squared_error', cv=kf
)
rmse_scores = -negated_rmse

# Summary of the per-fold errors (on the log-price scale).
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

print(f"Mean RMSE (LightGBM): {mean_rmse:.4f}")
print(f"Standard Deviation of RMSE (LightGBM): {std_rmse:.4f}")

# Final model trained on every training row.
clf_lightgbm.fit(X_encoded, y)


In [None]:
# Spot check: LightGBM predictions for the first ten training rows, converted back from log(price).
y_lgbm = clf_lightgbm.predict(X_encoded[:10])
np.exp(y_lgbm)

# DON'T TOUCH ANYTHING BELOW

In [None]:
# #for test split 0.3 is the best !!!
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from catboost import CatBoostRegressor


# CatBoost regressor for the un-encoded feature frame X; border_count is not
# set here. verbose=0 keeps the cross-validation output quiet.
clf_catboost = CatBoostRegressor(
    iterations=700,
    learning_rate=0.0311,
    depth=6,
    l2_leaf_reg=3,
    verbose=0,
)

# Shuffled 5-fold split with a fixed seed.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# The scorer returns negated RMSE values, so flip the sign back.
negated_rmse = cross_val_score(
    clf_catboost, X, y, scoring='neg_root_mean_squared_error', cv=kf
)
rmse_scores = -negated_rmse

# Summary of the per-fold errors.
mean_rmse = rmse_scores.mean()
std_rmse = rmse_scores.std()

print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Standard Deviation of RMSE: {std_rmse:.4f}")

# Final model trained on every training row.
clf_catboost.fit(X, y)

In [None]:
# test_df = test_df.drop('price', axis=1)

In [None]:
test.head()

In [None]:
# Reload the raw test table (including 'id') for the legacy pipeline below.
test_dataset = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')

In [None]:
# Percentage of missing values per column, largest first.
(100* (test_dataset.isna().sum())/len(test_dataset)).sort_values(ascending=False)

In [None]:
test_dataset = test_dataset.drop('clean_title', axis=1)

In [None]:
missing_columns = ['fuel_type', 'accident']

In [None]:
# Fill each listed column with its most frequent value, computed on the test table itself.
for col in missing_columns:
    test_dataset.loc[:, col] = test_dataset.loc[:, col].fillna(test_dataset.loc[:, col].mode()[0])

In [None]:
# One LabelEncoder, re-fitted on each column of the test table in turn; the
# column is overwritten with its integer codes. 'fuel_type' is not encoded here.
encoder = LabelEncoder()
label_encoded_columns = (
    'brand',
    'model',
    'engine',
    'transmission',
    'ext_col',
    'int_col',
    'accident',
)
for column_name in label_encoded_columns:
    test_dataset[column_name] = encoder.fit_transform(test_dataset[column_name])

In [None]:
# One-hot encode whatever categorical columns remain, using integer 0/1 indicators.
test_dataset = pd.get_dummies(test_dataset, dtype='int')

In [None]:
test_dataset.head()

In [None]:
# Keep the ids for the submission file before dropping them from the features.
test_id = test_dataset['id']

In [None]:
test_dataset = test_dataset.drop('id', axis=1)

In [None]:
# submit = pd.read_csv("/kaggle/input/playground-series-s4e9/sample_submission.csv")
# pred = clf_catboost.predict(test_df)
# submit["price"] = pred
# submit.to_csv("submission.csv", index=False)
# submit.head()

In [None]:
#test_id = test_dataset['id']
# clf_catboost is fitted on y = log(price), so its output has to be converted
# back with exp before it is written out as a price.
# NOTE(review): test_dataset is label-encoded on its own and does not carry
# the engineered columns added to df (horsepower, age, premium_auto, ...);
# confirm its features match the frame the model was fitted on.
test_predictions_catboost = np.exp(clf_catboost.predict(test_dataset))
print(test_predictions_catboost[:10])
# Prepare submission: ids and predicted prices side by side.
test_predictions_catboost_df = pd.DataFrame(test_predictions_catboost, columns=['price'])
submit_df = pd.concat([test_id, test_predictions_catboost_df], axis=1)
submit_df.to_csv('submission_1.csv', index=False)

In [None]:
submit_df.head()

In [None]:
# Raw model output for the first ten training rows (log-price scale, since y is log(price)).
train_predictions_catboost = clf_catboost.predict(X.iloc[:10, :])
train_predictions_catboost

In [None]:
# train.head()

In [None]:
# Reference layout of the expected submission file.
sample = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
sample.head()