# Import libraries

In [735]:
import pandas as pd
import pyodbc
import numpy as np
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

# Get datas from database

In [736]:
server = 'DESKTOP-4I8KM5F'
database = 'cars_data'
driver = 'ODBC Driver 17 for SQL Server'
connection_string = f'DRIVER={{{driver}}};SERVER={server};DATABASE={database};Trusted_Connection=yes;'

conn = pyodbc.connect(connection_string)

In [737]:
def get_table(connection):
    cursor = connection.cursor()
    cursor.execute("SELECT table_name FROM information_schema.tables WHERE table_type = 'BASE TABLE' AND table_name LIKE '%[_]%'")
    tables = cursor.fetchall()
    cursor.close()
    return [table[0] for table in tables]

In [738]:
car_tables = get_table(conn)
dfs = {}

In [739]:
for table in car_tables:
    query = f"SELECT * FROM [{table}]"
    try:
        df = pd.read_sql(query, conn)
        dfs[table] = df
        print(f"Data from table '{table}' retrieved successfully")
    
    except Exception as e:
        print(f"Error retrieving data from table '{table}': {e}")

conn.close()

Data from table 'bonbanh_inf' retrieved successfully
Data from table 'newcar_inf' retrieved successfully
Data from table 'used_Car' retrieved successfully


  df = pd.read_sql(query, conn)


In [740]:
df_bonbanh_inf = dfs['bonbanh_inf']
df_used_Car = dfs['used_Car']
df_newcar_inf = dfs['newcar_inf']

# Pre-processing

## New cars

In [741]:
df_newcar_inf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   name              587 non-null    object
 1   link              587 non-null    object
 2   manufacturer      587 non-null    object
 3   origin            587 non-null    object
 4   body_type         587 non-null    object
 5   seating_capacity  587 non-null    object
 6   engine            587 non-null    object
 7   torque            587 non-null    object
 8   transmission      587 non-null    object
 9   max_power         587 non-null    object
 10  drive             587 non-null    object
 11  fuel_type         587 non-null    object
 12  fuel_capacity     587 non-null    object
dtypes: object(13)
memory usage: 59.7+ KB


### Replace None

In [742]:
df_newcar_inf.replace({'None': np.nan}, inplace=True)
df_newcar_inf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              587 non-null    object 
 1   link              587 non-null    object 
 2   manufacturer      586 non-null    object 
 3   origin            587 non-null    object 
 4   body_type         587 non-null    object 
 5   seating_capacity  587 non-null    object 
 6   engine            587 non-null    object 
 7   torque            0 non-null      float64
 8   transmission      587 non-null    object 
 9   max_power         530 non-null    object 
 10  drive             587 non-null    object 
 11  fuel_type         585 non-null    object 
 12  fuel_capacity     587 non-null    object 
dtypes: float64(1), object(12)
memory usage: 59.7+ KB


  df_newcar_inf.replace({'None': np.nan}, inplace=True)


### Fuel_capacity

In [743]:
df_newcar_inf['fuel_capacity'].unique()

array(['35 Lit', '33 Lit', '43 Lit', '42 Lit', '32 Lit', '0 Lit',
       '45 Lit', '40 Lit', '41 Lit', '37 Lit', '48 Lit', '44 Lit',
       '52 Lit', '50 Lit', '58 Lit', '80 Lit', '51 Lit', '46 Lit',
       '79 Lit', '55 Lit', '65 Lit', '76 Lit', '53 Lit', '47 Lit',
       '75 Lit', '54 Lit', '70 Lit', '62 Lit', '72 Lit', '60 Lit',
       '56 Lit', '78 Lit', '68 Lit', '57 Lit', '71 Lit', '103 Lit',
       '26 Lit', '85 Lit', '73 Lit', '66 Lit', '59 Lit', '88 Lit',
       '87 Lit', '67 Lit', '83 Lit', '82 Lit', '93 Lit', '90 Lit',
       '143 Lit', '89 Lit', '64 Lit', '104 Lit', '98 Lit', '84 Lit',
       '99 Lit', '77 Lit', '96 Lit'], dtype=object)

In [744]:
filtered_cars = df_newcar_inf[df_newcar_inf['fuel_capacity'] == '0 Lit']

car_names = filtered_cars['name']
print(car_names)

21     Vinfast Fadil 1.4 Tieu chuan CVT
175                Toyota Venza 2.5 CVT
244                Honda CR-V 1.5E 2021
248            Hyundai SantaFe 2.4 Xang
263                Honda CR-V 1.5G 2021
273                Honda CR-V 1.5L 2021
302      Toyota Fortuner 2.4 TRD AT 4X2
552                  BMW i8 1.5L Hybrid
585            Rolls-Royce Cullinan V12
586                Rolls-Royce Dawn V12
Name: name, dtype: object


In [745]:
df_newcar_inf.loc[df_newcar_inf['name'] == 'Vinfast Fadil 1.4 Tieu chuan CVT', 'fuel_capacity'] = '32 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Toyota Venza 2.5 CVT',  'fuel_capacity'] = '55 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Honda CR-V 1.5E 2021',  'fuel_capacity'] = '57 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Hyundai SantaFe 2.4 Xang',  'fuel_capacity'] = '71 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Honda CR-V 1.5G 2021',  'fuel_capacity'] = '57 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Honda CR-V 1.5L 2021',  'fuel_capacity'] = '57 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Toyota Fortuner 2.4 TRD AT 4X2',  'fuel_capacity'] = '80 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'BMW i8 1.5L Hybrid',  'fuel_capacity'] = '42 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Rolls-Royce Cullinan V12',  'fuel_capacity'] = '83 Lit'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Rolls-Royce Dawn V12',  'fuel_capacity'] = '78 Lit'

In [746]:
filtered_cars = df_newcar_inf[df_newcar_inf['fuel_capacity'] == '0 Lit']

car_names = filtered_cars['name']
print(car_names)

Series([], Name: name, dtype: object)


In [747]:
df_newcar_inf['fuel_capacity'] = df_newcar_inf['fuel_capacity'].str.replace(' Lit', '', regex=False).astype(int)
df_newcar_inf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              587 non-null    object 
 1   link              587 non-null    object 
 2   manufacturer      586 non-null    object 
 3   origin            587 non-null    object 
 4   body_type         587 non-null    object 
 5   seating_capacity  587 non-null    object 
 6   engine            587 non-null    object 
 7   torque            0 non-null      float64
 8   transmission      587 non-null    object 
 9   max_power         530 non-null    object 
 10  drive             587 non-null    object 
 11  fuel_type         585 non-null    object 
 12  fuel_capacity     587 non-null    int64  
dtypes: float64(1), int64(1), object(11)
memory usage: 59.7+ KB


### fuel_type

In [748]:
df_newcar_inf['fuel_type'].unique()

array(['Xang', 'Dau', nan, 'Dien'], dtype=object)

In [749]:
filtered_cars = df_newcar_inf[df_newcar_inf['fuel_type'].isna()]

car_names = filtered_cars['name']
print(car_names)

178        Honda Civic 1.8 G
245    Honda CR-V 1.5 E 2019
Name: name, dtype: object


In [750]:
df_newcar_inf.loc[df_newcar_inf['name'] == 'Honda Civic 1.8 G', 'fuel_type'] = 'Xang'
df_newcar_inf.loc[df_newcar_inf['name'] == 'Honda CR-V 1.5 E 2019',  'fuel_type'] = 'Xang'
df_newcar_inf['fuel_type'].unique()

array(['Xang', 'Dau', 'Dien'], dtype=object)

### max_power, torque, link

In [751]:
df_newcar_inf.drop(columns=['max_power', 'torque', 'link'], inplace=True)

### manufacturer

In [752]:
df_newcar_inf['manufacturer'].unique()

array(['Kia', 'Suzuki', 'Toyota', 'Hyundai', 'Chevrolet', 'Mitsubishi',
       'VinFast', 'Honda', 'Nissan', 'Mazda', 'Ford', 'Renault', 'Isuzu',
       'Volkswagen', 'Subaru', 'Peugeot', 'Land Rover', 'Mercedes',
       'Audi', 'BMW', 'Mini', 'Volvo', 'Jaguar', 'Lexus', 'Infiniti',
       'Porsche', nan, 'Cadillac', 'Lincoln', 'Maserati',
       'Mercedes Maybach', 'McLaren', 'Aston Martin', 'Bentley',
       'Rolls Royce'], dtype=object)

In [753]:
filtered_cars = df_newcar_inf[df_newcar_inf['manufacturer'].isna()]
filtered_cars['name']

464    Land Rover Range Rover Evoque 2.0L I4 Turbocha...
Name: name, dtype: object

In [754]:
df_newcar_inf.loc[464, 'manufacturer'] = 'Land Rover'

In [755]:
df_newcar_inf['manufacturer'] = df_newcar_inf['manufacturer'].replace('Mercedes Maybach', 'Mercedes')
df_newcar_inf['manufacturer'] = df_newcar_inf['manufacturer'].replace('Rolls Royce', 'Rolls-Royce')
df_newcar_inf['manufacturer'].unique()

array(['Kia', 'Suzuki', 'Toyota', 'Hyundai', 'Chevrolet', 'Mitsubishi',
       'VinFast', 'Honda', 'Nissan', 'Mazda', 'Ford', 'Renault', 'Isuzu',
       'Volkswagen', 'Subaru', 'Peugeot', 'Land Rover', 'Mercedes',
       'Audi', 'BMW', 'Mini', 'Volvo', 'Jaguar', 'Lexus', 'Infiniti',
       'Porsche', 'Cadillac', 'Lincoln', 'Maserati', 'McLaren',
       'Aston Martin', 'Bentley', 'Rolls-Royce'], dtype=object)

### origin

In [756]:
df_newcar_inf['origin'].unique()

array(['Lap rap', 'Nhap khau'], dtype=object)

### body_type, seating_capacity, engine

In [757]:
df_newcar_inf['body_type'].unique()

array(['Hatchback', 'Sedan', 'Minivan', 'SUV', 'Crossover', 'Xe ban tai',
       'Wagon', 'Van', 'Coupe', 'Convertible'], dtype=object)

In [758]:
df_newcar_inf['body_type'] = df_newcar_inf['body_type'].apply(lambda x: x.replace('Xe ban tai', 'Ban tai'))

In [759]:
df_newcar_inf['body_type'].unique()

array(['Hatchback', 'Sedan', 'Minivan', 'SUV', 'Crossover', 'Ban tai',
       'Wagon', 'Van', 'Coupe', 'Convertible'], dtype=object)

In [760]:
df_newcar_inf['seating_capacity'].unique()
df_newcar_inf['seating_capacity'] = df_newcar_inf['seating_capacity'].str.replace(' cho', '', regex=False).astype(int)

In [761]:
df_newcar_inf['engine'].unique()
df_newcar_inf['engine'] = df_newcar_inf['engine'].str.replace(' cc', '', regex=False).astype(int)

In [762]:
df_newcar_inf['engine'].unique()

array([1248,  998, 1197, 1206, 1193, 1399, 1368, 1395, 1199, 1498, 1496,
       1397, 1495, 1373, 1462, 1497, 1329, 1500, 1499, 1591, 1999, 2488,
       1353, 1598, 1599, 1596, 2198, 2393, 2457, 1796, 1898, 1364,  999,
       1000, 1798, 1799, 2396, 3198, 1998, 1996, 2402, 1988, 2359, 2999,
       1997, 2499, 2755, 1987, 1995, 2199, 2442, 2497, 2395, 2982, 1618,
       2998, 2360, 2694, 2995, 2494, 2498, 1595, 2448, 3342, 3471, 1984,
       1991, 1797, 1398, 3630, 1969, 2356, 2972, 2000, 2261, 2487, 3456,
       3778, 3498, 2979, 2996, 3696, 3457, 4608, 5663, 2894, 2993, 3649,
       3596, 4605, 4395, 2981, 5552, 3444, 3802, 4691, 6162, 3997, 5461,
       3996, 3994, 5935, 3982, 5000, 5980, 3995, 6592, 6755])

In [763]:
def round_up(x):
    return ((x + 49) // 50) * 50
df_newcar_inf['engine'] = df_newcar_inf['engine'].apply(round_up)
df_newcar_inf['engine'].unique()

array([1250, 1000, 1200, 1400, 1500, 1350, 1600, 2000, 2500, 2200, 2400,
       1800, 1900, 3200, 2450, 3000, 2800, 1650, 2700, 3350, 3500, 3650,
       2300, 3800, 3700, 4650, 5700, 2900, 3600, 4400, 5600, 3450, 3850,
       4700, 6200, 4000, 5500, 5950, 5000, 6000, 6600, 6800])

### Final

In [764]:
df_newcar_inf.columns = df_newcar_inf.columns.str.title()
df_newcar_inf.drop_duplicates()

Unnamed: 0,Name,Manufacturer,Origin,Body_Type,Seating_Capacity,Engine,Transmission,Drive,Fuel_Type,Fuel_Capacity
0,Kia Morning MT,Kia,Lap rap,Hatchback,5,1250,So san,FWD,Xang,35
1,Suzuki Celerio MT,Suzuki,Nhap khau,Hatchback,5,1000,So san,2WD,Xang,35
2,Kia Morning AT,Kia,Lap rap,Hatchback,5,1250,Tu dong,FWD,Xang,35
3,Toyota Wigo 1.2 G MT,Toyota,Nhap khau,Hatchback,5,1200,So san,FWD,Xang,33
4,Suzuki Celerio CVT,Suzuki,Nhap khau,Hatchback,5,1000,Vo cap CVT,2WD,Xang,35
...,...,...,...,...,...,...,...,...,...,...
582,Mercedes-AMG G65 6.0 V12,Mercedes,Nhap khau,SUV,5,6000,Tu dong,4WD,Xang,96
583,Aston Martin Vanquish S V12,Aston Martin,Nhap khau,Coupe,2,5950,Tu dong,RWD,Xang,77
584,Rolls-Royce Ghost V12,Rolls-Royce,Nhap khau,Sedan,5,6600,Tu dong,RWD,Xang,82
585,Rolls-Royce Cullinan V12,Rolls-Royce,Nhap khau,SUV,4,6800,Tu dong,AWD,Xang,83


In [765]:
df_newcar_inf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587 entries, 0 to 586
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Name              587 non-null    object
 1   Manufacturer      587 non-null    object
 2   Origin            587 non-null    object
 3   Body_Type         587 non-null    object
 4   Seating_Capacity  587 non-null    int64 
 5   Engine            587 non-null    int64 
 6   Transmission      587 non-null    object
 7   Drive             587 non-null    object
 8   Fuel_Type         587 non-null    object
 9   Fuel_Capacity     587 non-null    int64 
dtypes: int64(3), object(7)
memory usage: 46.0+ KB


In [766]:
df_newcar_inf.head()

Unnamed: 0,Name,Manufacturer,Origin,Body_Type,Seating_Capacity,Engine,Transmission,Drive,Fuel_Type,Fuel_Capacity
0,Kia Morning MT,Kia,Lap rap,Hatchback,5,1250,So san,FWD,Xang,35
1,Suzuki Celerio MT,Suzuki,Nhap khau,Hatchback,5,1000,So san,2WD,Xang,35
2,Kia Morning AT,Kia,Lap rap,Hatchback,5,1250,Tu dong,FWD,Xang,35
3,Toyota Wigo 1.2 G MT,Toyota,Nhap khau,Hatchback,5,1200,So san,FWD,Xang,33
4,Suzuki Celerio CVT,Suzuki,Nhap khau,Hatchback,5,1000,Vo cap CVT,2WD,Xang,35


## Bonbanh.com

In [767]:
df_bonbanh_inf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4139 entries, 0 to 4138
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         4139 non-null   object
 1   Price        4139 non-null   object
 2   SellDate     4139 non-null   object
 3   Date         4139 non-null   object
 4   Condition    4139 non-null   object
 5   Mileage      4139 non-null   object
 6   Origin       4139 non-null   object
 7   BodyType     4139 non-null   object
 8   EngineType   4139 non-null   object
 9   Color        4139 non-null   object
 10  ColorInside  4139 non-null   object
 11  Seats        4139 non-null   object
 12  Doors        4139 non-null   object
 13  City         4139 non-null   object
dtypes: object(14)
memory usage: 452.8+ KB


### Replace None

In [768]:
df_bonbanh_inf.replace({'-': np.nan, 'None': np.nan}, inplace=True)
df_bonbanh_inf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4139 entries, 0 to 4138
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         4139 non-null   object
 1   Price        4139 non-null   object
 2   SellDate     4139 non-null   object
 3   Date         4139 non-null   object
 4   Condition    4139 non-null   object
 5   Mileage      4139 non-null   object
 6   Origin       4139 non-null   object
 7   BodyType     4139 non-null   object
 8   EngineType   4139 non-null   object
 9   Color        4030 non-null   object
 10  ColorInside  4085 non-null   object
 11  Seats        4139 non-null   object
 12  Doors        3179 non-null   object
 13  City         4139 non-null   object
dtypes: object(14)
memory usage: 452.8+ KB


### Condition   

In [769]:
df_bonbanh_inf['Condition'].unique()

array(['Xe da dung', 'Xe moi'], dtype=object)

In [770]:
df_bonbanh_inf = df_bonbanh_inf[df_bonbanh_inf['Condition'] != 'Xe moi']
df_bonbanh_inf.drop(columns=['Condition'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bonbanh_inf.drop(columns=['Condition'], inplace=True)


In [771]:
df_bonbanh_inf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3179 entries, 0 to 4135
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         3179 non-null   object
 1   Price        3179 non-null   object
 2   SellDate     3179 non-null   object
 3   Date         3179 non-null   object
 4   Mileage      3179 non-null   object
 5   Origin       3179 non-null   object
 6   BodyType     3179 non-null   object
 7   EngineType   3179 non-null   object
 8   Color        3174 non-null   object
 9   ColorInside  3125 non-null   object
 10  Seats        3179 non-null   object
 11  Doors        3179 non-null   object
 12  City         3179 non-null   object
dtypes: object(13)
memory usage: 347.7+ KB


### Name

#### Checking


In [772]:
filtered_cars = df_bonbanh_inf['name'].apply(lambda x: x.split(" - ")[0].strip()[-4:])
print(filtered_cars.apply(lambda x: x.isdigit()).all())
filtered_cars.unique()

False


array(['2021', '2009', '2020', '2017', '2023', '2022', '2019', '2010',
       '2013', '2011', '2007', '2014', '2016', '2018', '2012', '2015',
       '2005', '2024', '2008', '2006', '2004', '4 AT', '1995', '1990',
       '2002', '2003', '2001', '2000'], dtype=object)

In [773]:
filtered_cars = df_bonbanh_inf[['name']].copy()

filtered_cars['year'] = filtered_cars['name'].apply(lambda x: x.split(" - ")[0].strip()[-4:])
filtered_cars['is_valid_year'] = filtered_cars['year'].apply(lambda x: x.isdigit())

invalid_years = filtered_cars[~filtered_cars['is_valid_year']]

print(invalid_years)

                                           name  year  is_valid_year
1229  Xe Honda CRV 2.4 AT - TG 2017 - 569 Trieu  4 AT          False
1617  Xe Honda CRV 2.4 AT - TG 2017 - 595 Trieu  4 AT          False
2764  Xe Honda CRV 2.4 AT - TG 2017 - 625 Trieu  4 AT          False
2767  Xe Honda CRV 2.4 AT - TG 2017 - 625 Trieu  4 AT          False
3142  Xe Honda CRV 2.4 AT - TG 2016 - 569 Trieu  4 AT          False
3450  Xe Honda CRV 2.4 AT - TG 2017 - 635 Trieu  4 AT          False
3867  Xe Honda CRV 2.4 AT - TG 2017 - 615 Trieu  4 AT          False


#### Formating name

In [774]:
def extract_car_name(car_info):
    if car_info == 'Honda CRV 2.4 AT':
        return 'Xe Honda CRV 2.4'
    name_with_year = car_info.split(" - ")[0].strip()
    return name_with_year[:-5].strip()

In [775]:
df_bonbanh_inf['name'] = df_bonbanh_inf['name'].apply(lambda x: x.replace("Xe ", ""))
df_bonbanh_inf['name'] = df_bonbanh_inf['name'].apply(extract_car_name)
df_bonbanh_inf.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bonbanh_inf['name'] = df_bonbanh_inf['name'].apply(lambda x: x.replace("Xe ", ""))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bonbanh_inf['name'] = df_bonbanh_inf['name'].apply(extract_car_name)


Unnamed: 0,name,Price,SellDate,Date,Mileage,Origin,BodyType,EngineType,Color,ColorInside,Seats,Doors,City
0,Toyota Camry 2.5Q,999 Trieu,19/10/2024,2021,"20,000 Km",Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Dien Bien
1,Chevrolet Captiva LTZ Maxx 2.4 AT,178 Trieu,19/10/2024,2009,"90,000 Km",Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Dong Nai
2,Mercedes Benz S class S450L,2 Ty 695 Trieu,19/10/2024,2020,"39,000 Km",Lap rap trong nuoc,Sedan,Xang 3.0 L,Trang,Den,5 cho,4 cua,Can Tho
3,Toyota Prado VX 2.7L,2 Ty 390 Trieu,20/10/2024,2021,"108,000 Km",Nhap khau,SUV,Xang 2.7 L,Den,Den,7 cho,5 cua,Hai Phong
4,Toyota Camry 2.5Q,999 Trieu,19/10/2024,2021,"20,000 Km",Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Ba Ria Vung Tau
5,Chevrolet Captiva LTZ Maxx 2.4 AT,178 Trieu,19/10/2024,2009,"90,000 Km",Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Ha Noi
6,Mercedes Benz S class S450L,2 Ty 695 Trieu,19/10/2024,2020,"39,000 Km",Lap rap trong nuoc,Sedan,Xang 3.0 L,Trang,Den,5 cho,4 cua,Nghe An
7,Toyota Prado VX 2.7L,2 Ty 390 Trieu,20/10/2024,2021,"108,000 Km",Nhap khau,SUV,Xang 2.7 L,Den,Den,7 cho,5 cua,Quang Ninh
8,Toyota Camry 2.5Q,999 Trieu,19/10/2024,2021,"20,000 Km",Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Tp.HCM
9,Chevrolet Captiva LTZ Maxx 2.4 AT,178 Trieu,19/10/2024,2009,"90,000 Km",Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Phu Tho


In [776]:
filtered_data = df_bonbanh_inf[df_bonbanh_inf['name'].str.contains('Honda CRV 2.4 AT', case=False)]

print(filtered_data)

                  name      Price     SellDate  Date     Mileage  \
403   Honda CRV 2.4 AT  435 Trieu  19/10/2024   2013   85,000 Km   
1282  Honda CRV 2.4 AT  555 Trieu  19/10/2024   2015  109,000 Km   
1532  Honda CRV 2.4 AT  555 Trieu  20/10/2024   2015   80,000 Km   
2528  Honda CRV 2.4 AT  495 Trieu  20/10/2024   2014   93,000 Km   
3031  Honda CRV 2.4 AT  348 Trieu  19/10/2024   2011   10,500 Km   
3117  Honda CRV 2.4 AT  335 Trieu  20/10/2024   2010  137,000 Km   
3159  Honda CRV 2.4 AT  458 Trieu  19/10/2024   2014  120,000 Km   
3543  Honda CRV 2.4 AT  380 Trieu  19/10/2024   2012   92,000 Km   

                  Origin BodyType  EngineType  Color ColorInside  Seats  \
403   Lap rap trong nuoc      SUV  Xang 2.4 L    Bac         Den  5 cho   
1282  Lap rap trong nuoc      SUV  Xang 2.4 L    Nau         Den  5 cho   
1532  Lap rap trong nuoc      SUV  Xang 2.4 L    Den         Den  5 cho   
2528  Lap rap trong nuoc      SUV  Xang 2.4 L    Nau         Den  5 cho   
3031  Lap ra

### Price

In [777]:
def convert_price_to_number(price_str):
    price_str = price_str.replace("Trieu", "").strip()

    parts = price_str.split(" ")

    total = 0
    if 'Ty' in parts:
        if len(parts) == 2:
            total += int(parts[0])*1000
        else:
            total += int(parts[0]) * 1000 + int(parts[2])
    else:
        total += int(parts[0])

    return total

df_bonbanh_inf['Price'] = df_bonbanh_inf['Price'].apply(convert_price_to_number)

df_bonbanh_inf.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bonbanh_inf['Price'] = df_bonbanh_inf['Price'].apply(convert_price_to_number)


Unnamed: 0,name,Price,SellDate,Date,Mileage,Origin,BodyType,EngineType,Color,ColorInside,Seats,Doors,City
0,Toyota Camry 2.5Q,999,19/10/2024,2021,"20,000 Km",Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Dien Bien
1,Chevrolet Captiva LTZ Maxx 2.4 AT,178,19/10/2024,2009,"90,000 Km",Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Dong Nai
2,Mercedes Benz S class S450L,2695,19/10/2024,2020,"39,000 Km",Lap rap trong nuoc,Sedan,Xang 3.0 L,Trang,Den,5 cho,4 cua,Can Tho
3,Toyota Prado VX 2.7L,2390,20/10/2024,2021,"108,000 Km",Nhap khau,SUV,Xang 2.7 L,Den,Den,7 cho,5 cua,Hai Phong
4,Toyota Camry 2.5Q,999,19/10/2024,2021,"20,000 Km",Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Ba Ria Vung Tau
5,Chevrolet Captiva LTZ Maxx 2.4 AT,178,19/10/2024,2009,"90,000 Km",Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Ha Noi
6,Mercedes Benz S class S450L,2695,19/10/2024,2020,"39,000 Km",Lap rap trong nuoc,Sedan,Xang 3.0 L,Trang,Den,5 cho,4 cua,Nghe An
7,Toyota Prado VX 2.7L,2390,20/10/2024,2021,"108,000 Km",Nhap khau,SUV,Xang 2.7 L,Den,Den,7 cho,5 cua,Quang Ninh
8,Toyota Camry 2.5Q,999,19/10/2024,2021,"20,000 Km",Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Tp.HCM
9,Chevrolet Captiva LTZ Maxx 2.4 AT,178,19/10/2024,2009,"90,000 Km",Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Phu Tho


In [778]:
print(df_bonbanh_inf[df_bonbanh_inf['Price'] < 100]['Price'].unique())

[99 70 89 90 95 25 65 46 19]


In [779]:
df_bonbanh_inf[df_bonbanh_inf['Price'] == 19]

Unnamed: 0,name,Price,SellDate,Date,Mileage,Origin,BodyType,EngineType,Color,ColorInside,Seats,Doors,City
3984,Toyota Hiace Van 2.4,19,20/10/2024,2004,0 Km,Lap rap trong nuoc,Van/Minivan,Xang 2.4 L,Cat,,3 cho,4 cua,Bac Ninh


In [780]:
df_bonbanh_inf[df_bonbanh_inf['Price'] == 25]

Unnamed: 0,name,Price,SellDate,Date,Mileage,Origin,BodyType,EngineType,Color,ColorInside,Seats,Doors,City
2344,Nissan Maxima 3.0 AT,25,20/10/2024,1990,0 Km,Nhap khau,Sedan,Xang 3.0 L,Cat,Nau,5 cho,4 cua,Tp.HCM
2353,Nissan Maxima 3.0 AT,25,20/10/2024,1990,0 Km,Nhap khau,Sedan,Xang 3.0 L,Cat,Nau,5 cho,4 cua,Gia Lai


### Year, Date, SellDate

In [781]:
df_bonbanh_inf['SellDate'].unique()

array(['19/10/2024 ', '20/10/2024 '], dtype=object)

In [782]:
df_bonbanh_inf['Date'] = df_bonbanh_inf['Date'].astype(int)
df_bonbanh_inf['Date'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bonbanh_inf['Date'] = df_bonbanh_inf['Date'].astype(int)


array([2021, 2009, 2020, 2017, 2023, 2022, 2019, 2010, 2013, 2011, 2007,
       2014, 2016, 2018, 2012, 2015, 2005, 2024, 2008, 2006, 2004, 1995,
       1989, 2002, 2003, 2001, 1990, 2000])

In [783]:
df_bonbanh_inf = df_bonbanh_inf[df_bonbanh_inf['Date'] > 2000]

In [784]:
df_bonbanh_inf['Year'] = 2024 - df_bonbanh_inf['Date']
df_bonbanh_inf.rename(columns={'Date': 'Public Year'}, inplace=True)
df_bonbanh_inf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3172 entries, 0 to 4135
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         3172 non-null   object
 1   Price        3172 non-null   int64 
 2   SellDate     3172 non-null   object
 3   Public Year  3172 non-null   int64 
 4   Mileage      3172 non-null   object
 5   Origin       3172 non-null   object
 6   BodyType     3172 non-null   object
 7   EngineType   3172 non-null   object
 8   Color        3167 non-null   object
 9   ColorInside  3119 non-null   object
 10  Seats        3172 non-null   object
 11  Doors        3172 non-null   object
 12  City         3172 non-null   object
 13  Year         3172 non-null   int64 
dtypes: int64(3), object(11)
memory usage: 371.7+ KB


### Mileage

In [785]:
df_bonbanh_inf['Mileage'].unique()

array(['20,000 Km', '90,000 Km', '39,000 Km', '108,000 Km', '36,000 Km',
       '92,000 Km', '50,000 Km', '78,700 Km', '55,683 Km', '29,000 Km',
       '19,000 Km', '7,000 Km', '60,000 Km', '27,500 Km', '27,000 Km',
       '30,000 Km', '61,000 Km', '2,000 Km', '32,000 Km', '22,000 Km',
       '45,000 Km', '0 Km', '95,000 Km', '28,000 Km', '25,000 Km',
       '86,000 Km', '70,000 Km', '58,000 Km', '110,000 Km', '68,000 Km',
       '160,000 Km', '10,000 Km', '130,000 Km', '80,000 Km', '144,000 Km',
       '67,000 Km', '41,000 Km', '46,000 Km', '87,800 Km', '64,800 Km',
       '72,000 Km', '81,000 Km', '42,000 Km', '158,638 Km', '35,121 Km',
       '77,000 Km', '98,000 Km', '24,000 Km', '40,000 Km', '49,000 Km',
       '64,000 Km', '120,000 Km', '37,000 Km', '104,000 Km', '4,000 Km',
       '5,300 Km', '5 Km', '1,405 Km', '50,300 Km', '85,000 Km',
       '15,000 Km', '63,000 Km', '87,600 Km', '55,555 Km', '34,000 Km',
       '26,000 Km', '100,000 Km', '52,000 Km', '66,666 Km', '106,836 Km

In [786]:
df_bonbanh_inf['Mileage'] = df_bonbanh_inf['Mileage'].apply(lambda x: x.replace(" Km", "").replace(",", ""))

df_bonbanh_inf['Mileage'] = df_bonbanh_inf['Mileage'].astype(int)

In [787]:
df_bonbanh_inf.head(10)

Unnamed: 0,name,Price,SellDate,Public Year,Mileage,Origin,BodyType,EngineType,Color,ColorInside,Seats,Doors,City,Year
0,Toyota Camry 2.5Q,999,19/10/2024,2021,20000,Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Dien Bien,3
1,Chevrolet Captiva LTZ Maxx 2.4 AT,178,19/10/2024,2009,90000,Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Dong Nai,15
2,Mercedes Benz S class S450L,2695,19/10/2024,2020,39000,Lap rap trong nuoc,Sedan,Xang 3.0 L,Trang,Den,5 cho,4 cua,Can Tho,4
3,Toyota Prado VX 2.7L,2390,20/10/2024,2021,108000,Nhap khau,SUV,Xang 2.7 L,Den,Den,7 cho,5 cua,Hai Phong,3
4,Toyota Camry 2.5Q,999,19/10/2024,2021,20000,Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Ba Ria Vung Tau,3
5,Chevrolet Captiva LTZ Maxx 2.4 AT,178,19/10/2024,2009,90000,Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Ha Noi,15
6,Mercedes Benz S class S450L,2695,19/10/2024,2020,39000,Lap rap trong nuoc,Sedan,Xang 3.0 L,Trang,Den,5 cho,4 cua,Nghe An,4
7,Toyota Prado VX 2.7L,2390,20/10/2024,2021,108000,Nhap khau,SUV,Xang 2.7 L,Den,Den,7 cho,5 cua,Quang Ninh,3
8,Toyota Camry 2.5Q,999,19/10/2024,2021,20000,Nhap khau,Sedan,Xang 2.5 L,Trang,Kem,5 cho,4 cua,Tp.HCM,3
9,Chevrolet Captiva LTZ Maxx 2.4 AT,178,19/10/2024,2009,90000,Lap rap trong nuoc,SUV,Xang 2.4 L,Den,Kem,7 cho,5 cua,Phu Tho,15


### Origin

In [788]:
df_bonbanh_inf['Origin'].unique()
df_bonbanh_inf['Origin'] = df_bonbanh_inf['Origin'].replace("Lap rap trong nuoc", "Lap rap")

In [789]:
df_bonbanh_inf['Origin'].unique()

array(['Nhap khau', 'Lap rap'], dtype=object)

### BodyType

In [790]:
df_bonbanh_inf['BodyType'].unique()

array(['Sedan', 'SUV', 'Hatchback', 'Van/Minivan', 'Crossover',
       'Ban tai / Pickup', 'Coupe', 'Convertible/Cabriolet', 'Truck'],
      dtype=object)

In [791]:
df_bonbanh_inf['BodyType'] = df_bonbanh_inf['BodyType'].replace("Van/Minivan", "Van")
df_bonbanh_inf['BodyType'] = df_bonbanh_inf['BodyType'].replace('Convertible/Cabriolet', 'Convertible')
df_bonbanh_inf['BodyType'] = df_bonbanh_inf['BodyType'].replace('Ban tai / Pickup', 'Ban tai')

In [792]:
df_bonbanh_inf = df_bonbanh_inf[df_bonbanh_inf['BodyType'] != 'Truck']

In [793]:
df_bonbanh_inf['BodyType'].unique()

array(['Sedan', 'SUV', 'Hatchback', 'Van', 'Crossover', 'Ban tai',
       'Coupe', 'Convertible'], dtype=object)

### EngineType, Engine


In [794]:
df_bonbanh_inf['EngineType'].unique()

array(['Xang 2.5 L', 'Xang 2.4 L', 'Xang 3.0 L', 'Xang 2.7 L',
       'Xang 1.4 L', 'Xang 1.5 L', 'Xang 1.8 L', 'Xang 2.0 L',
       'Xang 1.2 L', 'Xang 6.6 L', 'Xang 1.1 L', 'Dau 2.5 L',
       'Xang 3.5 L', 'Dau 2.4 L', 'Xang 4.6 L', 'Dau 2.8 L', 'Xang 1.6 L',
       'Dau 2.2 L', 'Xang 5.0 L', 'Xang 2.3 L', 'Dau 2.0 L', 'Dien 0.2 L',
       'Xang 1.25 L', 'Xang 3.6 L', 'Xang 1.0 L', 'Dau 3.0 L',
       'Xang 4.0 L', 'Dien 0.1 L', 'Xang 6.2 L', 'Xang 2.9 L',
       'Xang 3.3 L', 'Dien', 'Xang 4.8 L', 'Xang 5.7 L', 'Xang 4.5 L',
       'Hybrid 1.8 L', 'Xang 3.4 L', 'Xang 4.7 L', 'Xang 4.3 L',
       'Xang 6.8 L', 'Xang 2.6 L', 'Dau 3.2 L', 'Dau 2.1 L',
       'Hybrid 3.5 L', 'Hybrid 2.0 L', 'Xang 4.2 L', 'Xang 5.5 L',
       'Xang 1.3 L', 'Xang 3.9 L', 'Xang 3.7 L', 'Dau 2.3 L',
       'Hybrid 2.5 L', 'Xang', 'Xang 0.8 L', 'Dau 4.4 L', 'Xang 6.7 L',
       'Xang 6.0 L', 'Hybrid 1.5 L', 'Xang 5.6 L', 'Xang 6.5 L',
       'Xang 3.8 L', 'Dau 1.7 L', 'Dau 1.6 L', 'Xang 3.2 L', 'Dau 1.9 L',

In [795]:
def extract_fuel_and_engine(engine_type):
    parts = engine_type.split(' ')
    fuel_type = parts[0]
    if fuel_type == "Dien":
        engine_size = 0
    elif len(parts) > 1:
        engine_size = float(parts[1].replace('L', '')) * 1000
    else:
        engine_size = None
    return fuel_type, engine_size

df_bonbanh_inf['FuelType'], df_bonbanh_inf['Engine'] = zip(*df_bonbanh_inf['EngineType'].apply(extract_fuel_and_engine))



In [796]:
df_bonbanh_inf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3168 entries, 0 to 4135
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         3168 non-null   object 
 1   Price        3168 non-null   int64  
 2   SellDate     3168 non-null   object 
 3   Public Year  3168 non-null   int64  
 4   Mileage      3168 non-null   int64  
 5   Origin       3168 non-null   object 
 6   BodyType     3168 non-null   object 
 7   EngineType   3168 non-null   object 
 8   Color        3163 non-null   object 
 9   ColorInside  3115 non-null   object 
 10  Seats        3168 non-null   object 
 11  Doors        3168 non-null   object 
 12  City         3168 non-null   object 
 13  Year         3168 non-null   int64  
 14  FuelType     3168 non-null   object 
 15  Engine       3164 non-null   float64
dtypes: float64(1), int64(4), object(11)
memory usage: 420.8+ KB


In [797]:
filtered_data = df_bonbanh_inf[df_bonbanh_inf['Engine'].isnull()]
filtered_data.head()

Unnamed: 0,name,Price,SellDate,Public Year,Mileage,Origin,BodyType,EngineType,Color,ColorInside,Seats,Doors,City,Year,FuelType,Engine
1376,Mercedes Benz C class,210,20/10/2024,2008,16000,Lap rap,Sedan,Xang,Den,Den,5 cho,0 cua,Tp.HCM,16,Xang,
1402,Hyundai i10 Grand 1.2AT,365,19/10/2024,2022,56000,Lap rap,Sedan,Xang,Do,Den,5 cho,0 cua,Vinh Phuc,2,Xang,
2271,Toyota Vios,240,19/10/2024,2015,9,Lap rap,Sedan,Xang,Trang,,5 cho,0 cua,Tp.HCM,9,Xang,
4010,Thaco KB1,660,20/10/2024,2009,500000,Lap rap,Van,Dau,Xanh,Nhieu mau,47 cho,1 cua,Ha Noi,15,Dau,


In [798]:
df_bonbanh_inf.loc[1376, 'Engine'] = 2500
df_bonbanh_inf.loc[1402, 'Engine'] = 1250
df_bonbanh_inf.loc[2271, 'Engine'] = 1500
df_bonbanh_inf.drop(index=4010, inplace=True)
filtered_data = df_bonbanh_inf[df_bonbanh_inf['EngineType'].isnull()]
filtered_data.head()
df_bonbanh_inf['Engine'] = df_bonbanh_inf['Engine'].astype(int)

In [799]:
df_bonbanh_inf.drop(columns='EngineType', inplace=True)

### Color, ColorInside

In [800]:
df_bonbanh_inf.drop(columns=['Color'], inplace=True)
df_bonbanh_inf.drop(columns=['ColorInside'], inplace=True)

### Seats, Doors

In [801]:
df_bonbanh_inf['Seats'].unique()
df_bonbanh_inf['Seats'] = df_bonbanh_inf['Seats'].str.replace(' cho', '', regex=False).astype(int)
df_bonbanh_inf.drop(columns=['Doors'], inplace=True)

In [802]:
df_bonbanh_inf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3167 entries, 0 to 4135
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         3167 non-null   object
 1   Price        3167 non-null   int64 
 2   SellDate     3167 non-null   object
 3   Public Year  3167 non-null   int64 
 4   Mileage      3167 non-null   int64 
 5   Origin       3167 non-null   object
 6   BodyType     3167 non-null   object
 7   Seats        3167 non-null   int64 
 8   City         3167 non-null   object
 9   Year         3167 non-null   int64 
 10  FuelType     3167 non-null   object
 11  Engine       3167 non-null   int64 
dtypes: int64(6), object(6)
memory usage: 321.6+ KB


### Manufacture

In [803]:
df_bonbanh_inf['Manufacturer'] = df_bonbanh_inf['name'].apply(lambda x: x.split(" ")[0])
df_bonbanh_inf['Manufacturer'].unique()

array(['Toyota', 'Chevrolet', 'Mercedes', 'VinFast', 'Volkswagen', 'Audi',
       'Mitsubishi', 'Porsche', 'LandRover', 'Hyundai', 'Rolls', 'Suzuki',
       'Kia', 'BMW', 'Lexus', 'Ford', 'Mazda', 'Mini', 'Honda',
       'Cadillac', 'MG', 'Peugeot', 'Daewoo', 'Scion', 'Infiniti', 'Fiat',
       'Isuzu', 'Volvo', 'Nissan', 'Bentley', 'Ssangyong', 'Haima',
       'Maserati', 'Ferrari', 'Acura', 'Jaguar', 'Subaru', 'Renault',
       'Haval', 'Baic', 'Wuling', 'McLaren', 'Jeep', 'Maybach', 'Smart',
       'Lamborghini', 'Daihatsu'], dtype=object)

In [804]:
df_bonbanh_inf['Manufacturer'] = df_bonbanh_inf['Manufacturer'].replace({'Rolls': 'Rolls-Royce'})
df_bonbanh_inf['Manufacturer'] = df_bonbanh_inf['Manufacturer'].replace({'LandRover': 'Land Rover'})
df_bonbanh_inf['Manufacturer'].unique()

array(['Toyota', 'Chevrolet', 'Mercedes', 'VinFast', 'Volkswagen', 'Audi',
       'Mitsubishi', 'Porsche', 'Land Rover', 'Hyundai', 'Rolls-Royce',
       'Suzuki', 'Kia', 'BMW', 'Lexus', 'Ford', 'Mazda', 'Mini', 'Honda',
       'Cadillac', 'MG', 'Peugeot', 'Daewoo', 'Scion', 'Infiniti', 'Fiat',
       'Isuzu', 'Volvo', 'Nissan', 'Bentley', 'Ssangyong', 'Haima',
       'Maserati', 'Ferrari', 'Acura', 'Jaguar', 'Subaru', 'Renault',
       'Haval', 'Baic', 'Wuling', 'McLaren', 'Jeep', 'Maybach', 'Smart',
       'Lamborghini', 'Daihatsu'], dtype=object)

### City

In [805]:
df_bonbanh_inf['City'].unique()

array(['Dien Bien', 'Dong Nai', 'Can Tho', 'Hai Phong', 'Ba Ria Vung Tau',
       'Ha Noi', 'Nghe An', 'Quang Ninh', 'Tp.HCM', 'Phu Tho',
       'Quang Tri', 'Binh Duong', 'Bac Giang', 'Hoa Binh', 'Son La',
       'Lam Dong', 'Hung Yen', 'Vinh Phuc', 'Nam Dinh', 'Thua Thien Hue',
       'Thai Binh', 'Phu Yen', 'Hai Duong', 'Cao Bang', 'Binh Dinh',
       'Thanh Hoa', 'Gia Lai', 'Long An', 'Bac Ninh', 'Dak Lak'],
      dtype=object)

### Final

In [806]:
df_bonbanh_inf.columns = df_bonbanh_inf.columns.str.title()
df_bonbanh_inf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3167 entries, 0 to 4135
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          3167 non-null   object
 1   Price         3167 non-null   int64 
 2   Selldate      3167 non-null   object
 3   Public Year   3167 non-null   int64 
 4   Mileage       3167 non-null   int64 
 5   Origin        3167 non-null   object
 6   Bodytype      3167 non-null   object
 7   Seats         3167 non-null   int64 
 8   City          3167 non-null   object
 9   Year          3167 non-null   int64 
 10  Fueltype      3167 non-null   object
 11  Engine        3167 non-null   int64 
 12  Manufacturer  3167 non-null   object
dtypes: int64(6), object(7)
memory usage: 346.4+ KB


## Oto.com

In [807]:
df_used_Car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1174 entries, 0 to 1173
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   car_id        1174 non-null   int64 
 1   title         1174 non-null   object
 2   year          1174 non-null   object
 3   Body_Type     1174 non-null   object
 4   Origin        1174 non-null   object
 5   Mileage       1174 non-null   object
 6   City          1174 non-null   object
 7   District      1174 non-null   object
 8   Transmission  1174 non-null   object
 9   Fuel_Type     1174 non-null   object
 10  Price         1174 non-null   object
 11  sale_date     1174 non-null   object
 12  Manufacturer  1174 non-null   object
dtypes: int64(1), object(12)
memory usage: 119.4+ KB


### Replace None

In [808]:
df_used_Car.replace({'None': np.nan}, inplace=True)
df_used_Car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1174 entries, 0 to 1173
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   car_id        1174 non-null   int64 
 1   title         1174 non-null   object
 2   year          1174 non-null   object
 3   Body_Type     992 non-null    object
 4   Origin        1174 non-null   object
 5   Mileage       1170 non-null   object
 6   City          1174 non-null   object
 7   District      1094 non-null   object
 8   Transmission  1174 non-null   object
 9   Fuel_Type     1174 non-null   object
 10  Price         1174 non-null   object
 11  sale_date     1174 non-null   object
 12  Manufacturer  1174 non-null   object
dtypes: int64(1), object(12)
memory usage: 119.4+ KB


In [809]:
df_used_Car.drop(columns='car_id', inplace=True)
df_used_Car.head()

Unnamed: 0,title,year,Body_Type,Origin,Mileage,City,District,Transmission,Fuel_Type,Price,sale_date,Manufacturer
0,2023 - Lexus LX 600,2023,SUV,Nhap khau,14000.0,Tp.HCM,Thu Duc,So tu dong,May xang,8 ti 950 trieu,22/08/2024,Lexus
1,2014 - Mercedes-Benz CLA 200 1.6 I4,2014,Sedan,Nhap khau,60000.0,Ha Noi,Nam Tu Liem,So tu dong,May xang,499 trieu,8/9/2024,Mercedes-Benz
2,2016 - Mercedes-Benz C200 2.0 AT,2016,Sedan,Trong nuoc,95000.0,Ha Noi,Nam Tu Liem,So tu dong,May xang,639 trieu,17/09/2024,Mercedes-Benz
3,2015 - Mercedes-Benz C250 AMG,2015,Sedan,Trong nuoc,100000.0,Ha Noi,Nam Tu Liem,So tu dong,May xang,666 trieu,3/9/2024,Mercedes-Benz
4,2014 - Mercedes-Benz C200 Edition C,2014,Sedan,Trong nuoc,100000.0,Ha Noi,Nam Tu Liem,So tu dong,May xang,420 trieu,2/9/2024,Mercedes-Benz


### Name

In [810]:
filtered_data = df_used_Car['title'].apply(lambda x:x.split(" - ")[0])
filtered_data.unique()

array(['2023', '2014', '2016', '2015', '2017', '2021', '2022', '2019',
       '2018', '2007', '2024', '2009', '2011', '2006', '2003', '2020',
       '2013', '2010', '2012', '2008', '2000', '1995', '1999', '2004',
       '2002', '2005'], dtype=object)

In [811]:
filtered_data = df_used_Car['title'].apply(lambda x:len(x.split(" - ")))
filtered_data.unique()

array([2])

In [812]:
df_used_Car['name'] = df_used_Car['title'].apply(lambda x:x.split(" - ")[1])
df_used_Car.drop(columns='title', inplace=True)
df_used_Car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1174 entries, 0 to 1173
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   year          1174 non-null   object
 1   Body_Type     992 non-null    object
 2   Origin        1174 non-null   object
 3   Mileage       1170 non-null   object
 4   City          1174 non-null   object
 5   District      1094 non-null   object
 6   Transmission  1174 non-null   object
 7   Fuel_Type     1174 non-null   object
 8   Price         1174 non-null   object
 9   sale_date     1174 non-null   object
 10  Manufacturer  1174 non-null   object
 11  name          1174 non-null   object
dtypes: object(12)
memory usage: 110.2+ KB


### Public Year

In [813]:
df_used_Car['year'].unique()

array(['2023', '2014', '2016', '2015', '2017', '2021', '2022', '2019',
       '2018', '2007', '2024', '2009', '2011', '2006', '2003', '2020',
       '2013', '2010', '2012', '2008', '2000', '1995', '1999', '2004',
       '2002', '2005'], dtype=object)

In [814]:
df_used_Car['year'] = df_used_Car['year'].astype(float)
df_used_Car['year'] = df_used_Car['year'].astype(int)
df_used_Car['year'].unique()

array([2023, 2014, 2016, 2015, 2017, 2021, 2022, 2019, 2018, 2007, 2024,
       2009, 2011, 2006, 2003, 2020, 2013, 2010, 2012, 2008, 2000, 1995,
       1999, 2004, 2002, 2005])

In [815]:
df_used_Car.rename(columns={'year': 'Public Year'}, inplace=True)

### Origin

In [816]:
df_used_Car['Origin'].unique()

array(['Nhap khau', 'Trong nuoc'], dtype=object)

In [817]:
df_used_Car['Origin'] = df_used_Car['Origin'].replace({'Trong nuoc': 'Lap rap'})
df_used_Car['Origin'].unique()

array(['Nhap khau', 'Lap rap'], dtype=object)

### Mileage

In [818]:
df_used_Car = df_used_Car.dropna(subset=['Mileage'])

In [819]:
df_used_Car['Mileage'].unique()

array(['14000.0', '60000.0', '95000.0', '100000.0', '80000.0', '82000.0',
       '41000.0', '4000.0', '48000.0', '31000.0', '70000.0', '90000.0',
       '50000.0', '32000.0', '14600.0', '29000.0', '22900.0', '85000.0',
       '10000.0', '20000.0', '150000.0', '42000.0', '5000.0', '120000.0',
       '96000.0', '39000.0', '30000.0', '19000.0', '56000.0', '56789.0',
       '180000.0', '81234.0', '105000.0', '43000.0', '110000.0',
       '140000.0', '58000.0', '91000.0', '59999.0', '33000.0', '68000.0',
       '123456.0', '13000.0', '38000.0', '135000.0', '40000.0', '88000.0',
       '9000.0', '15000.0', '109000.0', '26000.0', '1000.0', '18000.0',
       '74000.0', '12000.0', '81000.0', '53000.0', '64000.0', '79000.0',
       '55600.0', '83000.0', '36000.0', '62000.0', '35689.0', '55000.0',
       '129000.0', '44000.0', '45000.0', '130000.0', '17000.0',
       '112000.0', '91689.0', '91268.0', '36899.0', '33689.0', '45689.0',
       '78999.0', '39899.0', '56899.0', '6846.0', '28000.0', '11

In [820]:
df_used_Car['Mileage'] = df_used_Car['Mileage'].astype(float)
df_used_Car['Mileage'] = df_used_Car['Mileage'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Mileage'] = df_used_Car['Mileage'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Mileage'] = df_used_Car['Mileage'].astype(int)


### City, District

In [821]:
df_used_Car['City'].unique()

array(['Tp.HCM', 'Ha Noi', 'Hai Duong', 'Hai Phong', 'Bac Ninh',
       'Nghe An', 'Thanh Hoa', 'Quang Ninh', 'Binh Duong', 'Gia Lai',
       'Lam Dong', 'Thua Thien Hue', 'Hung Yen', 'Dak Lak', 'Nam Dinh',
       'Dong Nai', 'Long An', 'Ba Ria Vung Tau', 'Phu Tho', 'Quang Tri',
       'Dien Bien', 'Hoa Binh', 'Can Tho', 'Bac Giang', 'Thai Binh',
       'Cao Bang', 'Son La', 'Vinh Phuc', 'Binh Dinh', 'Phu Yen'],
      dtype=object)

In [822]:
df_used_Car.drop(columns='District', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car.drop(columns='District', inplace=True)


### Transmission

In [823]:
df_used_Car['Transmission'].unique()

array(['So tu dong', 'So san', 'So hon hop'], dtype=object)

### Fuel Type

In [824]:
df_used_Car['Fuel_Type'].unique()

array(['May xang', 'May dau', 'Hybrid', 'Dien'], dtype=object)

In [825]:
df_used_Car['Fuel_Type'] = df_used_Car['Fuel_Type'].replace({'May xang': 'Xang', 'May dau': 'Dau',})
df_used_Car['Fuel_Type'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Fuel_Type'] = df_used_Car['Fuel_Type'].replace({'May xang': 'Xang', 'May dau': 'Dau',})


array(['Xang', 'Dau', 'Hybrid', 'Dien'], dtype=object)

### Manufacturer

In [826]:
df_used_Car['Manufacturer'].unique()

array(['Lexus', 'Mercedes-Benz', 'Ford', 'Porsche', 'Toyota',
       'Mitsubishi', 'Nissan', 'Hyundai', 'Infiniti', 'Kia', 'Honda',
       'Chevrolet', 'VinFast', 'Bentley', 'Land Rover', 'Audi', 'BMW',
       'Daewoo', 'McLaren', 'Mazda', 'Suzuki', 'Jaguar', 'Peugeot',
       'Ferrari', 'MG', 'Volvo', 'Rolls-Royce', 'Cadillac', 'Jeep',
       'Volkswagen', 'Dodge', 'Lincoln', 'Isuzu', 'Renault', 'Mini'],
      dtype=object)

In [827]:
df_used_Car['Manufacturer'] = df_used_Car['Manufacturer'].replace({'Mercedes-Benz': 'Mercedes'})
df_used_Car['Manufacturer'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Manufacturer'] = df_used_Car['Manufacturer'].replace({'Mercedes-Benz': 'Mercedes'})


array(['Lexus', 'Mercedes', 'Ford', 'Porsche', 'Toyota', 'Mitsubishi',
       'Nissan', 'Hyundai', 'Infiniti', 'Kia', 'Honda', 'Chevrolet',
       'VinFast', 'Bentley', 'Land Rover', 'Audi', 'BMW', 'Daewoo',
       'McLaren', 'Mazda', 'Suzuki', 'Jaguar', 'Peugeot', 'Ferrari', 'MG',
       'Volvo', 'Rolls-Royce', 'Cadillac', 'Jeep', 'Volkswagen', 'Dodge',
       'Lincoln', 'Isuzu', 'Renault', 'Mini'], dtype=object)

### Price

In [828]:
def convert_price(price_str):
    price_str = price_str.replace(" trieu", "").strip()

    parts = price_str.split(" ")

    total = 0
    if 'ti' in parts:
        if len(parts) == 2:
            total += int(parts[0])*1000
        else:
            total += int(parts[0]) * 1000 + int(parts[2])
    else:
        total += int(parts[0])

    return total


In [829]:
df_used_Car['Price'].unique()

array(['8 ti 950 trieu', '499 trieu', '639 trieu', '666 trieu',
       '420 trieu', '1 ti 399 trieu', '445 trieu', '925 trieu',
       '1 ti 950 trieu', '995 trieu', '3 ti 999 trieu', '369 trieu',
       '450 trieu', '569 trieu', '415 trieu', '799 trieu',
       '1 ti 979 trieu', '4 ti 879 trieu', '385 trieu', '725 trieu',
       '870 trieu', '3 ti 180 trieu', '485 trieu', '880 trieu',
       '1 ti 799 trieu', '3 ti 430 trieu', '525 trieu', '690 trieu',
       '875 trieu', '660 trieu', '288 trieu', '939 trieu',
       '8 ti 340 trieu', '1 ti 125 trieu', '340 trieu', '495 trieu',
       '830 trieu', '115 trieu', '727 trieu', '3 ti 450 trieu',
       '435 trieu', '2 ti 299 trieu', '650 trieu', '388 trieu',
       '390 trieu', '555 trieu', '559 trieu', '685 trieu', '905 trieu',
       '310 trieu', '5 ti 599 trieu', '1 ti 499 trieu', '440 trieu',
       '950 trieu', '258 trieu', '989 trieu', '1 ti 920 trieu',
       '1 ti 740 trieu', '125 trieu', '2 ti 850 trieu', '1 ti 680 trieu',
       

In [830]:
df_used_Car['Price'] = df_used_Car['Price'].apply(convert_price)

df_used_Car.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Price'] = df_used_Car['Price'].apply(convert_price)


Unnamed: 0,Public Year,Body_Type,Origin,Mileage,City,Transmission,Fuel_Type,Price,sale_date,Manufacturer,name
0,2023,SUV,Nhap khau,14000,Tp.HCM,So tu dong,Xang,8950,22/08/2024,Lexus,Lexus LX 600
1,2014,Sedan,Nhap khau,60000,Ha Noi,So tu dong,Xang,499,8/9/2024,Mercedes,Mercedes-Benz CLA 200 1.6 I4
2,2016,Sedan,Lap rap,95000,Ha Noi,So tu dong,Xang,639,17/09/2024,Mercedes,Mercedes-Benz C200 2.0 AT
3,2015,Sedan,Lap rap,100000,Ha Noi,So tu dong,Xang,666,3/9/2024,Mercedes,Mercedes-Benz C250 AMG
4,2014,Sedan,Lap rap,100000,Ha Noi,So tu dong,Xang,420,2/9/2024,Mercedes,Mercedes-Benz C200 Edition C
5,2016,Sedan,Lap rap,80000,Ha Noi,So tu dong,Xang,1399,8/9/2024,Mercedes,Mercedes-Benz S400 3.0 V6
6,2017,SUV,Lap rap,82000,Tp.HCM,So tu dong,Xang,445,19/09/2024,Ford,Ford Focus 1.5L EcoBoost Sport
7,2021,SUV,Lap rap,41000,Tp.HCM,So tu dong,Xang,925,19/09/2024,Ford,Ford Everest Titanium 2.0 AT 4x2
8,2022,SUV,Nhap khau,4000,Tp.HCM,So tu dong,Xang,1950,19/09/2024,Ford,Ford Explorer Limited
9,2022,Ban Tai,Nhap khau,48000,Tp.HCM,So tu dong,Dau,995,15/09/2024,Ford,Ford Ranger Raptor


In [831]:
print(df_used_Car[df_used_Car['Price'] < 100]['Price'].unique())

[70 79 75 68 98 95 96 90 59]


### Year

In [832]:
df_used_Car['sale_date'].unique()

array(['22/08/2024', '8/9/2024', '17/09/2024', '3/9/2024', '2/9/2024',
       '19/09/2024', '15/09/2024', '6/9/2024', '31/08/2024', '29/08/2024',
       '5/9/2024', '23/08/2024', '30/08/2024', '9/9/2024', '20/09/2024',
       '7/9/2024', '4/9/2024', '12/9/2024', '14/09/2024', '13/09/2024',
       '28/08/2024', '16/09/2024', '27/08/2024', '11/9/2024',
       '18/09/2024', '10/9/2024', '23/09/2024', '25/08/2024',
       '26/08/2024', '25/09/2024', '1/9/2024', '24/08/2024', '24/09/2024',
       '22/09/2024'], dtype=object)

In [833]:
df_used_Car['Year'] = 2024 - df_used_Car['Public Year']
df_used_Car['Year'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Year'] = 2024 - df_used_Car['Public Year']


array([ 1, 10,  8,  9,  7,  3,  2,  5,  6, 17,  0, 15, 13, 18, 21,  4, 11,
       14, 12, 16, 24, 29, 25, 20, 22, 19])

### Body_type

In [834]:
df_used_Car.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1170 entries, 0 to 1173
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Public Year   1170 non-null   int64 
 1   Body_Type     988 non-null    object
 2   Origin        1170 non-null   object
 3   Mileage       1170 non-null   int64 
 4   City          1170 non-null   object
 5   Transmission  1170 non-null   object
 6   Fuel_Type     1170 non-null   object
 7   Price         1170 non-null   int64 
 8   sale_date     1170 non-null   object
 9   Manufacturer  1170 non-null   object
 10  name          1170 non-null   object
 11  Year          1170 non-null   int64 
dtypes: int64(4), object(8)
memory usage: 118.8+ KB


In [835]:
def find_body_type2(row):
    match = process.extractOne(row['name'], df_newcar_inf['Name'], score_cutoff=90)
    if match and match[1] > 90:
        print(f"Tên xe cũ: '{row['name']}' - Tên xe mới: '{match[0]}' - Điểm tương đồng: {match[1]}")
        index = df_newcar_inf[df_newcar_inf['Name'] == match[0]].index
        return df_newcar_inf.loc[index, 'Body_Type'].values[0]
    return None

df_used_Car['Body_Type'] = df_used_Car.apply(lambda row: find_body_type2(row) if pd.isna(row['Body_Type']) else row['Body_Type'], axis=1)

Tên xe cũ: 'Toyota Corolla Cross 1.8 G' - Tên xe mới: 'Toyota Corolla Cross 1.8 G' - Điểm tương đồng: 100
Tên xe cũ: 'Land Rover Range Rover' - Tên xe mới: 'Land Rover Range Rover Velar S' - Điểm tương đồng: 95
Tên xe cũ: 'Porsche Macan S' - Tên xe mới: 'Porsche Macan S' - Điểm tương đồng: 100
Tên xe cũ: 'Porsche Macan S' - Tên xe mới: 'Porsche Macan S' - Điểm tương đồng: 100
Tên xe cũ: 'Porsche Macan' - Tên xe mới: 'Porsche Macan 2.0' - Điểm tương đồng: 95
Tên xe cũ: 'Land Rover Range Rover Evoque 2.0 R-Dynamic SE' - Tên xe mới: 'Land Rover Range Rover Evoque 2.0L I4 Turbocharged R-Dynamic SE' - Điểm tương đồng: 92
Tên xe cũ: 'Toyota Corolla Cross 1.8 V' - Tên xe mới: 'Toyota Corolla Cross 1.8 HV' - Điểm tương đồng: 98
Tên xe cũ: 'Hyundai Santa Fe 2.2 Dau Dac biet' - Tên xe mới: 'Hyundai SantaFe 2.2 Dau dac biet' - Điểm tương đồng: 98
Tên xe cũ: 'Land Rover Range Rover AutoBiography LWB 3.0L' - Tên xe mới: 'Land Rover Range Rover Autobiography LWB' - Điểm tương đồng: 95
Tên xe cũ: 'Ch

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Body_Type'] = df_used_Car.apply(lambda row: find_body_type2(row) if pd.isna(row['Body_Type']) else row['Body_Type'], axis=1)


In [836]:
def find_body_type3(row):
    match = process.extractOne(row['name'], df_bonbanh_inf['Name'], score_cutoff=90)
    if match and match[1] > 90:
        print(f"Tên xe cũ: '{row['name']}' - Tên xe mới: '{match[0]}' - Điểm tương đồng: {match[1]}")
        index = df_bonbanh_inf[df_bonbanh_inf['Name'] == match[0]].index
        return df_bonbanh_inf.loc[index, 'Bodytype'].values[0]
    return None

df_used_Car['Body_Type'] = df_used_Car.apply(lambda row: find_body_type3(row) if pd.isna(row['Body_Type']) else row['Body_Type'], axis=1)

Tên xe cũ: 'Lexus RX 350' - Tên xe mới: 'Lexus RX 350' - Điểm tương đồng: 100
Tên xe cũ: 'Lexus LX 570' - Tên xe mới: 'Lexus LX 570' - Điểm tương đồng: 100
Tên xe cũ: 'Toyota Zace' - Tên xe mới: 'Toyota Zace GL' - Điểm tương đồng: 95
Tên xe cũ: 'Toyota Zace' - Tên xe mới: 'Toyota Zace GL' - Điểm tương đồng: 95
Tên xe cũ: 'Toyota Vios' - Tên xe mới: 'Toyota Vios' - Điểm tương đồng: 100
Tên xe cũ: 'Daewoo Lacetti SE' - Tên xe mới: 'Daewoo Lacetti SE' - Điểm tương đồng: 100
Tên xe cũ: 'Lexus RX 200t' - Tên xe mới: 'Lexus RX 200t' - Điểm tương đồng: 100
Tên xe cũ: 'Mercedes-Benz Maybach S450' - Tên xe mới: 'Mercedes Benz Maybach S400' - Điểm tương đồng: 96
Tên xe cũ: 'Mazda 6 2.0 Premium' - Tên xe mới: 'Mazda 6 2.0L Premium' - Điểm tương đồng: 97
Tên xe cũ: 'VinFast VF9 Plus' - Tên xe mới: 'VinFast VF9 Plus' - Điểm tương đồng: 100
Tên xe cũ: 'Mercedes-Benz GLS 450' - Tên xe mới: 'Mercedes Benz GLS 450 4Matic' - Điểm tương đồng: 95
Tên xe cũ: 'Mercedes-Benz C250' - Tên xe mới: 'Mercedes Ben

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Body_Type'] = df_used_Car.apply(lambda row: find_body_type3(row) if pd.isna(row['Body_Type']) else row['Body_Type'], axis=1)


In [837]:
def find_body_type(row, df):
    filtered_df = df[df['name'] != row['name']]

    match = process.extractOne(row['name'], filtered_df['name'], score_cutoff=87)
    
    if match and match[1] > 87:
        similar_car_name = match[0]
        print(f"Tên xe cũ: '{row['name']}' - Tên xe tương tự: '{similar_car_name}' - Điểm tương đồng: {match[1]}")

        body_type = df.loc[df['name'] == similar_car_name, 'Body_Type'].values[0]
        return body_type

    return None

df_used_Car['Body_Type'] = df_used_Car.apply(
    lambda row: find_body_type(row, df_used_Car) if pd.isna(row['Body_Type']) else row['Body_Type'], 
    axis=1
)

Tên xe cũ: 'Honda CR-V' - Tên xe tương tự: 'Honda CR-V 2.0' - Điểm tương đồng: 95
Tên xe cũ: 'Honda City 1.5 TOP CVT' - Tên xe tương tự: 'Honda City 1.5 CVT' - Điểm tương đồng: 95
Tên xe cũ: 'Honda CR-V' - Tên xe tương tự: 'Honda CR-V 2.0' - Điểm tương đồng: 95
Tên xe cũ: 'Mazda 3 1.5L Sedan' - Tên xe tương tự: 'Mazda 3 Sedan 1.5L Luxury' - Điểm tương đồng: 95
Tên xe cũ: 'Peugeot 2008' - Tên xe tương tự: 'Peugeot 5008' - Điểm tương đồng: 92
Tên xe cũ: 'VinFast LUX A2.0' - Tên xe tương tự: 'VinFast LUX A2.0 Cao cap' - Điểm tương đồng: 90
Tên xe cũ: 'Mazda 3' - Tên xe tương tự: 'Mazda 3 Sedan 1.5L Luxury' - Điểm tương đồng: 90
Tên xe cũ: 'Honda CR-V' - Tên xe tương tự: 'Honda CR-V 2.0' - Điểm tương đồng: 95
Tên xe cũ: 'Ford EcoSport' - Tên xe tương tự: 'Ford EcoSport Titanium 1.0L AT' - Điểm tương đồng: 90
Tên xe cũ: 'Toyota Alphard' - Tên xe tương tự: 'Toyota Alphard Executive Lounge' - Điểm tương đồng: 90
Tên xe cũ: 'Ford Focus' - Tên xe tương tự: 'Ford Focus 1.5L EcoBoost Sport' - Điể

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_used_Car['Body_Type'] = df_used_Car.apply(


In [838]:
df_used_Car.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1170 entries, 0 to 1173
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Public Year   1170 non-null   int64 
 1   Body_Type     1165 non-null   object
 2   Origin        1170 non-null   object
 3   Mileage       1170 non-null   int64 
 4   City          1170 non-null   object
 5   Transmission  1170 non-null   object
 6   Fuel_Type     1170 non-null   object
 7   Price         1170 non-null   int64 
 8   sale_date     1170 non-null   object
 9   Manufacturer  1170 non-null   object
 10  name          1170 non-null   object
 11  Year          1170 non-null   int64 
dtypes: int64(4), object(8)
memory usage: 118.8+ KB


In [839]:
df_used_Car['name'][df_used_Car['Body_Type'].isna()]

521           Ford Laser
573     MG 5 1.5L Luxury
589         Renault Wind
784          Nissan Juke
1153         Peugeot 408
Name: name, dtype: object

In [840]:
df_used_Car.loc[521, 'Body_Type'] = 'Sedan'
df_used_Car.loc[573, 'Body_Type'] = 'Crossover'
df_used_Car.loc[589, 'Body_Type'] = 'Convertible'
df_used_Car.loc[784, 'Body_Type'] = 'SUV'
df_used_Car.loc[1153, 'Body_Type'] = 'Crossover'

In [841]:
df_used_Car['name'][df_used_Car['Body_Type'].isna()]

Series([], Name: name, dtype: object)

In [842]:
df_used_Car['Body_Type'].unique()

array(['SUV', 'Sedan', 'Ban Tai', 'Hatchback', 'MPV', 'Van/Minivan',
       'Crossover', 'Sport Car', 'Coupe', 'Convertible', 'Minibus',
       'Ban tai', 'Van', 'Xe tai'], dtype=object)

### Final

In [843]:
df_used_Car.columns = df_used_Car.columns.str.title()

In [844]:
df_used_Car.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1170 entries, 0 to 1173
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Public Year   1170 non-null   int64 
 1   Body_Type     1170 non-null   object
 2   Origin        1170 non-null   object
 3   Mileage       1170 non-null   int64 
 4   City          1170 non-null   object
 5   Transmission  1170 non-null   object
 6   Fuel_Type     1170 non-null   object
 7   Price         1170 non-null   int64 
 8   Sale_Date     1170 non-null   object
 9   Manufacturer  1170 non-null   object
 10  Name          1170 non-null   object
 11  Year          1170 non-null   int64 
dtypes: int64(4), object(8)
memory usage: 151.1+ KB


# Merge data

In [845]:
df_used_Car.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1170 entries, 0 to 1173
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Public Year   1170 non-null   int64 
 1   Body_Type     1170 non-null   object
 2   Origin        1170 non-null   object
 3   Mileage       1170 non-null   int64 
 4   City          1170 non-null   object
 5   Transmission  1170 non-null   object
 6   Fuel_Type     1170 non-null   object
 7   Price         1170 non-null   int64 
 8   Sale_Date     1170 non-null   object
 9   Manufacturer  1170 non-null   object
 10  Name          1170 non-null   object
 11  Year          1170 non-null   int64 
dtypes: int64(4), object(8)
memory usage: 151.1+ KB


In [846]:
df_bonbanh_inf = df_bonbanh_inf.rename(columns={'Bodytype': 'Body_Type', 'Selldate':'Sale_Date', 'Fueltype': 'Fuel_Type'})

In [847]:
df_bonbanh_inf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3167 entries, 0 to 4135
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          3167 non-null   object
 1   Price         3167 non-null   int64 
 2   Sale_Date     3167 non-null   object
 3   Public Year   3167 non-null   int64 
 4   Mileage       3167 non-null   int64 
 5   Origin        3167 non-null   object
 6   Body_Type     3167 non-null   object
 7   Seats         3167 non-null   int64 
 8   City          3167 non-null   object
 9   Year          3167 non-null   int64 
 10  Fuel_Type     3167 non-null   object
 11  Engine        3167 non-null   int64 
 12  Manufacturer  3167 non-null   object
dtypes: int64(6), object(7)
memory usage: 475.4+ KB


Can merge: Name, Public Year, Mileage, Body_Type, Origin, City, Fuel_Type, Sale_Date, Manufacturer, Year, Price \\\
Error: Seats, Engine, Transmission

In [848]:
merge_keys = ['Name', 'Public Year', 'Mileage', 'Body_Type', 'Origin', 'City', 'Fuel_Type', 'Sale_Date', 'Manufacturer', 'Year', 'Price']

final_data = pd.merge(df_bonbanh_inf, df_used_Car, on=merge_keys, how='outer')
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          4337 non-null   object 
 1   Price         4337 non-null   int64  
 2   Sale_Date     4337 non-null   object 
 3   Public Year   4337 non-null   int64  
 4   Mileage       4337 non-null   int64  
 5   Origin        4337 non-null   object 
 6   Body_Type     4337 non-null   object 
 7   Seats         3167 non-null   float64
 8   City          4337 non-null   object 
 9   Year          4337 non-null   int64  
 10  Fuel_Type     4337 non-null   object 
 11  Engine        3167 non-null   float64
 12  Manufacturer  4337 non-null   object 
 13  Transmission  1170 non-null   object 
dtypes: float64(2), int64(4), object(8)
memory usage: 474.5+ KB


In [849]:
final_data.drop(columns=['Seats', 'Engine', 'Transmission'], inplace = True)

In [850]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          4337 non-null   object
 1   Price         4337 non-null   int64 
 2   Sale_Date     4337 non-null   object
 3   Public Year   4337 non-null   int64 
 4   Mileage       4337 non-null   int64 
 5   Origin        4337 non-null   object
 6   Body_Type     4337 non-null   object
 7   City          4337 non-null   object
 8   Year          4337 non-null   int64 
 9   Fuel_Type     4337 non-null   object
 10  Manufacturer  4337 non-null   object
dtypes: int64(4), object(7)
memory usage: 372.8+ KB


### Formating

In [851]:
for column in ['Origin', 'Body_Type', 'City', 'Fuel_Type', 'Manufacturer']:
    unique_values = final_data[column].unique()
    print(f"Giá trị duy nhất trong '{column}': {unique_values}")

Giá trị duy nhất trong 'Origin': ['Nhap khau' 'Lap rap']
Giá trị duy nhất trong 'Body_Type': ['SUV' 'Hatchback' 'Convertible' 'Sedan' 'Coupe' 'Crossover' 'Ban Tai'
 'Ban tai' 'Van/Minivan' 'Van' 'Sport Car' 'MPV' 'Minibus' 'Xe tai']
Giá trị duy nhất trong 'City': ['Tp.HCM' 'Long An' 'Nghe An' 'Lam Dong' 'Thanh Hoa' 'Ha Noi' 'Bac Ninh'
 'Hai Phong' 'Binh Dinh' 'Ba Ria Vung Tau' 'Thua Thien Hue' 'Dak Lak'
 'Thai Binh' 'Cao Bang' 'Phu Yen' 'Dien Bien' 'Bac Giang' 'Son La'
 'Quang Ninh' 'Gia Lai' 'Quang Tri' 'Vinh Phuc' 'Dong Nai' 'Binh Duong'
 'Can Tho' 'Hai Duong' 'Hung Yen' 'Phu Tho' 'Hoa Binh' 'Nam Dinh']
Giá trị duy nhất trong 'Fuel_Type': ['Xang' 'Dien' 'Dau' 'Hybrid']
Giá trị duy nhất trong 'Manufacturer': ['Acura' 'Audi' 'BMW' 'Baic' 'Bentley' 'Cadillac' 'Chevrolet' 'Daewoo'
 'Daihatsu' 'Dodge' 'Ferrari' 'Fiat' 'Ford' 'Haima' 'Haval' 'Honda'
 'Hyundai' 'Infiniti' 'Isuzu' 'Jaguar' 'Jeep' 'Kia' 'Lamborghini'
 'Land Rover' 'Lexus' 'Lincoln' 'MG' 'Maserati' 'Maybach' 'Mazda'
 'McLaren'

### Binning

In [852]:
final_data.describe()

Unnamed: 0,Price,Public Year,Mileage,Year
count,4337.0,4337.0,4337.0,4337.0
mean,1208.390823,2018.441088,61318.8,5.558912
std,1708.084829,4.159948,265588.2,4.159948
min,19.0,1995.0,0.0,0.0
25%,465.0,2016.0,23000.0,2.0
50%,670.0,2020.0,48000.0,4.0
75%,1225.0,2022.0,77000.0,8.0
max,26999.0,2024.0,10000000.0,29.0


In [853]:
nins = [0, 300, 800, 2000, float('inf')]

final_data['Price-binned'] = pd.qcut(final_data['Price'], q=4, labels=['Low', 'Medium', 'High', 'Luxury'])

In [854]:
final_data['Price-binned'].unique()

['Medium', 'Low', 'Luxury', 'High']
Categories (4, object): ['Low' < 'Medium' < 'High' < 'Luxury']

### Data Normalization (Min - max scaling)

In [855]:
def min_max_scaling(series):
    return (series - series.min()) / (series.max() - series.min())

final_data['Price'] = min_max_scaling(final_data['Price'])
final_data['Mileage'] = min_max_scaling(final_data['Mileage'])
final_data.describe()

Unnamed: 0,Price,Public Year,Mileage,Year
count,4337.0,4337.0,4337.0,4337.0
mean,0.044084,2018.441088,0.006132,5.558912
std,0.063309,4.159948,0.026559,4.159948
min,0.0,1995.0,0.0,0.0
25%,0.016531,2016.0,0.0023,2.0
50%,0.024129,2020.0,0.0048,4.0
75%,0.0447,2022.0,0.0077,8.0
max,1.0,2024.0,1.0,29.0


### Encode(One-hot-encoding)

In [856]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Name          4337 non-null   object  
 1   Price         4337 non-null   float64 
 2   Sale_Date     4337 non-null   object  
 3   Public Year   4337 non-null   int64   
 4   Mileage       4337 non-null   float64 
 5   Origin        4337 non-null   object  
 6   Body_Type     4337 non-null   object  
 7   City          4337 non-null   object  
 8   Year          4337 non-null   int64   
 9   Fuel_Type     4337 non-null   object  
 10  Manufacturer  4337 non-null   object  
 11  Price-binned  4337 non-null   category
dtypes: category(1), float64(2), int64(2), object(7)
memory usage: 377.3+ KB


In [857]:
final_data_encoded = pd.get_dummies(final_data, columns=['Origin', 'Body_Type', 'City', 'Fuel_Type', 'Manufacturer', 'Price-binned'])

In [858]:
final_data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Columns: 109 entries, Name to Price-binned_Luxury
dtypes: bool(103), float64(2), int64(2), object(2)
memory usage: 639.7+ KB


Map boolean to 0 1

In [859]:
bool_columns = final_data_encoded.select_dtypes(include=['bool']).columns
final_data_encoded[bool_columns] = final_data_encoded[bool_columns].astype(int)
final_data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Columns: 109 entries, Name to Price-binned_Luxury
dtypes: float64(2), int64(105), object(2)
memory usage: 3.6+ MB


# Store data

## Create table

In [863]:
conn = pyodbc.connect(connection_string)
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE Final_data (
        Name NVARCHAR(100),
        Price FLOAT,
        Sale_Date NVARCHAR(50),
        Public_Year INT,
        Mileage FLOAT,
        Origin NVARCHAR(100),
        Body_Type NVARCHAR(100),
        City NVARCHAR(100),
        Year INT,
        Fuel_Type NVARCHAR(50),
        Manufacturer NVARCHAR(100),
        Price_binned NVARCHAR(50)
    )
''')

conn.commit()


## Insert

In [864]:
for index, row in final_data.iterrows():
    cursor.execute('''
        INSERT INTO Final_data (Name, Price, Sale_Date, Public_Year, Mileage, Origin, Body_Type, City, Year, Fuel_Type, Manufacturer, Price_binned) 
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', 
    row['Name'], row['Price'], row['Sale_Date'], row['Public Year'], row['Mileage'], 
    row['Origin'], row['Body_Type'], row['City'], row['Year'], row['Fuel_Type'], 
    row['Manufacturer'], row['Price-binned'])

conn.commit()
cursor.close()
conn.close()