### 탐색적 데이터 분석 (EDA)

In [1]:
import os
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Set the matplotlib font family; presumably chosen so Korean (Hangul) labels
# render correctly. NOTE(review): 'Malgun Gothic' ships with Windows — confirm
# it is installed on other platforms where this notebook runs.
plt.rcParams['font.family'] = 'Malgun Gothic'

# Read database settings from the .env file one directory above the notebook
# so credentials never appear in the notebook itself.
dotenv_path = '../.env'
load_dotenv(dotenv_path=dotenv_path)

db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_name = os.getenv('DB_NAME')

# Fail fast with a clear message: os.getenv returns None for unset keys, which
# would otherwise be interpolated into the URL as the literal text "None".
required_settings = {
    'DB_HOST': db_host,
    'DB_PORT': db_port,
    'DB_USER': db_user,
    'DB_PASSWORD': db_password,
    'DB_NAME': db_name,
}
missing_settings = [key for key, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError(
        f"Missing database settings in {dotenv_path}: {', '.join(missing_settings)}"
    )

# Percent-encode the user and password so characters that are special in a URL
# (e.g. '@', ':', '/', '#', '%') cannot corrupt the connection string.
db_connection_str = (
    f"postgresql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{db_host}:{db_port}/{db_name}"
)
db_engine = create_engine(db_connection_str)

sql_query = "SELECT * FROM public.vehicles;"

# Load the whole vehicles table into memory and preview the first rows.
df = pd.read_sql(sql_query, db_engine)
df.head()


Unnamed: 0,vehicleid,carseq,vehicleno,platform,origin,cartype,manufacturer,model,generation,trim,...,colorname,modelyear,firstregistrationdate,distance,price,originprice,selltype,location,detailurl,photo
0,[1],[40280246],359마6078,encar,국산,경차,기아,모닝 어반 (JA),스탠다드,(세부등급 없음),...,흰색,[2021],[20210617],[28088],[990],[1195],일반,경기,https://fem.encar.com/cars/detail/40275440?pag...,/carpicture07/pic4027/40275440_
1,[2],[40539834],196주2722,encar,국산,경차,기아,더 뉴 기아 레이,시그니처,,...,흰색,[2023],[20221123],[23600],[1520],[1720],일반,경기,https://fem.encar.com/cars/detail/40539517?pag...,/carpicture03/pic4053/40539517_
2,[3],[40536828],30보7029,encar,국산,경차,쉐보레(GM대우),스파크,LT,기본형,...,분홍색,[2012],[20110602],[156693],[280],[1094],일반,인천,https://fem.encar.com/cars/detail/40533790?pag...,/carpicture03/pic4053/40533790_
3,[4],[40523706],65오0942,encar,국산,경차,쉐보레(GM대우),더 넥스트 스파크,LTZ,,...,흰색,[2016],[20160309],[39354],[699],[1308],일반,인천,https://fem.encar.com/cars/detail/40522192?pag...,/carpicture02/pic4052/40522192_
4,[5],[40524639],177거9036,encar,국산,경차,기아,올 뉴 모닝 (JA),디럭스,,...,흰색,[2017],[20170725],[104173],[630],[990],일반,대전,https://fem.encar.com/cars/detail/40510685?pag...,/carpicture01/pic4051/40510685_


In [2]:
# Print a summary of the loaded frame: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85308 entries, 0 to 85307
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   vehicleid              85308 non-null  object
 1   carseq                 85308 non-null  object
 2   vehicleno              85308 non-null  object
 3   platform               85308 non-null  object
 4   origin                 85308 non-null  object
 5   cartype                85308 non-null  object
 6   manufacturer           85308 non-null  object
 7   model                  85308 non-null  object
 8   generation             85308 non-null  object
 9   trim                   35292 non-null  object
 10  fueltype               85307 non-null  object
 11  transmission           85306 non-null  object
 12  colorname              85308 non-null  object
 13  modelyear              85307 non-null  object
 14  firstregistrationdate  77551 non-null  object
 15  distance           

In [3]:
# Report how many rows and columns were loaded.
print(f"데이터 크기: {df.shape[0]}행 / {df.shape[1]}열")

# Count the missing (null) entries in each column and print them under a header.
missing_per_column = df.isna().sum()
print("\n[결측치(빈 값) 개수]")
print(missing_per_column)

데이터 크기: 85308행 / 22열

[결측치(빈 값) 개수]
vehicleid                    0
carseq                       0
vehicleno                    0
platform                     0
origin                       0
cartype                      0
manufacturer                 0
model                        0
generation                   0
trim                     50016
fueltype                     1
transmission                 2
colorname                    0
modelyear                    1
firstregistrationdate     7757
distance                     0
price                        0
originprice               4661
selltype                     0
location                     6
detailurl                    0
photo                       68
dtype: int64


In [None]:
# Author's observation from the top-5 most expensive listings: rent/lease
# vehicles use a different price notation, and prices such as 99999 belong to
# vehicles that are under contract — both cases need preprocessing.
most_expensive_cars = (
    df.sort_values(by='price', ascending=False)
    .iloc[:5]
)

display(most_expensive_cars)

Unnamed: 0,vehicleid,carseq,vehicleno,platform,origin,cartype,manufacturer,model,generation,trim,...,colorname,modelyear,firstregistrationdate,distance,price,originprice,selltype,location,detailurl,photo
66017,[221344],[40436732],225하1379,encar,국산,RV,기아,더 뉴 카니발 4세대,가솔린 9인승 노블레스,,...,흰색,[2025],,[11700],[3603392],[3991],렌트,인천,https://fem.encar.com/cars/detail/40436732?pag...,/carpicture03/pic4043/40436732_
6618,[9393],[40390691],221하3272,encar,국산,경차,기아,더 뉴 기아 레이,시그니처,,...,쥐색,[2023],,[15000],[1592162],[1720],렌트,인천,https://fem.encar.com/cars/detail/40390691?pag...,/carpicture09/pic4039/40390691_
74954,[239598],[40056887],361나5397,encar,수입,스포츠카,애스턴마틴,DBS,5.2 GT 자가토,,...,주황색,[2020],[20201208],[854],[250000],,일반,서울,https://fem.encar.com/cars/detail/40056887?pag...,/carpicture05/pic4005/40056887_
75100,[239802],[39089797],131주2636,encar,수입,스포츠카,맥라렌,세나,4.0,,...,검정색,[2019],[20190117],[300],[200000],,일반,서울,https://fem.encar.com/cars/detail/39089797?pag...,/carpicture08/pic3908/39089797_
77613,[243024],[40386952],15서2500,encar,수입,스포츠카,포르쉐,911,카레라 4S 카브리올레,997.0,...,검정색,[2010],[20100519],[14700],[99999],[19340],일반,서울,https://fem.encar.com/cars/detail/40386952?pag...,/carpicture08/pic4038/40386952_
