### 데이터의 정보를 확인 - 통계메서드
- 데이터의 컬럼별 분포 및 특징을 파악하기 위해서 사용
- 데이터의 기본적인 분석에 활용

In [1]:
import pandas as pd

In [2]:
# Relative path to the auto-mpg CSV file that is loaded into a DataFrame below.
file_path = '../data/auto_mpg.csv'

In [3]:
# Load the CSV into a DataFrame using read_csv's default parsing options.
mpg_df = pd.read_csv(file_path)

In [4]:
# Show the DataFrame; as the cell's last expression it is rendered as the cell output.
mpg_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [5]:
# Inspect the data
# - Basic information: index range, column names, non-null counts, dtypes and memory usage
mpg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [7]:
# - Compare the columns with their actual values and types:
#   print the first two and the last two rows, separated by a blank line.
print(mpg_df.head(2), mpg_df.tail(2), sep='\n\n')

    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   

   origin                   car name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  

      mpg  cylinders  displacement horsepower  weight  acceleration  \
396  28.0          4         120.0         79    2625          18.6   
397  31.0          4         119.0         82    2720          19.4   

     model year  origin     car name  
396          82       1  ford ranger  
397          82       1   chevy s-10  


In [9]:
# Statistics methods for the data
# - Column-wise mean; numeric_only=True restricts the result to numeric columns,
#   so non-numeric (object) columns are left out.
mpg_df.mean(numeric_only=True)

mpg               23.514573
cylinders          5.454774
displacement     193.425879
weight          2970.424623
acceleration      15.568090
model year        76.010050
origin             1.572864
dtype: float64

In [11]:
# Column-wise sum, again limited to the numeric columns.
mpg_df.sum(numeric_only=True)

mpg                9358.8
cylinders          2171.0
displacement      76983.5
weight          1182229.0
acceleration       6196.1
model year        30252.0
origin              626.0
dtype: float64

In [13]:
# - Median and mode of the numeric columns, printed one after the other.
#   mode() returns a DataFrame: a column with several modes gets one row per mode,
#   and the columns with fewer modes are padded with NaN.
print(mpg_df.median(numeric_only=True), mpg_df.mode(numeric_only=True), sep='\n\n')

mpg               23.0
cylinders          4.0
displacement     148.5
weight          2803.5
acceleration      15.5
model year        76.0
origin             1.0
dtype: float64

    mpg  cylinders  displacement  weight  acceleration  model year  origin
0  13.0        4.0          97.0    1985          14.5        73.0     1.0
1   NaN        NaN           NaN    2130           NaN         NaN     NaN


In [14]:
# Mode of a single column: a Series holding every value tied for most frequent.
mpg_df['weight'].mode()

0    1985
1    2130
Name: weight, dtype: int64

In [15]:
# Mode of the mpg column, returned as a Series.
mpg_df['mpg'].mode()

0    13.0
Name: mpg, dtype: float64

In [16]:
# - Unique values: the distinct values that occur in a column
mpg_df['origin'].unique()

array([1, 3, 2])

In [17]:
# Distinct values of the horsepower column.
# NOTE(review): the recorded output contains strings and a '?' entry, presumably a
# missing-value marker that keeps this column as object dtype — confirm before converting.
mpg_df['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [19]:
# - Correlation matrix: a number expressing how strongly each pair of numeric columns
#   is related (corr() computes the Pearson coefficient by default).
mpg_df.corr(numeric_only=True)

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,model year,origin
mpg,1.0,-0.775396,-0.804203,-0.831741,0.420289,0.579267,0.56345
cylinders,-0.775396,1.0,0.950721,0.896017,-0.505419,-0.348746,-0.562543
displacement,-0.804203,0.950721,1.0,0.932824,-0.543684,-0.370164,-0.609409
weight,-0.831741,0.896017,0.932824,1.0,-0.417457,-0.306564,-0.581024
acceleration,0.420289,-0.505419,-0.543684,-0.417457,1.0,0.288137,0.205873
model year,0.579267,-0.348746,-0.370164,-0.306564,0.288137,1.0,0.180662
origin,0.56345,-0.562543,-0.609409,-0.581024,0.205873,0.180662,1.0


In [21]:
# Column-wise minimum of the correlation matrix, i.e. the smallest coefficient in each column.
mpg_df.corr(numeric_only=True).min()    # A heatmap would have made this easier to read.

mpg            -0.831741
cylinders      -0.775396
displacement   -0.804203
weight         -0.831741
acceleration   -0.543684
model year     -0.370164
origin         -0.609409
dtype: float64