## 中古車についてのデータセット

データの取得
https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data<br>
データの説明：https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.names
- メーカー、燃料の種類、吸気、ドア数、ボディ形状等

内容を確認
```
3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.60,168.80,64.10,48.80,2548,dohc,four,130,mpfi,3.47,2.68,9.00,111,5000,21,27,13495
```

- CSVファイル
- ? … 欠損値がある

In [52]:
import pandas as pd

# 欠損値が '?' で入っていることを na_values='?' で指定
# - 指定しなければ '?' を含む列は全て文字列として読み込まれる
autos = pd.read_csv('data/imports-85.data', na_values='?')

# imports-85.data には列ラベルがついていないため imports-85.names から列ラベルを付ける
autos.columns = ['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration',
                    'num_of_doors', 'body_style', 'drive_wheels', 'engine_location',
                    'wheel_base', 'length','width', 'height', 'curb_weight',
                    'engine_type', 'num_of_cylinders', 'engine_size', 'fuel_system',
                    'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm',
                    'city_mpg', 'highway_mpg','price']

# データの確認
# - 欠損値は NaN と表示
autos

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.00,111.0,5000.0,21,27,16500.0
1,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.00,154.0,5000.0,19,26,16500.0
2,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.00,102.0,5500.0,24,30,13950.0
3,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.40,8.00,115.0,5500.0,18,22,17450.0
4,2,,audi,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.40,8.50,110.0,5500.0,19,25,15250.0
5,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.40,8.50,110.0,5500.0,19,25,17710.0
6,1,,audi,gas,std,four,wagon,fwd,front,105.8,192.7,71.4,55.7,2954,ohc,five,136,mpfi,3.19,3.40,8.50,110.0,5500.0,19,25,18920.0
7,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,ohc,five,131,mpfi,3.13,3.40,8.30,140.0,5500.0,17,20,23875.0
8,0,,audi,gas,turbo,two,hatchback,4wd,front,99.5,178.2,67.9,52.0,3053,ohc,five,131,mpfi,3.13,3.40,7.00,160.0,5500.0,16,22,
9,2,192.0,bmw,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,ohc,four,108,mpfi,3.50,2.80,8.80,101.0,5800.0,23,29,16430.0


In [53]:
# 表示する最大列数の設定
pd.set_option('display.max_columns', len(autos.columns))
# 表示する最大行数の設定
# pd.set_option('display.max_rows', len(autos))

In [54]:
# DataFrame の要約を表示
autos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 26 columns):
symboling            204 non-null int64
normalized_losses    164 non-null float64
make                 204 non-null object
fuel_type            204 non-null object
aspiration           204 non-null object
num_of_doors         202 non-null object
body_style           204 non-null object
drive_wheels         204 non-null object
engine_location      204 non-null object
wheel_base           204 non-null float64
length               204 non-null float64
width                204 non-null float64
height               204 non-null float64
curb_weight          204 non-null int64
engine_type          204 non-null object
num_of_cylinders     204 non-null object
engine_size          204 non-null int64
fuel_system          204 non-null object
bore                 200 non-null float64
stroke               200 non-null float64
compression_ratio    204 non-null float64
horsepower           202 non-

In [55]:
# 欠損値 NaN の個数
autos.isna().sum()

symboling             0
normalized_losses    40
make                  0
fuel_type             0
aspiration            0
num_of_doors          2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_of_cylinders      0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [56]:
# 欠損値 NaN を含む行の除去
# - 欠損値を含む行を全て除去
df = autos.dropna()

# 確認
# - 0, 1, 4, 6, 8行目が無くなっている
# - rows が 204 から 159 に減少
df

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
2,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.40,10.00,102.0,5500.0,24,30,13950.0
3,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.40,8.00,115.0,5500.0,18,22,17450.0
5,1,158.0,audi,gas,std,four,sedan,fwd,front,105.8,192.7,71.4,55.7,2844,ohc,five,136,mpfi,3.19,3.40,8.50,110.0,5500.0,19,25,17710.0
7,1,158.0,audi,gas,turbo,four,sedan,fwd,front,105.8,192.7,71.4,55.9,3086,ohc,five,131,mpfi,3.13,3.40,8.30,140.0,5500.0,17,20,23875.0
9,2,192.0,bmw,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,ohc,four,108,mpfi,3.50,2.80,8.80,101.0,5800.0,23,29,16430.0
10,0,192.0,bmw,gas,std,four,sedan,rwd,front,101.2,176.8,64.8,54.3,2395,ohc,four,108,mpfi,3.50,2.80,8.80,101.0,5800.0,23,29,16925.0
11,0,188.0,bmw,gas,std,two,sedan,rwd,front,101.2,176.8,64.8,54.3,2710,ohc,six,164,mpfi,3.31,3.19,9.00,121.0,4250.0,21,28,20970.0
12,0,188.0,bmw,gas,std,four,sedan,rwd,front,101.2,176.8,64.8,54.3,2765,ohc,six,164,mpfi,3.31,3.19,9.00,121.0,4250.0,21,28,21105.0
17,2,121.0,chevrolet,gas,std,two,hatchback,fwd,front,88.4,141.1,60.3,53.2,1488,l,three,61,2bbl,2.91,3.03,9.50,48.0,5100.0,47,53,5151.0
18,1,98.0,chevrolet,gas,std,two,hatchback,fwd,front,94.5,155.9,63.6,52.0,1874,ohc,four,90,2bbl,3.03,3.11,9.60,70.0,5400.0,38,43,6295.0


In [60]:
# 線形重回帰モデルの作成
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

model = LinearRegression()
# df.drop(['symboling', 'normalized_losses', 'fuel_type', 'aspiration',
#                     'num_of_doors', 'body_style', 'drive_wheels', 'engine_location',
#                     'wheel_base', 'length','width', 'height', 'curb_weight',
#                     'engine_type', 'num_of_cylinders', 'engine_size', 'fuel_system',
#                     'bore', 'stroke', 'compression_ratio', 'peak_rpm',
#                     'city_mpg', 'highway_mpg'], axis=1)

df = pd.get_dummies(data=autos, drop_first=True).dropna()
print(df)

X = df.drop(['price'], axis=1)
Y = df[['price']]

model.fit(X, Y)

Y_predict = model.predict(X)

# ２．４気筒（num_of_cylindersが'four'）の車両の中で最も多いメーカー (make) は何か
# temp = df.query("num_of_cylinders=='four'")
# print(temp['make'])
print(temp.groupby('make')['num_of_cylinders'].count())

# ３．最も値段 (price) が高い車両のメーカー (make) は何か
# maxPrice = df['price'].max()
# maxPrice
# df.query("price == 35056.0")['make']



     symboling  normalized_losses  wheel_base  length  width  height  \
2            2              164.0        99.8   176.6   66.2    54.3   
3            2              164.0        99.4   176.6   66.4    54.3   
5            1              158.0       105.8   192.7   71.4    55.7   
7            1              158.0       105.8   192.7   71.4    55.9   
9            2              192.0       101.2   176.8   64.8    54.3   
10           0              192.0       101.2   176.8   64.8    54.3   
11           0              188.0       101.2   176.8   64.8    54.3   
12           0              188.0       101.2   176.8   64.8    54.3   
17           2              121.0        88.4   141.1   60.3    53.2   
18           1               98.0        94.5   155.9   63.6    52.0   
19           0               81.0        94.5   158.8   63.6    52.0   
20           1              118.0        93.7   157.3   63.8    50.8   
21           1              118.0        93.7   157.3   63.8    

In [61]:
# 平均二乗誤差、決定係数の出力
r2_score(Y, Y_predict)


0.9725414259178758

In [None]:
# 回帰係数の出力