In [78]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [79]:
# Load the car-evaluation data set and preview its first five rows.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
CAR_CSV_PATH = "D:/input/car.csv"
car = pd.read_csv(CAR_CSV_PATH)
car.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [80]:
# Dimensions of the data set as (rows, columns).
n_rows, n_cols = car.shape
(n_rows, n_cols)

(1728, 7)

In [81]:
# Number of rows per acceptance class.
acceptance_counts = car["acceptance"].value_counts()
print(acceptance_counts)

unacc    1210
acc       384
good       69
vgood      65
Name: acceptance, dtype: int64


In [82]:
# Share of each acceptance class among all rows.
# Series "/" is true division, so the integer counts need no explicit float cast.
acceptance_share = car["acceptance"].value_counts() / len(car)
print(acceptance_share)

unacc    0.700231
acc      0.222222
good     0.039931
vgood    0.037616
Name: acceptance, dtype: float64


In [83]:
# Work on a copy so the raw frame `car` stays untouched.
encode_df = car.copy()
# Integer codes for the trunk size: small -> 0, med -> 1, big -> 2.
lug_dict = dict(small=0, med=1, big=2)
lug_boot_codes = encode_df["lug_boot"].map(lug_dict)
encode_df["lug_boot"] = lug_boot_codes

In [84]:
# Preview the first five values of the encoded column.
encode_df["lug_boot"].head()

0    0
1    0
2    0
3    1
4    1
Name: lug_boot, dtype: int64

In [85]:
# Integer encoding of the "safety" feature: low -> 0, med -> 1, high -> 2.
safe_dict = {"low": 0, "med": 1, "high": 2}
# Map from the raw frame rather than from encode_df itself: re-running this
# cell would otherwise push already-encoded integers through the string-keyed
# dict and fill the whole column with NaN.
encode_df["safety"] = car["safety"].map(safe_dict)

In [86]:
# Integer encoding of the "buying" feature: vhigh -> 0, high -> 1, med -> 2, low -> 3.
buy_dict = {"vhigh": 0, "high": 1, "med": 2, "low": 3}
# Map from the raw frame rather than from encode_df itself: re-running this
# cell would otherwise push already-encoded integers through the string-keyed
# dict and fill the whole column with NaN.
encode_df["buying"] = car["buying"].map(buy_dict)

In [87]:
# Integer encoding of the "maint" feature: vhigh -> 0, high -> 1, med -> 2, low -> 3.
maint_dict = {"vhigh": 0, "high": 1, "med": 2, "low": 3}
# Map from the raw frame rather than from encode_df itself: re-running this
# cell would otherwise push already-encoded integers through the string-keyed
# dict and fill the whole column with NaN.
encode_df["maint"] = car["maint"].map(maint_dict)

In [88]:
# Preview the first five rows of the encoded data.
encode_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,0,0,2,2,0,0,unacc
1,0,0,2,2,0,1,unacc
2,0,0,2,2,0,2,unacc
3,0,0,2,2,1,0,unacc
4,0,0,2,2,1,1,unacc


In [89]:
# In the door-count column, replace the open-ended category "5more" with "6".
# The result is assigned back instead of calling replace(..., inplace=True) on
# the attribute-accessed column: that form acts on an intermediate Series and,
# with pandas Copy-on-Write enabled, leaves encode_df unchanged.
encode_df["doors"] = encode_df["doors"].replace("5more", "6")
# In the passenger-capacity column, replace the open-ended category "more" with "6".
encode_df["persons"] = encode_df["persons"].replace("more", "6")

In [90]:
# Distribution of the door-count values after the replacement.
doors_counts = encode_df["doors"].value_counts()
print(doors_counts)

2    432
3    432
4    432
6    432
Name: doors, dtype: int64


In [91]:
# Distribution of the passenger-capacity values after the replacement.
persons_counts = encode_df["persons"].value_counts()
print(persons_counts)

2    576
4    576
6    576
Name: persons, dtype: int64


In [92]:
# Inspect the dtypes of the two count columns before casting them.
print(encode_df["doors"].dtype, encode_df["persons"].dtype)

object object


In [93]:
# Cast both count columns to integers, then confirm the resulting dtypes.
for count_col in ("doors", "persons"):
    encode_df[count_col] = encode_df[count_col].astype("int")
print(encode_df["doors"].dtype, encode_df["persons"].dtype)

int32 int32


In [94]:
# Preview the fully integer-encoded feature columns.
encode_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptance
0,0,0,2,2,0,0,unacc
1,0,0,2,2,0,1,unacc
2,0,0,2,2,0,2,unacc
3,0,0,2,2,1,0,unacc
4,0,0,2,2,1,1,unacc


In [95]:
# Create the encoder.
buying_encoder = LabelEncoder()
# Fit on the raw "buying" column and encode it.
# This encodes in one step, but the codes follow the sorted label order
# (high, low, med, vhigh), so the natural ordering of the levels is lost.
encoded_buying = buying_encoder.fit(car["buying"]).transform(car["buying"])

In [96]:
# Translate the integer codes 0-3 back into their original labels.
buying_encoder.inverse_transform(list(range(4)))

array(['high', 'low', 'med', 'vhigh'], dtype=object)

In [97]:
# One-hot encode the "buying" column and preview up to 50 rows.
# The prefix already ends in "_" and get_dummies inserts its own "_" separator,
# so the columns are named like "buying__high".
buying_dummies = pd.get_dummies(car["buying"], prefix="buying_")
buying_dummies.head(50)

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
5,0,0,0,1
6,0,0,0,1
7,0,0,0,1
8,0,0,0,1
9,0,0,0,1


In [98]:
# Start from an empty frame; the one-hot encoded data set is assembled in the following cells.
car_onehot_encoded = pd.DataFrame()

In [99]:
# One-hot encode every feature column and place the results side by side.
# The encoded pieces are collected first and concatenated once. Growing the
# frame inside the loop re-copies all earlier columns on every pass and makes
# the cell depend on the previous contents of car_onehot_encoded, so running
# it twice would append every dummy column a second time.
encoded_parts = []
for col in ["buying", "maint", "doors", "persons", "lug_boot", "safety"]:
    # The prefix ends in "_" and get_dummies adds its own "_" separator, so
    # the resulting columns are named like "buying__high".
    col_encoded = pd.get_dummies(car[col], prefix=col + "_")
    encoded_parts.append(col_encoded)
car_onehot_encoded = pd.concat(encoded_parts, axis=1)

In [100]:
# Append the target column to the right of the one-hot encoded features.
acceptance_labels = car["acceptance"]
car_onehot_encoded = pd.concat(
    objs=[car_onehot_encoded, acceptance_labels],
    axis="columns",
)

In [101]:
# Preview the first five rows of the one-hot encoded data set.
car_onehot_encoded.head()

Unnamed: 0,buying__high,buying__low,buying__med,buying__vhigh,maint__high,maint__low,maint__med,maint__vhigh,doors__2,doors__3,...,persons__2,persons__4,persons__more,lug_boot__big,lug_boot__med,lug_boot__small,safety__high,safety__low,safety__med,acceptance
0,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,1,0,unacc
1,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,0,0,1,unacc
2,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,1,1,0,0,unacc
3,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,unacc
4,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,1,unacc


In [102]:
# Split the integer-encoded data set.
# Features are selected by dropping the label column rather than by a
# hard-coded positional slice, so the selection does not depend on the number
# of feature columns.
x_num = encode_df.drop(columns="acceptance")
y_num = encode_df["acceptance"]
# Use 70% of the rows for training and the remaining 30% for testing.
# Sampling is stratified on "acceptance" so both parts keep the same class
# distribution; the fixed random_state makes the split reproducible.
x_train_num, x_test_num, y_train_num, y_test_num = train_test_split(
    x_num,
    y_num,
    test_size=0.3,
    stratify=y_num,
    random_state=42,
)

In [103]:
# Share of each class among the training labels.
train_share_num = y_train_num.value_counts() / len(y_train_num)
print(train_share_num)
# Share of each class among the test labels.
test_share_num = y_test_num.value_counts() / len(y_test_num)
print(test_share_num)

unacc    0.700579
acc      0.222498
good     0.039702
vgood    0.037221
Name: acceptance, dtype: float64
unacc    0.699422
acc      0.221580
good     0.040462
vgood    0.038536
Name: acceptance, dtype: float64


In [104]:
# Split the one-hot encoded data set.
# Features are selected by dropping the label column rather than by a
# hard-coded column count, which would silently mis-slice if any feature
# gained or lost a category.
x_onehot = car_onehot_encoded.drop(columns="acceptance")
y_onehot = car_onehot_encoded["acceptance"]
# Same 70/30 stratified split and seed as used for the integer-encoded data.
x_train_onehot, x_test_onehot, y_train_onehot, y_test_onehot = train_test_split(
    x_onehot,
    y_onehot,
    test_size=0.3,
    stratify=y_onehot,
    random_state=42,
)

In [105]:
# Share of each class among the training labels.
train_share_onehot = y_train_onehot.value_counts() / len(y_train_onehot)
print(train_share_onehot)
# Share of each class among the test labels.
test_share_onehot = y_test_onehot.value_counts() / len(y_test_onehot)
print(test_share_onehot)

unacc    0.700579
acc      0.222498
good     0.039702
vgood    0.037221
Name: acceptance, dtype: float64
unacc    0.699422
acc      0.221580
good     0.040462
vgood    0.038536
Name: acceptance, dtype: float64


In [106]:
# Create one logistic-regression model per encoding, both with the same iteration cap.
LOGREG_PARAMS = {"max_iter": 5000}
num_model = LogisticRegression(**LOGREG_PARAMS)
onehot_model = LogisticRegression(**LOGREG_PARAMS)

In [107]:
# Train each model on its own encoding of the training data:
# integer-encoded features first, then one-hot encoded features.
num_model.fit(X=x_train_num, y=y_train_num)
onehot_model.fit(X=x_train_onehot, y=y_train_onehot)

In [108]:
# Test-set predictions of the model trained on integer-encoded features.
y_pred_num = num_model.predict(X=x_test_num)
# Test-set predictions of the model trained on one-hot encoded features.
y_pred_onehot = onehot_model.predict(X=x_test_onehot)

In [109]:
# Per-class precision / recall / F1 of the model trained on integer-encoded features.
num_report = classification_report(y_true=y_test_num, y_pred=y_pred_num)
print(num_report)

              precision    recall  f1-score   support

         acc       0.65      0.50      0.57       115
        good       0.67      0.57      0.62        21
       unacc       0.87      0.94      0.90       363
       vgood       0.76      0.65      0.70        20

    accuracy                           0.82       519
   macro avg       0.74      0.67      0.70       519
weighted avg       0.81      0.82      0.81       519



In [110]:
# Per-class precision / recall / F1 of the model trained on one-hot encoded features.
# The one-hot model appears to perform better than the integer-encoded one.
onehot_report = classification_report(y_true=y_test_onehot, y_pred=y_pred_onehot)
print(onehot_report)

              precision    recall  f1-score   support

         acc       0.79      0.84      0.82       115
        good       0.69      0.43      0.53        21
       unacc       0.96      0.97      0.97       363
       vgood       0.95      0.90      0.92        20

    accuracy                           0.92       519
   macro avg       0.85      0.78      0.81       519
weighted avg       0.91      0.92      0.91       519



In [111]:
# Save the one-hot encoded data set (features plus label) as CSV.
# NOTE(review): absolute, machine-specific path; to_csv also writes the row
# index as the first column by default -- confirm that is intended.
ONEHOT_CSV_PATH = "D:/output/one-hot.csv"
car_onehot_encoded.to_csv(ONEHOT_CSV_PATH)