In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re

# Data Cleaning

In [61]:
# raw_data是从原表读到的原始数据
raw_data = pd.read_excel(os.path.join(os.getcwd(),'../data/rawdata/car_all_shenzhen.xls'))

In [62]:
raw_data = raw_data.rename(columns={
    '标题':'title',
    '价格':'price',
    '新车价格':'price_new',
    '首次上牌':'date_regi',
    '表显里程':'mileage', 
    '官方续航':'official_endurance',
    '排放标准':'standard',
    '变速箱':'gearbox',
    '过户次数':'num_trans',
    '车牌地':'license_location',
    '车身颜色':'color',
    '电动机总功率':'motor_power',
    '电池容量':'battery_capacity',
    '电池类型':'battery_type',
    '能源类型':'energy_type',
    '排量':'displacement',
    '钥匙数':'keys'
    })

In [63]:
print(raw_data.shape) # (39216, 17)
# 根据title去掉空行
not_null_raw_data = raw_data[~(raw_data['title'].isnull())]
print(not_null_raw_data.shape) # (39159, 17)
print('空行数量：',39216-39159)
print('查看每个列空值数量：')
print(not_null_raw_data.isnull().sum())

(39216, 17)
(39159, 17)
空行数量： 57
查看每个列空值数量：
title                     0
price                     0
price_new                 0
date_regi                 0
mileage                   0
official_endurance    34083
standard                  0
gearbox                   0
num_trans                 0
license_location          0
color                     0
motor_power           34083
battery_capacity      34083
battery_type          34083
energy_type               1
displacement           5076
keys                      1
dtype: int64


In [64]:
# 全部数据清洗
clean_data = not_null_raw_data.copy()
# title处理
clean_data['brand'] = clean_data['title'].str.split(' ',expand=True)[0]

In [65]:
# def is_all_eng(strs):
#     import string
#     for i in strs:
#         if i not in string.ascii_lowercase+string.ascii_uppercase:
#             return False
#     return True
#
# def brand(strs):
#     count = 0
#     if is_all_eng(strs):
#         return(strs)
#     else:
#         for i in strs:
#             if is_all_eng(i) or i.isdigit():
#                 return(strs[:count])
#             else:
#                 count += 1
#         return(strs)
#
# clean_data['brand']=clean_data['title'].apply(lambda x:brand(x))

In [66]:
clean_data['brand']

0        荣威e950
1           WEY
2           雪佛兰
3        本田XR-V
4            现代
          ...  
39211        路虎
39212     哈弗H6S
39213    现代ix35
39214        三菱
39215    小鹏汽车G3
Name: brand, Length: 39159, dtype: object

In [67]:
#去掉-
clean_data['brand']=clean_data['brand'].map(lambda x: re.sub(r'[-·]','',x))

In [68]:
#去掉字母数字
def find_brand(x):
    """brand"""
    if re.search(r'[\u4e00-\u9fff]+', x):
        res = re.sub(r'( )*[0-9A-Za-z]+$','',x)
    else:
        res = x
    return res
clean_data['brand'] = clean_data['brand'].apply(find_brand)
clean_data['brand']

0          荣威
1         WEY
2         雪佛兰
3          本田
4          现代
         ... 
39211      路虎
39212      哈弗
39213      现代
39214      三菱
39215    小鹏汽车
Name: brand, Length: 39159, dtype: object

In [69]:
clean_data['brand'].isnull().sum()

0

In [70]:
clean_data['brand'].unique()

array(['荣威', 'WEY', '雪佛兰', '本田', '现代', '北汽威旺', '起亚', '奇瑞', '吉利汽车', '马自达',
       '标致', '三菱', '雷克萨斯', '奔腾', '长安', '日产', '众泰', '别克', '奥迪', '英菲尼迪',
       '特斯拉', '丰田', '奔驰GLA级', '宝马', '哪吒汽车', '领克', '理想汽车', '大众', '哈弗',
       '保时捷', '欧拉好猫', '五菱汽车', '陆风', '宝马3系', '力帆汽车', '几何汽车', 'MINI',
       '广汽埃安', '广汽传祺', '沃尔沃', '奔驰E级', '比亚迪', '宝骏', '宝马5系', '凯迪拉克', '铃木',
       '斯柯达', '福特', '长安欧尚', '捷豹', '北汽新能源', '东南', '路虎', '小鹏汽车', '名爵',
       '奔驰B级', '红旗', '领克01新能源', '零跑汽车', '奔驰GLC级', '宝马1系', '威马汽车', '欧拉黑猫',
       '飞凡汽车', '捷达', '上汽', '奔驰C级', '阿尔法罗密欧', '东风风光', '奔驰A级', '林肯大陆',
       '岚图汽车', '新宝骏', '奔驰', '启辰', 'Jeep', '北汽幻速', '猎豹汽车', '江淮', '广汽新能源',
       '宝马X1新能源', '奔驰CLA级', '道奇', '领克02新能源', 'ARCFOX极狐', '野马汽车', 'smart',
       '纳智捷', '雷克萨斯UX新能源', '奇瑞蚂蚁', '菲亚特', '宝马4系', '宝马7系', '蔚来', '雪铁龙',
       '长安新能源', '东风风神', '东风风行', '奔驰GLE级', '欧拉', '雷诺', '北京汽车', '沃尔沃S60新能源',
       '奔驰S级', 'DS', '讴歌', '星途追风', '北京', '宝沃', '玛莎拉蒂', '宝马5系新能源', '开瑞',
       '奔腾X40新能源', '奔驰CLS级', '哈弗神兽', '广汽集团', '观致', '林肯', '凯迪拉克CT6新能

In [71]:
#去掉-
clean_data['brand']=clean_data['brand'].map(lambda x: re.sub(r'[-·]','',x))

# 手动去掉北京品牌中的多余字
clean_data['brand']=clean_data['brand'].map(
     lambda x: re.sub(r'[级系]+','', x)).map(
     lambda x: re.sub(r'(风光)|(风行)|(风神)|(小康)|(EV)|(新能源)|(黑猫)|(传祺)|(欧尚)|(罗密欧)|(2多功能旅行车)|(2旅行车)|(ID.)|(e:)|(G列)|(大狗)|(初恋)|(EZS纯电动)|(跨越)|(白猫)|(蚂蚁)|(好猫)|(精灵#)|(轻型车)|(原力版)|(神兽)|(赤兔)|(CRV)|(UX)|(T300)|(Panamera)|(S90)|(UNIK)|(DX3)|(HS)|(XC60)|(A6L)|(X40)|(C30)','', x)).map(
     lambda x: re.sub(r'(启辰大|启辰星)','启辰', x)).map(
     lambda x: re.sub(r'(BEIJING汽车)','北京汽车', x)).map(
     lambda x: re.sub(r'(奔驰E)|(奔驰B)|(奔驰S)|(奔驰GLA)|(奔驰A)|(奔驰V)|(奔驰GLS)|(奔驰SLK)|(奔驰CLA)|(奔驰GLE)|(奔驰GLC)|(奔驰GLK)|(奔驰CLS)|(奔驰LC)|(奔驰R)|(奔驰C)','奔驰', x)).map(
      lambda x: re.sub(r'(领克01)|(领克06)|(领克05)|(领克09)','领克', x)).map(
      lambda x: re.sub(r'(宝马5)|(宝马2)|(宝马1)|(宝马X1新能源)|(宝马4)|(宝马X5)|(宝马7)|(宝马6)|(宝马3)|(宝马X1)','宝马', x)).map(
      lambda x: re.sub(r'(日产)','东风日产', x))


In [72]:
#去掉-
clean_data['brand']=clean_data['brand'].map(lambda x: re.sub(r'[-·]','',x))

# 手动去掉深圳品牌中的多余字
clean_data['brand']=clean_data['brand'].map(
     lambda x: re.sub(r'[级系]+','', x)).map(
     lambda x: re.sub(r'(风光)|(风行)|(风神)|(小康)|(EV)|(新能源)|(黑猫)|(传祺)|(欧尚)|(罗密欧)|(2多功能旅行车)|(2旅行车)|(ID.)|(e:)|(G列)|(大狗)|(初恋)|(EZS纯电动)|(跨越)|(白猫)|(蚂蚁)|(好猫)|(精灵#)|(轻型车)|(原力版)|(神兽)|(赤兔)|(埃安)|(埃安)|(酷狗)|(风度)|(S5青春版)|(昌河)|(制造)|(ATSL2017款)|(绅宝)|(Macan2018款)|(3星骋)|(ATSL2017款)|(RX经典)|(2018款)|(A5翼舞)|(电马)|(集团)','', x)).map(
     lambda x: re.sub(r'(启辰大|启辰星)','启辰', x)).map(
     lambda x: re.sub(r'(BEIJING汽车)','北京汽车', x)).map(
     lambda x: re.sub(r'(名爵6)','名爵', x))


In [73]:

# 手动去掉shanghai品牌中的多余字
clean_data['brand']=clean_data['brand'].map(
     lambda x: re.sub(r'[级系]+','', x)).map(
     lambda x: re.sub(r'(风光)|(风行)|(风神)|(小康)|(新能源)|(黑猫)|(传祺)|(欧尚)|(罗密欧)|(2多功能旅行车)|(2旅行车)|(ID.)|(e:)|(G列)|(大狗)|(初恋)|(EZS纯电动)|(跨越)|(白猫)|(蚂蚁)|(好猫)|(精灵#)|(轻型车)|(原力版)|(神兽)|(赤兔)|(埃安)|(埃安)|(酷狗)|(风度)|(S5青春版)|(昌河)|(制造)|(福克斯2018款)|(CT6)|(S60)|(CRV)|(UNIK)|(A6L)|(UX)|(3星骋)|(T300)|(up!)|(S0)|(CT)','', x)).map(
     lambda x: re.sub(r'(启辰大|启辰星)','启辰', x)).map(
     lambda x: re.sub(r'(BEIJING汽车)','北京汽车', x)).map(
     lambda x: re.sub(r'(宝马5)|(宝马1)|(宝马X1新能源)|(宝马4)|(宝马X5)|(宝马7)|(宝马6)|(宝马3)|(宝马X1)','宝马', x)).map(
     lambda x: re.sub(r'(奔驰E)|(奔驰B)|(奔驰S)|(奔驰GLA)|(奔驰A)|(奔驰V)|(奔驰GLS)|(奔驰SLK)|(奔驰CLA)|(奔驰GLE)|(奔驰GLC)|(奔驰GLK)|(奔驰CLS)|(奔驰LC)|(奔驰R)|(奔驰C)|(奔驰M)|(奔驰LK)','奔驰', x)).map(
     lambda x: re.sub(r'(领克01)|(领克06)|(领克05)|(领克03)|(领克0)','领克', x)).map(
     lambda x: re.sub(r'(沃尔沃L)','沃尔沃', x))



In [74]:
clean_data['brand'].unique()

array(['荣威', 'WEY', '雪佛兰', '本田', '现代', '北汽威旺', '起亚', '奇瑞', '吉利汽车', '马自达',
       '标致', '三菱', '雷克萨斯', '奔腾', '长安', '东风日产', '众泰', '别克', '奥迪', '英菲尼迪',
       '特斯拉', '丰田', '奔驰', '宝马', '哪吒汽车', '领克', '理想汽车', '大众', '哈弗', '保时捷',
       '欧拉', '五菱汽车', '陆风', '力帆汽车', '几何汽车', 'MINI', '广汽', '沃尔沃', '比亚迪',
       '宝骏', '凯迪拉克', '铃木', '斯柯达', '福特', '捷豹', '北汽', '东南', '路虎', '小鹏汽车',
       '名爵', '红旗', '零跑汽车', '威马汽车', '飞凡汽车', '捷达', '上汽', '阿尔法', '东风',
       '林肯大陆', '岚图汽车', '新宝骏', '启辰', 'Jeep', '北汽幻速', '猎豹汽车', '江淮', '道奇',
       '领克2', 'ARCFOX极狐', '野马汽车', 'smart', '纳智捷', '菲亚特', '蔚来', '雪铁龙',
       '雷诺', '北京汽车', 'DS', '讴歌', '星途追风', '北京', '宝沃', '玛莎拉蒂', '开瑞', '观致',
       '林肯', '捷途', '坦克', 'SERES赛力斯', '斯巴鲁', '国机智骏', '汉腾汽车', '海马',
       'Polestar', '全球鹰', '五十铃', '上汽大通', 'SWM斯威汽车', '潍柴英致', '新特汽车', '驭胜',
       '腾势', '2019款', '星途凌云', '华颂', '凯翼', '合众汽车', '一汽', '合创', '思皓', '电咖',
       '极氪', '天际汽车', '大运', '中华', '华晨新日', '星途', 'AITO', '魏牌', '凌宝汽车',
       '创维汽车', '君马汽车', '潍柴汽车', '星途揽月', '知豆', 'R汽车', '路特斯', '高合汽车', '思铭'

In [75]:
clean_data['brand'].isnull().sum()

0

In [76]:
# 去掉单位
clean_data['mileage'] = clean_data['mileage'].str.replace('万公里','').apply(lambda x :float(x.replace('公里',''))/10000 if '公里' in str(x) else x).astype(np.float64)
clean_data['official_endurance'] = clean_data['official_endurance'].str.replace('km','').replace('-',np.nan).astype(np.float64)
clean_data['num_trans'] = clean_data['num_trans'].str.replace('次','').replace('-',np.nan).astype(np.float64)
clean_data['battery_capacity'] = clean_data['battery_capacity'].str.replace('kWh','').replace('-',np.nan).astype(np.float64)
clean_data['motor_power'] = clean_data['motor_power'].str.replace('kw','').replace('-',np.nan).astype(np.float64)
clean_data['keys'] = clean_data['keys'].str.replace('把','').astype(np.float64)

# location 一线城市（北上广深）为1，其他城市为0
clean_data['license_location'] = clean_data['license_location'].str.split('[(（]',expand=True)[0]
clean_data['license_location'] = clean_data['license_location'].apply(lambda x : 1 if x in ['上海','北京','广州','深圳'] else 0).astype(np.float64)

# color 黑色、白色、深灰色、银灰色为0，其他颜色为1
clean_data['color'] = clean_data['color'].apply(lambda x : 0 if x in ['白色','黑色','深灰色','银灰色'] else 1).astype(np.float64)

# displace：T*1.4变成L，方便分析，排放越大性能越好
clean_data['displacement'] = clean_data['displacement'].astype(str).apply(lambda x : float(x.replace('T',''))*1.4 if 'T' in x else float(x.replace('L',''))).astype(np.float64)


In [77]:
energy_type = list(clean_data['energy_type'].dropna().unique())
energy_type

['插电式混合动力',
 '汽油',
 '纯电动',
 '油电混合',
 '增程式',
 '汽油+48V轻混系统',
 '汽油+90V轻混系统',
 '柴油',
 '汽油电驱',
 '汽油+24V轻混系统']

In [78]:
# 能源类型:汽油0，电动1，混合2
energy_type = list(clean_data['energy_type'].dropna().unique())
# 北京
# type_num = [0,2,1,2,2,2,2,0,2,2]

#深圳
type_num = [2,0,1,2,2,2,2,0,2,2]

#上海
# type_num = [2,0,1,2,2,2,2,0,2]
energy_type_num=dict(zip(energy_type,type_num))
clean_data['energy_type'] = clean_data['energy_type'].map(energy_type_num)

# 变速箱：手动0，自动1
clean_data['gearbox'] = clean_data['gearbox'].apply(lambda x : 1 if str(x)=='自动' else 0)

In [79]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39159 entries, 0 to 39215
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               39159 non-null  object 
 1   price               39159 non-null  float64
 2   price_new           39159 non-null  float64
 3   date_regi           39159 non-null  object 
 4   mileage             39159 non-null  float64
 5   official_endurance  5075 non-null   float64
 6   standard            39159 non-null  object 
 7   gearbox             39159 non-null  int64  
 8   num_trans           39159 non-null  float64
 9   license_location    39159 non-null  float64
 10  color               39159 non-null  float64
 11  motor_power         5075 non-null   float64
 12  battery_capacity    4931 non-null   float64
 13  battery_type        5076 non-null   object 
 14  energy_type         39158 non-null  float64
 15  displacement        34083 non-null  float64
 16  keys

In [80]:
# 挑出油车
petrol = clean_data[clean_data['energy_type']==0]
# 删除电车特征
petrol = petrol.drop(['official_endurance','motor_power','battery_capacity','battery_type'],axis=1)

In [81]:
#挑出电车
electric = clean_data[clean_data['energy_type']==1]
# 均值motor_power、battery_capacity、official_endurance # 众数battery_type #删除displacement
electric['motor_power'].fillna(electric['motor_power'].mean(), inplace=True)
electric['battery_capacity'].fillna(electric['battery_capacity'].mean(), inplace=True)
electric['official_endurance'].fillna(electric['official_endurance'].mean(), inplace=True)
electric['battery_type'].fillna('三元锂电池', inplace=True)
electric = electric.drop(['displacement'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  electric['motor_power'].fillna(electric['motor_power'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  electric['battery_capacity'].fillna(electric['battery_capacity'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  electric['official_endurance'].fillna(electric['official_endurance'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.p

In [82]:
#挑出混合车
mixed = clean_data[clean_data['energy_type']==2]

In [83]:
# 写文件
def write_xlsx(df,path, sheetName):
    writer = pd.ExcelWriter(path, engine='openpyxl',mode='a', if_sheet_exists="overlay")
    df.to_excel(writer, sheet_name=sheetName)
    writer.close()

In [84]:
# 写入同一个文件三张表
path = '../data/cleaneddata/cleaned_car_all_shenzhen.xlsx'
pd.DataFrame().to_excel(path,sheet_name='total') # 先创建文件
write_xlsx(petrol,path,'petrol')
write_xlsx(electric, path,'electric')
write_xlsx(mixed,path,'mixed')
write_xlsx(clean_data, path,'total')

更改品牌日产为东风日产（单独运行）

In [2]:
df_p= pd.read_excel(os.path.join(os.getcwd(),'../data/cleaneddata/cleaned_car_all.xlsx'),sheet_name='petrol')
df_e =pd.read_excel(os.path.join(os.getcwd(),'../data/cleaneddata/cleaned_car_all.xlsx'),sheet_name='electric')
df_m =pd.read_excel(os.path.join(os.getcwd(),'../data/cleaneddata/cleaned_car_all.xlsx'),sheet_name='mixed')

In [48]:
df_p['brand']

0          荣威
1          丰田
2          宝骏
3          别克
4          现代
         ... 
66560      奔驰
66561      领克
66562      丰田
66563    凯迪拉克
66564      哈弗
Name: brand, Length: 66565, dtype: object

In [5]:
# 手动去掉北京品牌中的多余字
df_p['brand']=df_p['brand'].map(
     lambda x: re.sub(r'[级系]+','', x)).map(
     lambda x: re.sub(r'(风光)|(风行)|(风神)|(小康)|(EV)|(新能源)|(黑猫)|(传祺)|(欧尚)|(罗密欧)|(2多功能旅行车)|(2旅行车)|(ID.)|(e:)|(G列)|(大狗)|(初恋)|(EZS纯电动)|(跨越)|(白猫)|(蚂蚁)|(好猫)|(精灵#)|(轻型车)|(原力版)|(神兽)|(赤兔)|(CRV)|(UX)|(T300)|(Panamera)|(S90)|(UNIK)|(DX3)|(HS)|(XC60)|(A6L)|(X40)|(C30)','', x)).map(
     lambda x: re.sub(r'(启辰大|启辰星)','启辰', x)).map(
     lambda x: re.sub(r'(BEIJING汽车)','北京汽车', x)).map(
     lambda x: re.sub(r'(奔驰E)|(奔驰B)|(奔驰S)|(奔驰GLA)|(奔驰A)|(奔驰V)|(奔驰GLS)|(奔驰SLK)|(奔驰CLA)|(奔驰GLE)|(奔驰GLC)|(奔驰GLK)|(奔驰CLS)|(奔驰LC)|(奔驰R)|(奔驰C)','奔驰', x)).map(
      lambda x: re.sub(r'(领克01)|(领克06)|(领克05)|(领克09)','领克', x)).map(
      lambda x: re.sub(r'(宝马5)|(宝马2)|(宝马1)|(宝马X1新能源)|(宝马4)|(宝马X5)|(宝马7)|(宝马6)|(宝马3)|(宝马X1)','宝马', x)).map(
      lambda x: re.sub(r'(日产)','东风日产', x))

TypeError: expected string or bytes-like object

In [3]:
df_p['brand']=df_p['brand'].map(lambda x: re.sub(r'(日产)','东风日产', x))

TypeError: expected string or bytes-like object

In [45]:
df_e['brand']=df_e['brand'].map(lambda x: re.sub(r'(日产)','东风日产', x))


TypeError: expected string or bytes-like object

In [46]:
df_m['brand']=df_m['brand'].map(lambda x: re.sub(r'(日产)','东风日产', x))

In [None]:
# 写文件
def write_xlsx(df,path, sheetName):
    writer = pd.ExcelWriter(path, engine='openpyxl',mode='a', if_sheet_exists="overlay")
    df.to_excel(writer, sheet_name=sheetName)
    writer.close()

# 写入同一个文件三张表
path = '../data/cleaneddata/cleaned_car_v1.xlsx'
pd.DataFrame().to_excel(path,sheet_name='total') # 先创建文件
write_xlsx(df_p,path,'petrol')
write_xlsx(df_e, path,'electric')
write_xlsx(df_m,path,'mixed')