In [314]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re

# Data Cleaning

In [315]:
# raw_data是从原表读到的原始数据
raw_data = pd.read_excel(os.path.join(os.getcwd(),'../data/rawdata/car_all_shanghai.xls'))

In [316]:
raw_data = raw_data.rename(columns={
    '标题':'title',
    '价格':'price',
    '新车价格':'price_new',
    '首次上牌':'date_regi',
    '表显里程':'mileage', 
    '官方续航':'official_endurance',
    '排放标准':'standard',
    '变速箱':'gearbox',
    '过户次数':'num_trans',
    '车牌地':'license_location',
    '车身颜色':'color',
    '电动机总功率':'motor_power',
    '电池容量':'battery_capacity',
    '电池类型':'battery_type',
    '能源类型':'energy_type',
    '排量':'displacement',
    '钥匙数':'keys'
    })

In [317]:
print(raw_data.shape) # (39216, 17)
# 根据title去掉空行
not_null_raw_data = raw_data[~(raw_data['title'].isnull())]
print(not_null_raw_data.shape) # (39159, 17)
print('空行数量：',39216-39159)
print('查看每个列空值数量：')
print(not_null_raw_data.isnull().sum())

(26169, 17)
(26110, 17)
空行数量： 57
查看每个列空值数量：
title                     0
price                     0
price_new                 0
date_regi                 0
mileage                   0
official_endurance    23490
standard                  0
gearbox                   0
num_trans                 0
license_location          0
color                     0
motor_power           23490
battery_capacity      23490
battery_type          23490
energy_type               0
displacement           2620
keys                      0
dtype: int64


In [318]:
# 全部数据清洗
clean_data = not_null_raw_data.copy()
# title处理
clean_data['brand'] = clean_data['title'].str.split(' ',expand=True)[0]

In [319]:
# #去掉-
# clean_data['brand']=clean_data['brand'].map(lambda x: re.sub(r'[-·]','',x))
#
# # 手动去掉北京品牌中的多余字
# clean_data['brand']=clean_data['brand'].map(
#      lambda x: re.sub(r'[级系]+','', x)).map(
#      lambda x: re.sub(r'(风光)|(风行)|(风神)|(小康)|(EV)|(新能源)|(黑猫)|(传祺)|(欧尚)|(罗密欧)|(2多功能旅行车)|(2旅行车)|(ID.)|(e:)|(G列)|(大狗)|(初恋)|(EZS纯电动)|(跨越)|(白猫)|(蚂蚁)|(好猫)|(精灵#)|(轻型车)|(原力版)|(神兽)|(赤兔)','', x)).map(
#      lambda x: re.sub(r'(启辰大|启辰星)','启辰', x)).map(
#      lambda x: re.sub(r'(BEIJING汽车)','北京汽车', x))


In [320]:
#去掉-
clean_data['brand']=clean_data['brand'].map(lambda x: re.sub(r'[-·]','',x))

# 手动去掉深圳品牌中的多余字
clean_data['brand']=clean_data['brand'].map(
     lambda x: re.sub(r'[级系]+','', x)).map(
     lambda x: re.sub(r'(风光)|(风行)|(风神)|(小康)|(EV)|(新能源)|(黑猫)|(传祺)|(欧尚)|(罗密欧)|(2多功能旅行车)|(2旅行车)|(ID.)|(e:)|(G列)|(大狗)|(初恋)|(EZS纯电动)|(跨越)|(白猫)|(蚂蚁)|(好猫)|(精灵#)|(轻型车)|(原力版)|(神兽)|(赤兔)|(埃安)|(埃安)|(酷狗)|(风度)|(S5青春版)|(昌河)|(制造)|(ATSL2017款)|(绅宝)|(Macan2018款)|(3星骋)|(ATSL2017款)|(RX经典)|(2018款)|(A5翼舞)|(电马)|(集团)','', x)).map(
     lambda x: re.sub(r'(启辰大|启辰星)','启辰', x)).map(
     lambda x: re.sub(r'(BEIJING汽车)','北京汽车', x))

In [321]:
#去掉字母数字
def find_brand(x):
    """brand"""
    if re.search(r'[\u4e00-\u9fff]+', x):
        res = re.sub(r'( )*[0-9A-Za-z]+$','',x)
    else:
        res = x
    return res
clean_data['brand'] = clean_data['brand'].apply(find_brand)
clean_data['brand']

0          荣威
1         WEY
2         雪佛兰
3          日产
4          奇瑞
         ... 
26164      丰田
26165    凯迪拉克
26166      路虎
26167      哈弗
26168    小鹏汽车
Name: brand, Length: 26110, dtype: object

In [323]:
clean_data['brand'].unique()

array(['荣威', 'WEY', '雪佛兰', '日产', '奇瑞', '起亚', '马自达', '吉利汽车', '标致', '本田',
       '丰田', '别克', '奥迪', '特斯拉', '英菲尼迪', '蔚来', '大众', '宝骏', '长安', '领克',
       '宝马', '理想汽车', '五菱汽车', 'Jeep', '力帆汽车', 'MINI', '几何汽车', '哪吒汽车', '现代',
       '奔驰', '小鹏汽车', '凯迪拉克', '广汽', '沃尔沃', '东风', '福特', '哈弗', '斯柯达', '铃木',
       '比亚迪', '雷诺', '捷豹', '零跑汽车', '启辰', '北汽', '雪铁龙', '路虎', '合众汽车', '阿尔法',
       '林肯大陆', '新宝骏', '红旗', '林肯', '威马汽车', '江淮', '欧拉', '捷达', 'smart',
       '北京汽车', '坦克', '上汽', '名爵', '雷克萨斯', '斯巴鲁', '东南', '极氪', '众泰', '保时捷',
       '玛莎拉蒂', '奔腾', '三菱', '中华', '北京', 'SERES赛力斯', '岚图汽车', '国机智骏', '观致',
       '飞凡汽车', '腾势', '上汽大通', '猎豹汽车', '菲亚特', '驭胜', '凯翼', '吉利', '宝沃',
       '星途揽月', '思铭', 'AITO', '道奇', '一汽', '海马', '思皓', '电咖', '星途凌云',
       'SWM斯威汽车', '理念', '欧宝', '陆风', '天美汽车', '纳智捷', '江铃', '汉腾汽车', '福田',
       '星途追风', '捷途', 'Polestar', '百智', '凌宝汽车', '全球鹰', '依维柯', '君马汽车', '讴歌',
       '金杯快运', 'R汽车', '北汽幻速', '路特斯', '华晨新日', '高合汽车', 'ALPINA', '金杯',
       '云度π', 'DS', '2019款', '知豆', '福特福克斯', '比速汽车', '长城', '合创', 'MG',
 

In [324]:
# 去掉单位
clean_data['mileage'] = clean_data['mileage'].str.replace('万公里','').apply(lambda x :float(x.replace('公里',''))/10000 if '公里' in str(x) else x).astype(np.float64)
clean_data['official_endurance'] = clean_data['official_endurance'].str.replace('km','').replace('-',np.nan).astype(np.float64)
clean_data['num_trans'] = clean_data['num_trans'].str.replace('次','').replace('-',np.nan).astype(np.float64)
clean_data['battery_capacity'] = clean_data['battery_capacity'].str.replace('kWh','').replace('-',np.nan).astype(np.float64)
clean_data['motor_power'] = clean_data['motor_power'].str.replace('kw','').replace('-',np.nan).astype(np.float64)
clean_data['keys'] = clean_data['keys'].str.replace('把','').astype(np.float64)

# location 一线城市（北上广深）为1，其他城市为0
clean_data['license_location'] = clean_data['license_location'].str.split('[(（]',expand=True)[0]
clean_data['license_location'] = clean_data['license_location'].apply(lambda x : 1 if x in ['上海','北京','广州','深圳'] else 0).astype(np.float64)

# color 黑色、白色、深灰色、银灰色为0，其他颜色为1
clean_data['color'] = clean_data['color'].apply(lambda x : 0 if x in ['白色','黑色','深灰色','银灰色'] else 1).astype(np.float64)

# displace：T*1.4变成L，方便分析，排放越大性能越好
clean_data['displacement'] = clean_data['displacement'].astype(str).apply(lambda x : float(x.replace('T',''))*1.4 if 'T' in x else float(x.replace('L',''))).astype(np.float64)

# 能源类型:汽油0，电动1，混合2
energy_type = list(clean_data['energy_type'].dropna().unique())
type_num = [2,0,1,2,2,2,2,0,2,2]
energy_type_num=dict(zip(energy_type,type_num))
clean_data['energy_type'] = clean_data['energy_type'].map(energy_type_num)

# 变速箱：手动0，自动1
clean_data['gearbox'] = clean_data['gearbox'].apply(lambda x : 1 if str(x)=='自动' else 0)

In [325]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26110 entries, 0 to 26168
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               26110 non-null  object 
 1   price               26110 non-null  float64
 2   price_new           26110 non-null  float64
 3   date_regi           26110 non-null  object 
 4   mileage             26110 non-null  float64
 5   official_endurance  2619 non-null   float64
 6   standard            26110 non-null  object 
 7   gearbox             26110 non-null  int64  
 8   num_trans           26110 non-null  float64
 9   license_location    26110 non-null  float64
 10  color               26110 non-null  float64
 11  motor_power         2619 non-null   float64
 12  battery_capacity    2549 non-null   float64
 13  battery_type        2620 non-null   object 
 14  energy_type         26110 non-null  int64  
 15  displacement        23490 non-null  float64
 16  keys

In [326]:
# 挑出油车
petrol = clean_data[clean_data['energy_type']==0]
# 删除电车特征
petrol_not_null = petrol.drop(['official_endurance','motor_power','battery_capacity','battery_type'],axis=1)

In [327]:
#挑出电车
electric = clean_data[clean_data['energy_type']==1]
# 均值motor_power、battery_capacity、official_endurance # 众数battery_type #删除displacement
electric['motor_power'].fillna(electric['motor_power'].mean(), inplace=True)
electric['battery_capacity'].fillna(electric['battery_capacity'].mean(), inplace=True)
electric['official_endurance'].fillna(electric['official_endurance'].mean(), inplace=True)
electric['battery_type'].fillna('三元锂电池', inplace=True)
electric = electric.drop(['displacement'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  electric['motor_power'].fillna(electric['motor_power'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  electric['battery_capacity'].fillna(electric['battery_capacity'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  electric['official_endurance'].fillna(electric['official_endurance'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.p

In [328]:
#挑出混合车
mixed = clean_data[clean_data['energy_type']==2]

In [None]:
#去掉表的第二列


In [329]:
# 写文件
def write_xlsx(df,path, sheetName):
    writer = pd.ExcelWriter(path, engine='openpyxl',mode='a', if_sheet_exists="overlay")
    df.to_excel(writer, sheet_name=sheetName)
    writer.close()

In [330]:
# 写入同一个文件三张表
path = '../data/cleaneddata/cleaned_car_all_shanghai.xlsx'
pd.DataFrame().to_excel(path,sheet_name='total') # 先创建文件
write_xlsx(petrol,path,'petrol')
write_xlsx(electric, path,'electric')
write_xlsx(mixed,path,'mixed')
write_xlsx(clean_data, path,'total')