In [1]:
import pandas as pd
import numpy as np
import arrow
import os

### 1.数据导入

In [2]:
# Directory that holds the 12306 remaining-ticket CSV dumps.
# The original absolute Windows path is kept as the default so existing setups
# keep working; set the TICKET_DATA_DIR environment variable to run elsewhere.
file_path = os.environ.get(
    'TICKET_DATA_DIR',
    r'D:\rui\code_analysis\课程资料\6. 综合专题二：基于高铁余票的客流行为特征及其效应分析\12306data',
)
# Ticket snapshot file for 2016-12-21.
file = os.path.join(file_path,'Ninghu-2016-12-21.csv')

In [3]:
# Column labels are supplied explicitly, so no row of the file is treated as a
# header (header=None is what pandas infers when names= is given).
ticket_columns = [
    '编号', '车次', '发车站', '到达站', '发车时间', '商务特等座',
    '动卧', '一等座', '二等座', '无座', '记录日期', '记录时间',
]
df = pd.read_csv(file, header=None, names=ticket_columns)

In [4]:
# Keep only rows whose departure time has an hour of 03-23.
# na=False drops rows with a missing departure time instead of producing a
# NaN-containing mask that pandas refuses to index with; .copy() makes the
# result an independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
departs_in_window = df['发车时间'].str.contains(r'0[3-9]:|1[0-9]:|2[0-3]:', na=False)
df = df[departs_in_window].copy()

### 2.数据整理

#### 2.1余票数量汇总

In [5]:
# Total remaining tickets = sum over the five seat-class columns,
# accumulated left to right in the same order as the columns appear.
seat_columns = ['商务特等座', '动卧', '一等座', '二等座', '无座']
total = df[seat_columns[0]]
for col in seat_columns[1:]:
    total = total + df[col]
df['余票总数'] = total
df.head()

Unnamed: 0,编号,车次,发车站,到达站,发车时间,商务特等座,动卧,一等座,二等座,无座,记录日期,记录时间,余票总数
0,54000G702940,G7029,南京,镇江,05:49,0,0,94,993,0,2016-12-21,00:00,1087
1,54000G703350,G7033,南京,镇江,06:06,0,0,43,493,160,2016-12-21,00:00,696
2,54000G710140,G7101,南京,镇江,06:11,0,0,46,509,164,2016-12-21,00:00,719
3,54000G703530,G7035,南京,镇江,06:31,0,0,48,493,167,2016-12-21,00:00,708
4,5l000G758100,G7581,南京南,镇江,06:45,10,0,24,417,147,2016-12-21,00:00,598


#### 2.2筛选发车前30分钟余票信息

In [7]:
# Stop-sale time = departure time minus 30 minutes, formatted as HH:mm.
# The conversion is done once per distinct departure time and then mapped back
# onto the column; this yields the same strings as parsing every row with
# arrow but avoids repeating the parse for each of the many duplicate times.
stop_sale_by_departure = {
    departure: arrow.get(departure, 'HH:mm').shift(minutes=-30).format('HH:mm')
    for departure in df['发车时间'].unique()
}
df['停售时间'] = df['发车时间'].map(stop_sale_by_departure)
df.head()

Unnamed: 0,编号,车次,发车站,到达站,发车时间,商务特等座,动卧,一等座,二等座,无座,记录日期,记录时间,余票总数,停售时间
0,54000G702940,G7029,南京,镇江,05:49,0,0,94,993,0,2016-12-21,00:00,1087,05:19
1,54000G703350,G7033,南京,镇江,06:06,0,0,43,493,160,2016-12-21,00:00,696,05:36
2,54000G710140,G7101,南京,镇江,06:11,0,0,46,509,164,2016-12-21,00:00,719,05:41
3,54000G703530,G7035,南京,镇江,06:31,0,0,48,493,167,2016-12-21,00:00,708,06:01
4,5l000G758100,G7581,南京南,镇江,06:45,10,0,24,417,147,2016-12-21,00:00,598,06:15


In [6]:
# Sanity check of the stop-sale conversion on the first remaining row.
# .iloc[0] is positional, so this still works if the row labelled 0 was
# removed by an earlier filter (plain [0] is a label lookup on this index).
arrow.get(df['发车时间'].iloc[0],'HH:mm').shift(minutes=-30).format('HH:mm')

'05:19'

#### 2.3记录时间==停售时间数据（高价值数据）

In [8]:
# Keep only the snapshots recorded exactly at the stop-sale time.
# .copy() detaches the result from the unfiltered frame so that the column
# assignments in the following cells do not raise SettingWithCopyWarning.
recorded_at_stop_sale = df['记录时间'] == df['停售时间']
df = df[recorded_at_stop_sale].copy()

In [9]:
# Preview the first five filtered rows.
df.head(5)

Unnamed: 0,编号,车次,发车站,到达站,发车时间,商务特等座,动卧,一等座,二等座,无座,记录日期,记录时间,余票总数,停售时间
503668,4f0000D3080F,D305,南京,苏州,03:30,0,0,0,117,18,2016-12-21,03:00,135,03:00
503914,4f0000D3080F,D305,南京,上海,03:30,0,0,0,91,18,2016-12-21,03:00,109,03:00
746951,240000D3130S,D313,南京,苏州,04:57,0,0,0,11,13,2016-12-21,04:27,24,04:27
747196,240000D3130S,D313,南京,上海,04:57,0,0,0,0,0,2016-12-21,04:27,0,04:27
850902,4f0000D3080F,D305,苏州,上海,05:34,0,0,0,84,18,2016-12-21,05:04,102,05:04


#### 2.4记录出发站、到达站数据，便于后续相邻站的判断

In [10]:
# Build a two-element list [departure station, arrival station] per row:
# join the two columns with a comma, then split on that comma again.
df['始终站'] = df['发车站'].str.cat(df['到达站'], sep=',').str.split(',')
df.head()

Unnamed: 0,编号,车次,发车站,到达站,发车时间,商务特等座,动卧,一等座,二等座,无座,记录日期,记录时间,余票总数,停售时间,始终站
503668,4f0000D3080F,D305,南京,苏州,03:30,0,0,0,117,18,2016-12-21,03:00,135,03:00,"[南京, 苏州]"
503914,4f0000D3080F,D305,南京,上海,03:30,0,0,0,91,18,2016-12-21,03:00,109,03:00,"[南京, 上海]"
746951,240000D3130S,D313,南京,苏州,04:57,0,0,0,11,13,2016-12-21,04:27,24,04:27,"[南京, 苏州]"
747196,240000D3130S,D313,南京,上海,04:57,0,0,0,0,0,2016-12-21,04:27,0,04:27,"[南京, 上海]"
850902,4f0000D3080F,D305,苏州,上海,05:34,0,0,0,84,18,2016-12-21,05:04,102,05:04,"[苏州, 上海]"


### 3.获取车次相邻站信息

#### 3.1获取所有沪宁沿线车次代码

In [32]:
import requests
import json

In [36]:
# Query the stop list of a single train to inspect the response structure.
url = 'https://kyfw.12306.cn/otn/czxx/queryByTrainNo?train_no=4f000D265704&from_station_telecode=EAY&to_station_telecode=LAJ&depart_date=2020-04-18'
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status surfaces HTTP errors before the body is parsed as JSON.
response = requests.get(url, timeout=10)
response.raise_for_status()
text = response.text
js = json.loads(text)
# The list of stops sits under data -> data in the response payload.
js_use = js['data']['data']
df_js = pd.DataFrame(js_use)
df_js

Unnamed: 0,start_station_name,arrive_time,station_train_code,station_name,train_class_name,service_type,start_time,stopover_time,end_station_name,station_no,isEnabled
0,西安北,----,D2657,西安北,动车,1.0,12:03,----,兰州西,1,True
1,,12:42,,岐山,,,12:44,2分钟,,2,True
2,,12:58,,宝鸡南,,,13:00,2分钟,,3,True
3,,13:48,,天水南,,,13:52,4分钟,,4,True
4,,15:08,,兰州西,,,15:08,----,,5,True


In [39]:
# Create a one-row placeholder DataFrame that will hold the values we need.
df_use = pd.DataFrame([['', '']], columns=['train_no', 'station_name'])
df_use

Unnamed: 0,train_no,station_name
0,,


In [49]:
# Fill the placeholder row: the train code, and the array of all stop names
# taken from the response frame (stored as a single cell value).
df_use['train_no'] = '4f000D265704'
stop_names = df_js['station_name'].values
df_use['station_name'] = df_use['station_name'].apply(lambda _unused: stop_names)
df_use

Unnamed: 0,train_no,station_name
0,4f000D265704,"[西安北, 岐山, 宝鸡南, 天水南, 兰州西]"


#### 3.2代码整理

In [91]:
import requests
import json
import numpy as np
import pandas as pd
import time

def get_station(train_no, from_station='SNH', to_station='NJH',
                depart_date='2020-04-18', timeout=10):
    """Fetch the stop list of one train from 12306 and append it to station.csv.

    One row is appended to ``../../file/subject/station.csv`` (no header, no
    index): the train code followed by the array of station names returned by
    the ``queryByTrainNo`` endpoint.

    Parameters
    ----------
    train_no : str
        Train code passed as the ``train_no`` query parameter.
    from_station, to_station : str
        Station telecodes sent with the query; defaults match the values that
        were previously hard-coded in the URL.
    depart_date : str
        Departure date sent with the query (``YYYY-MM-DD``).
    timeout : float
        Seconds to wait for the server before giving up on this train.

    Returns
    -------
    None
        Failures are reported on stdout and do not raise, so a crawl over many
        train codes keeps going when a single lookup fails.
    """
    url = 'https://kyfw.12306.cn/otn/czxx/queryByTrainNo'
    params = {
        'train_no': train_no,
        'from_station_telecode': from_station,
        'to_station_telecode': to_station,
        'depart_date': depart_date,
    }
    # NOTE(review): the Cookie below is a hard-coded browser session captured
    # when the notebook was written; it is kept unchanged here, but it should
    # be moved out of the notebook (e.g. into an environment variable).
    headers={
        "Accept": "text/html, application/xhtml+xml, application/xml; q=0.9, */*; q=0.8",
        "Connection": "Keep-Alive",
        "Cookie": "JSESSIONID=CE1E0910299807B0B5D6CE1AE4B37318; RAIL_DEVICEID=KIGorQUspNUQqHtpHqI4qcn_7n8neHBX58xwMxWqPc26JQIimw-uxozqOZEOb_cizOpzFiNc04ZF_sK3raRaFjH3Ao-IO7KIEkB24NnWaD_4ME1YfS8KrNhz637Z2NQIjcjzvKENUq6HfF0Thy8UHfZpscWtyFJf; RAIL_EXPIRATION=1587115346591; route=6f50b51faa11b987e576cdb301e545c4; BIGipServerotn=1490616586.50210.0000; _jc_save_toStation=%u5170%u5DDE%2CLZJ; _jc_save_wfdc_flag=dc; _jc_save_toDate=2020-04-14; _jc_save_fromDate=2020-04-14; _jc_save_fromStation=%u897F%u5B89%2CXAY",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362"
        }
    try:
        # The request is inside the try block so that a network error for one
        # train is reported instead of aborting the whole crawl.
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        js = json.loads(response.text)
        # The stop records sit under data -> data; keep only the station names.
        stations = pd.DataFrame(js['data']['data'])['station_name'].values
        # One row: the train code plus the whole array of stops in a single cell.
        df_use = pd.DataFrame({'train_no': [train_no], 'station_name': [stations]})
        df_use.to_csv(r'../../file/subject/station.csv',mode='a',index=False,header=False)
    except (requests.RequestException, ValueError, KeyError, TypeError, OSError) as exc:
        # ValueError covers invalid JSON; KeyError/TypeError cover a payload
        # without the expected data -> data -> station_name structure.
        print('to csv fail!', train_no, repr(exc))
    finally:
        # Throttle after every attempt, successful or not.
        time.sleep(0.5)

if __name__ == '__main__':
    # Crawl the stop list for every train code found in the '编号' column.
    # NOTE(review): codes are not de-duplicated here; a code that appears
    # several times in the file is fetched (and appended) several times.
    code_table = pd.read_csv(r'../../file/subject/sj2_data.csv')
    train_nos = code_table['编号'].values
    for train_no in train_nos:
        get_station(train_no)
    print('finished!')

to csv fail!
finished!


#### 3.3将停靠站点信息转换为列表

In [85]:
# Load the stop list per train: first column is the stop list (as text),
# second column is the train code. The file is read without a header row.
station_columns = ['停靠站', '编号']
station_file = os.path.join(file_path, 'NJ-SH-train_names.csv')
df_station = pd.read_csv(station_file, header=None, names=station_columns)
df_station.head()

Unnamed: 0,停靠站,编号
0,"[西安北, 渭南北, 华山北, 三门峡南, 洛阳龙门, 郑州东, 徐州, 南京, 苏州, 上海]",4f0000D3080F
1,"[北京南, 南京, 苏州, 上海]",240000D3130S
2,"[南京, 镇江, 丹阳, 常州, 无锡, 苏州, 上海]",54000G702940
3,"[南京南, 丹阳, 常州, 惠山, 苏州, 昆山南, 上海]",5l000G703100
4,"[南京, 镇江, 常州, 无锡, 无锡新区, 苏州, 苏州园区, 安亭北, 上海]",54000G703350


In [86]:
# Try the text-to-list conversion on the first row: drop the surrounding
# brackets, delete every quote and space character, then split on commas.
df_station['停靠站'].loc[0][1:-1].translate(str.maketrans('', '', "' ")).split(',')

['西安北', '渭南北', '华山北', '三门峡南', '洛阳龙门', '郑州东', '徐州', '南京', '苏州', '上海']

In [87]:
# Convert every stop-list string into a Python list of station names:
# strip the first and last character (the brackets), delete all quote and
# space characters, and split the remainder on commas.
# NOTE: this cell expects string values; re-running it after the column has
# already been converted to lists will fail.
strip_quotes_and_spaces = str.maketrans('', '', "' ")
df_station['停靠站'] = df_station['停靠站'].apply(
    lambda raw: raw[1:-1].translate(strip_quotes_and_spaces).split(',')
)
df_station.head()

Unnamed: 0,停靠站,编号
0,"[西安北, 渭南北, 华山北, 三门峡南, 洛阳龙门, 郑州东, 徐州, 南京, 苏州, 上海]",4f0000D3080F
1,"[北京南, 南京, 苏州, 上海]",240000D3130S
2,"[南京, 镇江, 丹阳, 常州, 无锡, 苏州, 上海]",54000G702940
3,"[南京南, 丹阳, 常州, 惠山, 苏州, 昆山南, 上海]",5l000G703100
4,"[南京, 镇江, 常州, 无锡, 无锡新区, 苏州, 苏州园区, 安亭北, 上海]",54000G703350


#### 3.4获取相邻站点信息

In [88]:
# Use the stop list of the row labelled 0 as a worked example.
lst = df_station['停靠站'].loc[0]
lst

['西安北', '渭南北', '华山北', '三门峡南', '洛阳龙门', '郑州东', '徐州', '南京', '苏州', '上海']

In [89]:
# Every stop except the last one: the "from" side of each adjacent pair.
origins = lst[:-1]
print(origins)

['西安北', '渭南北', '华山北', '三门峡南', '洛阳龙门', '郑州东', '徐州', '南京', '苏州']


In [90]:
# Every stop except the first one: the "to" side of each adjacent pair.
destinations = lst[1:]
print(destinations)

['渭南北', '华山北', '三门峡南', '洛阳龙门', '郑州东', '徐州', '南京', '苏州', '上海']


In [91]:
# Pair each stop with the one that follows it; zip stops at the shorter
# input, so the last stop is never used as a "from" station.
list(zip(lst, lst[1:]))

[('西安北', '渭南北'),
 ('渭南北', '华山北'),
 ('华山北', '三门峡南'),
 ('三门峡南', '洛阳龙门'),
 ('洛阳龙门', '郑州东'),
 ('郑州东', '徐州'),
 ('徐州', '南京'),
 ('南京', '苏州'),
 ('苏州', '上海')]

In [92]:
# Same adjacent pairs, but each pair as a two-element list instead of a tuple.
[[origin, destination] for origin, destination in zip(lst, lst[1:])]

[['西安北', '渭南北'],
 ['渭南北', '华山北'],
 ['华山北', '三门峡南'],
 ['三门峡南', '洛阳龙门'],
 ['洛阳龙门', '郑州东'],
 ['郑州东', '徐州'],
 ['徐州', '南京'],
 ['南京', '苏州'],
 ['苏州', '上海']]

In [93]:
# For every train, store the list of adjacent-stop pairs
# [[stop_0, stop_1], [stop_1, stop_2], ...] derived from its stop list.
df_station['相邻站'] = df_station['停靠站'].apply(
    lambda stops: [[origin, destination] for origin, destination in zip(stops, stops[1:])]
)
df_station.head()

Unnamed: 0,停靠站,编号,相邻站
0,"[西安北, 渭南北, 华山北, 三门峡南, 洛阳龙门, 郑州东, 徐州, 南京, 苏州, 上海]",4f0000D3080F,"[[西安北, 渭南北], [渭南北, 华山北], [华山北, 三门峡南], [三门峡南, 洛..."
1,"[北京南, 南京, 苏州, 上海]",240000D3130S,"[[北京南, 南京], [南京, 苏州], [苏州, 上海]]"
2,"[南京, 镇江, 丹阳, 常州, 无锡, 苏州, 上海]",54000G702940,"[[南京, 镇江], [镇江, 丹阳], [丹阳, 常州], [常州, 无锡], [无锡, ..."
3,"[南京南, 丹阳, 常州, 惠山, 苏州, 昆山南, 上海]",5l000G703100,"[[南京南, 丹阳], [丹阳, 常州], [常州, 惠山], [惠山, 苏州], [苏州,..."
4,"[南京, 镇江, 常州, 无锡, 无锡新区, 苏州, 苏州园区, 安亭北, 上海]",54000G703350,"[[南京, 镇江], [镇江, 常州], [常州, 无锡], [无锡, 无锡新区], [无锡..."


### 4.数据连接

In [95]:
# Attach each train's stop list and adjacent pairs to the ticket rows.
# Inner join on the train code: rows without a match on either side are dropped.
df_com = df.merge(df_station, how='inner', on='编号')
df_com.tail()

Unnamed: 0,编号,车次,发车站,到达站,发车时间,商务特等座,动卧,一等座,二等座,无座,记录日期,记录时间,余票总数,停售时间,始终站,停靠站,相邻站
2738,4f000G194200,G1939,苏州北,上海虹桥,22:52,10,0,22,426,0,2016-12-21,22:22,458,22:22,"[苏州北, 上海虹桥]","[西安北, 渭南北, 洛阳龙门, 郑州西, 郑州东, 开封北, 民权北, 商丘, 砀山南, ...","[[西安北, 渭南北], [渭南北, 洛阳龙门], [洛阳龙门, 郑州西], [郑州西, 郑..."
2739,240000G1570H,G157,常州北,无锡东,22:35,6,0,45,551,0,2016-12-21,22:05,602,22:05,"[常州北, 无锡东]","[北京南, 德州东, 济南西, 曲阜东, 徐州东, 南京南, 常州北, 无锡东, 上海虹桥]","[[北京南, 德州东], [德州东, 济南西], [济南西, 曲阜东], [曲阜东, 徐州东..."
2740,240000G1570H,G157,常州北,上海虹桥,22:35,6,0,42,535,0,2016-12-21,22:05,583,22:05,"[常州北, 上海虹桥]","[北京南, 德州东, 济南西, 曲阜东, 徐州东, 南京南, 常州北, 无锡东, 上海虹桥]","[[北京南, 德州东], [德州东, 济南西], [济南西, 曲阜东], [曲阜东, 徐州东..."
2741,240000G1570H,G157,无锡东,上海虹桥,22:54,8,0,63,656,0,2016-12-21,22:24,727,22:24,"[无锡东, 上海虹桥]","[北京南, 德州东, 济南西, 曲阜东, 徐州东, 南京南, 常州北, 无锡东, 上海虹桥]","[[北京南, 德州东], [德州东, 济南西], [济南西, 曲阜东], [曲阜东, 徐州东..."
2742,24000000G701,G7,南京南,上海虹桥,22:48,5,0,22,283,0,2016-12-21,22:18,310,22:18,"[南京南, 上海虹桥]","[北京南, 济南西, 南京南, 上海虹桥]","[[北京南, 济南西], [济南西, 南京南], [南京南, 上海虹桥]]"


#### 4.1判断始终站是否在相邻站列表当中；如果是，则保留该记录，从而过滤出每站的上下客情况
#### 特别注意，对DataFrame对象而不是其某列使用apply 

In [104]:
# Keep only the rows whose [departure, arrival] pair is one of the train's
# adjacent-stop pairs. The mask is built row by row and carries df_com's
# index so that it is always applied as a boolean row filter.
is_adjacent = pd.Series(
    [pair in neighbours for pair, neighbours in zip(df_com['始终站'], df_com['相邻站'])],
    index=df_com.index,
    dtype=bool,
)
df_neighbor = df_com[is_adjacent]
df_neighbor.head()

Unnamed: 0,编号,车次,发车站,到达站,发车时间,商务特等座,动卧,一等座,二等座,无座,记录日期,记录时间,余票总数,停售时间,始终站,停靠站,相邻站
0,4f0000D3080F,D305,南京,苏州,03:30,0,0,0,117,18,2016-12-21,03:00,135,03:00,"[南京, 苏州]","[西安北, 渭南北, 华山北, 三门峡南, 洛阳龙门, 郑州东, 徐州, 南京, 苏州, 上海]","[[西安北, 渭南北], [渭南北, 华山北], [华山北, 三门峡南], [三门峡南, 洛..."
2,4f0000D3080F,D305,苏州,上海,05:34,0,0,0,84,18,2016-12-21,05:04,102,05:04,"[苏州, 上海]","[西安北, 渭南北, 华山北, 三门峡南, 洛阳龙门, 郑州东, 徐州, 南京, 苏州, 上海]","[[西安北, 渭南北], [渭南北, 华山北], [华山北, 三门峡南], [三门峡南, 洛..."
3,240000D3130S,D313,南京,苏州,04:57,0,0,0,11,13,2016-12-21,04:27,24,04:27,"[南京, 苏州]","[北京南, 南京, 苏州, 上海]","[[北京南, 南京], [南京, 苏州], [苏州, 上海]]"
5,240000D3130S,D313,苏州,上海,06:50,0,0,0,0,0,2016-12-21,06:20,0,06:20,"[苏州, 上海]","[北京南, 南京, 苏州, 上海]","[[北京南, 南京], [南京, 苏州], [苏州, 上海]]"
6,54000G702940,G7029,南京,镇江,05:49,0,0,95,983,0,2016-12-21,05:19,1078,05:19,"[南京, 镇江]","[南京, 镇江, 丹阳, 常州, 无锡, 苏州, 上海]","[[南京, 镇江], [镇江, 丹阳], [丹阳, 常州], [常州, 无锡], [无锡, ..."


In [105]:
# Persist the adjacent-station records for later analysis; the row index is
# written as the first column (pandas default, stated explicitly here).
df_neighbor.to_csv(r'../../file/subject/station_ana.csv', index=True)