In [1]:
# Created by Claude
# http://www.tianqihoubao.com/

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import datetime
import numpy as np
import re

In [2]:
def get_weather_data(year, month, city="shanghai", timeout=10):
    """Scrape one month of daily weather history from tianqihoubao.com.

    Args:
        year: Year used to build the page URL.
        month: Month number; zero-padded to two digits in the URL.
        city: City slug inserted into the URL path. Defaults to "shanghai",
            which reproduces the previously hard-coded URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A list with one dict per data row of the weather table, with keys
        ``date``, ``weather``, ``low_temperature``, ``high_temperature`` and
        ``windPower``. The two temperature values are taken from
        ``clean_temperature`` as-is. Returns an empty list (after printing
        the error) if the page cannot be fetched or parsed.
    """
    url = f"http://www.tianqihoubao.com/lishi/{city}/month/{year}{month:02d}.html"

    try:
        # Send a browser-like User-Agent with the request.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail early on 4xx/5xx instead of parsing an error page as data.
        response.raise_for_status()
        response.encoding = 'gbk'  # decode the page explicitly as GBK

        soup = BeautifulSoup(response.text, 'html.parser')

        # Locate the weather table; report clearly when the layout changed.
        table = soup.find('table', class_='b')
        if table is None:
            raise ValueError("weather table (class 'b') not found in page")

        weather_data = []

        for tr in table.find_all('tr')[1:]:  # skip the header row
            tds = tr.find_all('td')
            if len(tds) < 4:
                continue

            temp_data = clean_temperature(tds[2].text.strip())
            weather_data.append({
                'date': tds[0].text.strip(),
                'weather': tds[1].text.strip(),
                'low_temperature': temp_data['low_temp'],
                'high_temperature': temp_data['high_temp'],
                'windPower': tds[3].text.strip()
            })

        return weather_data

    except Exception as e:
        # Deliberate best-effort: one failing month must not abort the
        # caller's loop, so report the problem and return no rows.
        print(f"Error fetching data for {year}-{month}: {str(e)}")
        return []

def clean_temperature(temp_str):
    """Parse a temperature cell such as ``"29℃ / 19℃"`` into two floats.

    The text is stripped of ``℃`` and split on ``/``; the first part is
    returned under ``low_temp`` and the second under ``high_temp``.

    NOTE(review): the notebook's captured output shows the first value being
    the larger one, so the key names look swapped relative to the data. The
    post-processing cell relies on the current order, so it is kept here;
    confirm before renaming.

    Args:
        temp_str: Raw text of the temperature cell.

    Returns:
        ``{'low_temp': float, 'high_temp': float}`` when exactly two numeric
        parts are found, otherwise both values are ``None``.
    """
    try:
        temp_range = temp_str.replace('℃', '').split('/')
        if len(temp_range) == 2:
            low = float(temp_range[0].strip())
            high = float(temp_range[1].strip())
            return {'low_temp': low, 'high_temp': high}
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: input is not a str (e.g. None, bytes);
        # ValueError: a part is not a number. Anything else (notably
        # KeyboardInterrupt) must propagate instead of being swallowed.
        pass
    return {'low_temp': None, 'high_temp': None}


In [3]:
# Year and months to scrape
year = 2017
months = [5, 6, 7, 8]

# Collects the daily records of every requested month
all_weather_data = []

# Fetch month by month, pausing briefly between requests
for month in months:
    print(f"Fetching data for {year}-{month}")
    monthly_data = get_weather_data(year, month)
    all_weather_data += monthly_data
    time.sleep(0.2)

# Turn the collected records into a DataFrame
df = pd.DataFrame(all_weather_data)
    

Fetching data for 2017-5
Fetching data for 2017-6
Fetching data for 2017-7
Fetching data for 2017-8


In [4]:
# Display the scraped DataFrame so the raw values can be inspected before cleaning.
df

Unnamed: 0,date,weather,low_temperature,high_temperature,windPower
0,2017年05月01日,多云\r\n /小雨,29.0,19.0,东南风 ≤3级\r\n ...
1,2017年05月02日,阵雨\r\n /小雨,21.0,18.0,东风 3-4级\r\n ...
2,2017年05月03日,阴\r\n /小雨,22.0,18.0,东南风 ≤3级\r\n ...
3,2017年05月04日,小雨\r\n /阴,21.0,18.0,东南风 ≤3级\r\n ...
4,2017年05月05日,阴\r\n /多云,22.0,17.0,西北风 ≤3级\r\n ...
...,...,...,...,...,...
118,2017年08月27日,小雨\r\n /多云,33.0,27.0,东南风 ≤3级\r\n ...
119,2017年08月28日,晴\r\n /多云,36.0,27.0,南风 ≤3级\r\n ...
120,2017年08月29日,大雨\r\n /大雨,32.0,26.0,北风 3-4级\r\n ...
121,2017年08月30日,阴\r\n /阴,30.0,25.0,东北风 ≤3级\r\n ...


In [5]:
# Post processing: convert the scraped strings into numeric daily features.
WEATHER_COLUMNS = ['date', 'isRainy', 'highTemp', 'lowTemp', 'avgTemp', 'windPower']


def _mean_wind_level(text):
    """Return the mean of all wind-level numbers in ``text``, rounded to 2 places.

    Uses ``\\d+`` so multi-digit levels such as "10-11" are read as 10 and 11
    rather than as separate digits. Returns NaN when no number is present.
    """
    levels = [float(num) for num in re.findall(r'\d+', str(text))]
    return round(float(np.mean(levels)), 2) if levels else np.nan


if df.empty:
    # Nothing was scraped: keep the expected schema so later cells still run.
    myDataFrame = pd.DataFrame(columns=WEATHER_COLUMNS)
else:
    # Address columns by name rather than by position. The scraped output
    # shows the value stored under 'low_temperature' is the larger of the
    # two, so take the row-wise max/min instead of trusting the key names.
    temps = df[['low_temperature', 'high_temperature']].apply(pd.to_numeric, errors='coerce')
    high_temp = temps.max(axis=1)
    low_temp = temps.min(axis=1)

    myDataFrame = pd.DataFrame({
        # '2017年05月01日' -> '20170501'
        'date': pd.to_datetime(df['date'], format='%Y年%m月%d日').dt.strftime('%Y%m%d'),
        # 1 when any part of the weather description mentions rain ('雨')
        'isRainy': df['weather'].str.contains('雨', regex=False, na=False).astype(int),
        'highTemp': high_temp,
        'lowTemp': low_temp,
        'avgTemp': (high_temp + low_temp) / 2,
        'windPower': df['windPower'].map(_mean_wind_level),
    }, columns=WEATHER_COLUMNS)

myDataFrame

Unnamed: 0,date,isRainy,highTemp,lowTemp,avgTemp,windPower
0,20170501,1,29.0,19.0,24.0,3.00
1,20170502,1,21.0,18.0,19.5,3.50
2,20170503,1,22.0,18.0,20.0,3.33
3,20170504,1,21.0,18.0,19.5,3.00
4,20170505,0,22.0,17.0,19.5,3.00
...,...,...,...,...,...,...
118,20170827,1,33.0,27.0,30.0,3.00
119,20170828,0,36.0,27.0,31.5,3.00
120,20170829,1,32.0,26.0,29.0,3.33
121,20170830,0,30.0,25.0,27.5,3.00


In [6]:
# Save the cleaned table as CSV without writing the row index.
myDataFrame.to_csv('../../MetaData/shanghai_weather.csv', index=False)