### Weather Data Scraping

In [17]:
import pandas as pd
import requests
from datetime import timedelta
from time import sleep
from bs4 import BeautifulSoup as bs
import numpy as np
from tqdm import tqdm
import warnings
# Ignore warnings
warnings.filterwarnings("ignore", category=UserWarning)

year = 2024
month = 5
day = 29
hour = 10
path = "D:\\OneDrive\\Jupyter_Cloud\\Machine_learning\\Big Data\\Airline-Analysis\\Weather_api_data\\"
url = f'https://www.wx-now.com/Archival/ZGSZ/{year}/{month}/{day}/{hour}/00'

#### Approach 1: `pd.read_html`

In [14]:
def Generate_Data(month, day, hour):
    url = f'https://www.wx-now.com/Archival/ZGSZ/{year}/{month}/{day}/{hour}/00'
    
    df = pd.read_html(url, encoding='gbk', header=0)[0]
    Combine = df.copy()
    data = np.array(Combine.columns)
    data_df = pd.DataFrame([data], columns=[0, 1])
    Combine.columns = [0, 1]
    Combine = pd.concat([data_df, Combine]).reset_index(drop=True)
    Combine.to_excel(path + f'weather_data_2024_{month}_{day}_{hour}.xlsx', index=False)

In [18]:
# UTC time
begin_date = pd.to_datetime('2024-4-19 00:00') - timedelta(hours=8)
end_date = pd.to_datetime('2024-5-20 00:00') - timedelta(hours=8)
date_index = begin_date
date_list = []
while date_index != end_date:
    month = date_index.month
    day = date_index.day
    hour = date_index.hour
    date_list.append([month, day, hour])
    date_index += timedelta(hours=1)

In [19]:
for date in tqdm(date_list, ncols=100):
    Generate_Data(date[0], date[1], date[2])
    sleep(3)

100%|███████████████████████████████████████████████████████████| 744/744 [1:29:22<00:00,  7.21s/it]


#### Approach 2: Request+Beautifulsoup

In [5]:
r = requests.request('GET', url=url)

In [6]:
print(r.encoding)
print(r.apparent_encoding)
# r.encoding是根据http header推测出的编码方式,如果没有header,默认为ISO-8859-1。
# r.apparent_encoding是根据内容分析出来的编码方式,用这个代替r.encoding可以解析出中文。
# 可以在发送request请求前，先设置响应的编码方式。
# 如：r.encoding = "utf-8" 或"iso-8859-1"或"ascii"或"unicode-escape"

utf-8
Windows-1252


In [7]:
print(type(r.text))
print(type(r.content))

<class 'str'>
<class 'bytes'>


In [8]:
Weather_text = r.content.decode()

In [9]:
Weather_text



In [10]:
Weather_soup = bs(Weather_text, 'html.parser')
Weather_soup


<!DOCTYPE html>

<html lang="en">
<head>
<title>Weather Now</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="Inner Drive Techology, Chicago" name="Author">
<base href="/"/>
<link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" integrity="sha384-1BmE4kWBq78iYhFldvKuhfTAU6auU8tT94WrHftjDbrCEXSU1oBoqyl2QvZ6jIW3" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/metismenu/dist/metisMenu.min.css" rel="stylesheet"/>
<link href="css/site.css" rel="stylesheet"/>
<link href="css/weather-now-5.css" rel="stylesheet"/>
<link href="_content/Blazorise/blazorise.css" rel="stylesheet"/>
<link href="_content/Blazorise.Bootstrap/blazorise.bootstrap.css" rel="stylesheet"/>
<link href="InnerDrive.Weather.WebUI.styles.css" rel="stylesheet" type="text/css"/>
<script crossorigin="anonymous" src="https://kit.fontawesome.com/a6344e4920.js"></script>
<!-- Global site tag (gtag.

In [11]:
Weather_soup.prettify()



##### Weather Features

In [12]:
Weather_table = Weather_soup.find("table", {"class":"b-table table table-borderless"})
Weather_table

<table class="b-table table table-borderless"><tbody>
<tr><th scope="row">Temperature</th><td>29.0°C (84.2°F)</td></tr>
<tr><th scope="row">Dew point</th><td>20.0°C (68.0°F)</td></tr><tr><th scope="row">Winds</th>
<td>7.8 kts (4 m/s, 8.9 mph) Vrb </td></tr><tr><th scope="row">Winds variable</th><td>Winds variable from East-Northeast (70°) to Southeast (130°)</td></tr>
<tr><th scope="row">Visibility</th><td>6.2 mi. (10 km)</td></tr>
<tr><th scope="row">Clouds</th>
<td><strong>Ceiling: </strong>Broken clouds 4,000 ft. (1,219 m)</td></tr>
<tr><th scope="row">Altimeter</th><td>29.65 in.Hg (1004 mB)</td></tr>
<tr><th scope="row">Relative humidity</th><td>58 %</td></tr>
<tr><th scope="row">Density altitude</th><td>2,236 ft. (682 m)</td></tr>
<tr><th scope="row">Remarks</th>
<td>No significant changes expected in the next two hours<br/></td></tr><tr><th scope="row">Flight conditions</th>
<td class="table-success">Visual Meteorological Conditions</td></tr><tr><th scope="row">Raw data</th><td>Z

In [13]:
Weather_tr = Weather_table.find_all("tr")
Weather_tr

[<tr><th scope="row">Temperature</th><td>29.0°C (84.2°F)</td></tr>,
 <tr><th scope="row">Dew point</th><td>20.0°C (68.0°F)</td></tr>,
 <tr><th scope="row">Winds</th>
 <td>7.8 kts (4 m/s, 8.9 mph) Vrb </td></tr>,
 <tr><th scope="row">Winds variable</th><td>Winds variable from East-Northeast (70°) to Southeast (130°)</td></tr>,
 <tr><th scope="row">Visibility</th><td>6.2 mi. (10 km)</td></tr>,
 <tr><th scope="row">Clouds</th>
 <td><strong>Ceiling: </strong>Broken clouds 4,000 ft. (1,219 m)</td></tr>,
 <tr><th scope="row">Altimeter</th><td>29.65 in.Hg (1004 mB)</td></tr>,
 <tr><th scope="row">Relative humidity</th><td>58 %</td></tr>,
 <tr><th scope="row">Density altitude</th><td>2,236 ft. (682 m)</td></tr>,
 <tr><th scope="row">Remarks</th>
 <td>No significant changes expected in the next two hours<br/></td></tr>,
 <tr><th scope="row">Flight conditions</th>
 <td class="table-success">Visual Meteorological Conditions</td></tr>,
 <tr><th scope="row">Raw data</th><td>ZGSZ 291000Z 10004MPS 07

In [44]:
for tr in Weather_tr:
    print(tr)

<tr><th scope="row">Temperature</th><td>29.0°C (84.2°F)</td></tr>
<tr><th scope="row">Dew point</th><td>20.0°C (68.0°F)</td></tr>
<tr><th scope="row">Winds</th>
<td>7.8 kts (4 m/s, 8.9 mph) Vrb </td></tr>
<tr><th scope="row">Winds variable</th><td>Winds variable from East-Northeast (70°) to Southeast (130°)</td></tr>
<tr><th scope="row">Visibility</th><td>6.2 mi. (10 km)</td></tr>
<tr><th scope="row">Clouds</th>
<td><strong>Ceiling: </strong>Broken clouds 4,000 ft. (1,219 m)</td></tr>
<tr><th scope="row">Altimeter</th><td>29.65 in.Hg (1004 mB)</td></tr>
<tr><th scope="row">Relative humidity</th><td>58 %</td></tr>
<tr><th scope="row">Density altitude</th><td>2,236 ft. (682 m)</td></tr>
<tr><th scope="row">Remarks</th>
<td>No significant changes expected in the next two hours<br/></td></tr>
<tr><th scope="row">Flight conditions</th>
<td class="table-success">Visual Meteorological Conditions</td></tr>
<tr><th scope="row">Raw data</th><td>ZGSZ 291000Z 10004MPS 070V130 9999 BKN040 29/20 Q10

`find_all()` is a method used in the BeautifulSoup library, which is commonly used for web scraping in Python. This method is used to extract all the elements that match a specified set of criteria from the HTML or XML content.

For example, if you have a BeautifulSoup object called `soup`, you can use the `find_all()` method to find all instances of a certain tag, class, or other attribute within the HTML document, and it will return a list of all matching elements.

Here's an example of how you might use `find_all()` to extract all the hyperlinks (`<a>` tags) from a webpage:

```python
from bs4 import BeautifulSoup
import requests

# Make a request to the webpage
url = 'https://example.com'
page = requests.get(url)

# Create a BeautifulSoup object
soup = BeautifulSoup(page.content, 'html.parser')

# Find all the hyperlinks on the page
links = soup.find_all('a')

# Print the URLs of all the hyperlinks
for link in links:
    print(link.get('href'))
```

In this example, `soup.find_all('a')` returns a list of all the `<a>` tags found in the HTML content, and then we iterate through the list to print out the URLs.

This method is very powerful for extracting specific information from web pages when performing web scraping or parsing HTML documents.

In [14]:
len(Weather_tr)

13

In [25]:
print(Weather_tr[0])
print(Weather_tr[0].find_all("th"))
print(Weather_tr[0].find_all("td"))
print(Weather_tr[0].find_all("th")[0])
print(Weather_tr[0].find_all("th")[0].string)
print(Weather_tr[0].find_all("td")[0])
print(Weather_tr[0].find_all("td")[0].string)

<tr><th scope="row">Temperature</th><td>29.0°C (84.2°F)</td></tr>
[<th scope="row">Temperature</th>]
[<td>29.0°C (84.2°F)</td>]
<th scope="row">Temperature</th>
Temperature
<td>29.0°C (84.2°F)</td>
29.0°C (84.2°F)


In [56]:
print(Weather_tr[6])
print(Weather_tr[6].find_all("th"))
print(Weather_tr[6].find_all("td"))
print(Weather_tr[6].find_all("th")[0])
print(Weather_tr[6].find_all("th")[0].string)
print(Weather_tr[6].find_all("td")[0])
print(Weather_tr[6].find_all("td")[0].string)

<tr><th scope="row">Clouds</th>
<td><strong>Ceiling: </strong>Broken clouds 4,000 ft. (1,219 m)</td></tr>
[<th scope="row">Clouds</th>]
[<td><strong>Ceiling: </strong>Broken clouds 4,000 ft. (1,219 m)</td>]
<th scope="row">Clouds</th>
Clouds
<td><strong>Ceiling: </strong>Broken clouds 4,000 ft. (1,219 m)</td>
None


In [76]:
special = str(Weather_tr[6].find_all("td")[0])
special = bs(special, 'html.parser')
special.td.get_text()

'Ceiling: Broken clouds 4,000 ft. (1,219 m)'

In [67]:
html = '<td><strong>Ceiling: </strong>Broken clouds 4,000 ft. (1,219 m)</td>'
soup = bs(html, 'html.parser')

target_info = soup.td.get_text()
print(target_info)

Ceiling: Broken clouds 4,000 ft. (1,219 m)


In [66]:
Weather_tr[6].find_all("td")[0].td

In [16]:
for thx in Weather_tr[0]:
    print(thx, type(thx))

<th scope="row">Temperature</th> <class 'bs4.element.Tag'>
<td>29.0°C (84.2°F)</td> <class 'bs4.element.Tag'>


In [17]:
list(thx.string for thx in Weather_tr[0])

['Temperature', '29.0°C (84.2°F)']

In [55]:
Columns_1 = []
Values_1 = []
for trx1 in Weather_tr: # Layer 1
    print(trx1.find_all("th")[0])
    print(trx1.find_all("td")[0])
    print()
    # column = trx1.find_all("th")[0]
    # value = trx1.find_all("td")[0]
    # Columns_1.append(column)
    # Values_1.append(value)

<th scope="row">Temperature</th>
<td>29.0°C (84.2°F)</td>

<th scope="row">Heat index</th>

<th scope="row">Dew point</th>
<td>20.0°C (68.0°F)</td>

<th scope="row">Winds</th>
<td>7.8 kts (4 m/s, 8.9 mph) Vrb </td>

<th scope="row">Winds variable</th>
<td>Winds variable from East-Northeast (70°) to Southeast (130°)</td>

<th scope="row">Visibility</th>
<td>6.2 mi. (10 km)</td>

<th scope="row">Clouds</th>
<td><strong>Ceiling: </strong>Broken clouds 4,000 ft. (1,219 m)</td>

<th scope="row">Altimeter</th>
<td>29.65 in.Hg (1004 mB)</td>

<th scope="row">Relative humidity</th>
<td>58 %</td>

<th scope="row">Density altitude</th>
<td>2,236 ft. (682 m)</td>

<th scope="row">Remarks</th>
<td>No significant changes expected in the next two hours<br/></td>

<th scope="row">Flight conditions</th>
<td class="table-success">Visual Meteorological Conditions</td>

<th scope="row">Raw data</th>
<td>ZGSZ 291000Z 10004MPS 070V130 9999 BKN040 29/20 Q1004 NOSIG</td>



In [43]:
Values_1

['29.0°C (84.2°F)',
 '31.0°C (87.8°F)',
 '20.0°C (68.0°F)',
 '7.8 kts (4 m/s, 8.9 mph) Vrb ',
 'Winds variable from East-Northeast (70°) to Southeast (130°)',
 '6.2 mi. (10 km)',
 None,
 '29.65 in.Hg (1004 mB)',
 '58 %',
 '2,236 ft. (682 m)',
 None,
 'Visual Meteorological Conditions',
 'ZGSZ 291000Z 10004MPS 070V130 9999 BKN040 29/20 Q1004 NOSIG']

In [46]:
Columns_1

['Temperature',
 'Heat index',
 'Dew point',
 'Winds',
 'Winds variable',
 'Visibility',
 'Clouds',
 'Altimeter',
 'Relative humidity',
 'Density altitude',
 'Remarks',
 'Flight conditions',
 'Raw data']

In [47]:
df

Unnamed: 0,Temperature,29.0°C (84.2°F)
0,Heat index,31.0°C (87.8°F)
1,Dew point,20.0°C (68.0°F)
2,Winds,"7.8 kts (4 m/s, 8.9 mph) Vrb"
3,Winds variable,Winds variable from East-Northeast (70°) to So...
4,Visibility,6.2 mi. (10 km)
5,Clouds,"Ceiling: Broken clouds 4,000 ft. (1,219 m)"
6,Altimeter,29.65 in.Hg (1004 mB)
7,Relative humidity,58 %
8,Density altitude,"2,236 ft. (682 m)"
9,Remarks,No significant changes expected in the next tw...


In [51]:
Weather_df_1 = pd.DataFrame(np.array([Values_1]), columns=Columns_1)
Weather_df_1

Unnamed: 0,Temperature,Heat index,Dew point,Winds,Winds variable,Visibility,Clouds,Altimeter,Relative humidity,Density altitude,Remarks,Flight conditions,Raw data
0,29.0°C (84.2°F),31.0°C (87.8°F),20.0°C (68.0°F),"7.8 kts (4 m/s, 8.9 mph) Vrb",Winds variable from East-Northeast (70°) to So...,6.2 mi. (10 km),,29.65 in.Hg (1004 mB),58 %,"2,236 ft. (682 m)",,Visual Meteorological Conditions,ZGSZ 291000Z 10004MPS 070V130 9999 BKN040 29/2...


In [80]:
Weather_tr_str = str(Weather_tr).replace('[', '').replace(']', '')
Weather_tr_soup = bs(Weather_tr_str, 'html.parser')

This code snippet utilizes the BeautifulSoup library to parse HTML pages. It begins by finding all `<tr>` tags, which represent rows in an HTML table, within the BeautifulSoup object (named `soup`). Then, it iterates over each row:

1. First, it searches for the `<th>` tag within the current row with the attribute `scope='row'`, indicating a table header cell.
2. Next, it finds the `<td>` tag within the current row, representing a data cell in the table.
3. If both the table header and data cells are found, it prints out the text content of both cells (stripped of leading and trailing whitespace).

The `get_text(strip=True)` method is used to extract the text content from a BeautifulSoup Tag object, which represents an HTML element. Here's what each part does:

- `get_text()`: This method retrieves all the text within the tag, including any text within its children tags.
- `strip=True`: This parameter removes leading and trailing whitespace from the extracted text.

`td.get_text(strip=True)` specifically extracts the text from the `<td>` tag (data cell) and removes any leading or trailing whitespace before returning it. This ensures that you get clean, stripped text content from the data cell.

So, the purpose of this code is to iterate through each row of an HTML table, extract the table header and data, and print them out.

In [92]:
Columns_2 = []
Values_2 = []
for tr in Weather_tr_soup.find_all('tr'):
    th = tr.find('th', scope='row')
    td = tr.find('td')
    if th and td:
        column = th.get_text()
        value = td.get_text(strip=True)
        Columns_2.append(column)
        Values_2.append(value)
        print(f'{column}:', td.get_text(strip=False))

Temperature: 29.0°C (84.2°F)
Heat index: 31.0°C (87.8°F)
Dew point: 20.0°C (68.0°F)
Winds: 7.8 kts (4 m/s, 8.9 mph) Vrb 
Winds variable: Winds variable from East-Northeast (70°) to Southeast (130°)
Visibility: 6.2 mi. (10 km)
Clouds: Ceiling: Broken clouds 4,000 ft. (1,219 m)
Altimeter: 29.65 in.Hg (1004 mB)
Relative humidity: 58 %
Density altitude: 2,236 ft. (682 m)
Remarks: No significant changes expected in the next two hours
Flight conditions: Visual Meteorological Conditions
Raw data: ZGSZ 291000Z 10004MPS 070V130 9999 BKN040 29/20 Q1004 NOSIG


In [93]:
Weather_df_2 = pd.DataFrame(np.array([Values_2]), columns=Columns_2)
Weather_df_2

Unnamed: 0,Temperature,Heat index,Dew point,Winds,Winds variable,Visibility,Clouds,Altimeter,Relative humidity,Density altitude,Remarks,Flight conditions,Raw data
0,29.0°C (84.2°F),31.0°C (87.8°F),20.0°C (68.0°F),"7.8 kts (4 m/s, 8.9 mph) Vrb",Winds variable from East-Northeast (70°) to So...,6.2 mi. (10 km),"Ceiling:Broken clouds 4,000 ft. (1,219 m)",29.65 in.Hg (1004 mB),58 %,"2,236 ft. (682 m)",No significant changes expected in the next tw...,Visual Meteorological Conditions,ZGSZ 291000Z 10004MPS 070V130 9999 BKN040 29/2...


##### Weather Date

In [98]:
Date_value = Weather_soup.find('p', {'class': 'card-title-desc'})
Date_value = Date_value.get_text(strip=True)
Date_value

'Report from: Wednesday, May 29, 2024 6:00 PM CSTCurrent time: Thursday, May 30, 2024 3:49 PM CST'