# 第一步，导入网页链接Excel文件，把每个网页转换成单独文本文件保存

In [1]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Step 1: read the URLs from an Excel workbook.
# NOTE: replace the path below with the real location of your Excel file,
# and make sure the sheet name and column name match the code below.
excel_file_path = 'doc_and_postdoc/doc/doc_position.xlsx'
sheet_name = 'Sheet1'  # change to the name of your sheet
url_column = '项目网站'   # change to the name of the column that holds the URLs


In [2]:
# Load the worksheet, then echo every URL so the import can be checked by eye.
df = pd.read_excel(excel_file_path, sheet_name=sheet_name, engine='openpyxl')
for url in df[url_column]:
    print(url)

https://kth.varbi.com/en/what:job/jobID:714021/type:job/where:51/apply:1
https://www.ncl.ac.uk/postgraduate/fees-funding/search-funding/?code=ph044
https://www.bcu.ac.uk/research/our-phds/phd-opportunities/the-impact-of-alternative-active-travel-on-peoples-sport-health-and-mobile-lives
https://efzu.fa.em2.oraclecloud.com/hcmUI/CandidateExperience/en/sites/CX_1/job/3173/
https://kth.varbi.com/en/what:job/jobID:714397/type:job/where:51/apply:1
https://vacatures.uva.nl/UvA/job/PhD-Biodiverse-Urban-Waterfronts/792378702/
https://www.kth.se/lediga-jobb/706790?l=en
https://www.ru.nl/en/working-at/job-opportunities/phd-candidate-spatial-planning-and-flood-risk-management
https://www.ncl.ac.uk/postgraduate/fees-funding/search-funding/?code=ph047
https://www.academictransfer.com/en/338098/phd-position-in-public-economics-of-housing-10-fte/?utm_source=ATemailalert&utm_medium=email&utm_campaign=job_click
https://www.uu.nl/en/organisation/working-at-utrecht-university/jobs/three-phd-positions-on-d

In [3]:
# Folder that will hold the scraped page text files.
folder_path = 'doc_and_postdoc/doc'
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check that another process could invalidate in between.
os.makedirs(folder_path, exist_ok=True)

In [4]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException 
import time

# URLs whose page text was fetched AND written to disk, in processing order.
valid_page = []

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without a GUI
options.add_argument('--disable-gpu')  # disable GPU acceleration (commonly paired with headless mode)
options.add_argument('--no-sandbox')

for i, url in enumerate(df[url_column]):

    print(f"Processing: {i}")
    print(url)

    try:
        # A fresh Chrome WebDriver is started for every URL and used as a
        # context manager so it is cleaned up when the block exits.
        with webdriver.Chrome(options=options) as driver:
            driver.get(url)

            # Give JavaScript some time to load and perform any redirects.
            time.sleep(5)

            # Grab the rendered page source.
            page_source = driver.page_source

        # Parse the page with BeautifulSoup.
        soup = BeautifulSoup(page_source, 'html.parser')

        # Remove tags that do not carry the main content.
        for tag in soup(['script', 'style', 'header', 'nav', 'footer', 'aside']):
            tag.extract()

        # Extract the plain text (no HTML tags), one fragment per line.
        page_content = soup.get_text(separator='\n', strip=True)

        # File name uses the row index zero-padded to at least 4 digits.
        file_name = f'page_{str(i).zfill(4)}.txt'
        file_path = os.path.join(folder_path, file_name)

        # Write the extracted text to its own file.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(page_content)

        # Record the URL only after its text file was written, so the list
        # never contains a URL that has no matching page file.
        valid_page.append(url)

        print("---- collected. Save as " + file_name)

    except TimeoutException:
        # The page took too long; report it and move on to the next URL.
        print(f"Timeout when accessing {url}. Skipping...")

    except Exception as e:
        # Best-effort scraping: report any other failure and keep going.
        print(f"An error occurred when processing {url}: {str(e)}. Skipping...")


print("web information collected")

Processing: 0
https://kth.varbi.com/en/what:job/jobID:714021/type:job/where:51/apply:1
---- collected. Save as page_0000.txt
Processing: 1
https://www.ncl.ac.uk/postgraduate/fees-funding/search-funding/?code=ph044
---- collected. Save as page_0001.txt
Processing: 2
https://www.bcu.ac.uk/research/our-phds/phd-opportunities/the-impact-of-alternative-active-travel-on-peoples-sport-health-and-mobile-lives
---- collected. Save as page_0002.txt
Processing: 3
https://efzu.fa.em2.oraclecloud.com/hcmUI/CandidateExperience/en/sites/CX_1/job/3173/
---- collected. Save as page_0003.txt
Processing: 4
https://kth.varbi.com/en/what:job/jobID:714397/type:job/where:51/apply:1
---- collected. Save as page_0004.txt
Processing: 5
https://vacatures.uva.nl/UvA/job/PhD-Biodiverse-Urban-Waterfronts/792378702/
---- collected. Save as page_0005.txt
Processing: 6
https://www.kth.se/lediga-jobb/706790?l=en
---- collected. Save as page_0006.txt
Processing: 7
https://www.ru.nl/en/working-at/job-opportunities/phd-ca

In [5]:
print(valid_page) # inspect the list of valid page URLs

['https://kth.varbi.com/en/what:job/jobID:714021/type:job/where:51/apply:1', 'https://www.ncl.ac.uk/postgraduate/fees-funding/search-funding/?code=ph044', 'https://www.bcu.ac.uk/research/our-phds/phd-opportunities/the-impact-of-alternative-active-travel-on-peoples-sport-health-and-mobile-lives', 'https://efzu.fa.em2.oraclecloud.com/hcmUI/CandidateExperience/en/sites/CX_1/job/3173/', 'https://kth.varbi.com/en/what:job/jobID:714397/type:job/where:51/apply:1', 'https://vacatures.uva.nl/UvA/job/PhD-Biodiverse-Urban-Waterfronts/792378702/', 'https://www.kth.se/lediga-jobb/706790?l=en', 'https://www.ru.nl/en/working-at/job-opportunities/phd-candidate-spatial-planning-and-flood-risk-management', 'https://www.ncl.ac.uk/postgraduate/fees-funding/search-funding/?code=ph047', 'https://www.academictransfer.com/en/338098/phd-position-in-public-economics-of-housing-10-fte/?utm_source=ATemailalert&utm_medium=email&utm_campaign=job_click', 'https://www.uu.nl/en/organisation/working-at-utrecht-universi

In [6]:
# Persist the valid URLs to a one-column CSV file.
import csv
valid_page_path = 'doc_and_postdoc/doc/valid_page.csv'

with open(valid_page_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    # Header row first, then one single-cell row per URL.
    csv_writer.writerow(['valid_page'])
    csv_writer.writerows([page_url] for page_url in valid_page)

# 第二步，调用gpt，把每个txt内容总结概括

In [10]:
!pip install --upgrade openai #注意每次GPT版本更新

Collecting openai
  Obtaining dependency information for openai from https://files.pythonhosted.org/packages/19/50/5c4a8bdc5891d18d8e08a5d6c6a157dd0edfe0263470a32ba6e955b72b28/openai-1.23.1-py3-none-any.whl.metadata
  Downloading openai-1.23.1-py3-none-any.whl.metadata (21 kB)
Downloading openai-1.23.1-py3-none-any.whl (310 kB)
   ---------------------------------------- 0.0/311.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/311.0 kB ? eta -:--:--
   --- ------------------------------------ 30.7/311.0 kB 1.3 MB/s eta 0:00:01
   ------- ------------------------------- 61.4/311.0 kB 544.7 kB/s eta 0:00:01
   ---------- ---------------------------- 81.9/311.0 kB 657.6 kB/s eta 0:00:01
   --------------- ---------------------- 122.9/311.0 kB 722.1 kB/s eta 0:00:01
   -------------------- ----------------- 163.8/311.0 kB 701.4 kB/s eta 0:00:01
   --------------------- ---------------- 174.1/311.0 kB 615.9 kB/s eta 0:00:01
   ----------------------- -------------- 194.6/

In [11]:
from openai import OpenAI
import os
import csv


from dotenv import load_dotenv, find_dotenv
# Locate and load a local .env file; the boolean result is discarded.
# NOTE(review): presumably this is where the OpenAI credentials live — confirm.
_ = load_dotenv(find_dotenv()) # read local .env file


In [12]:
# Helper that sends one prompt to the chat model and returns its reply.
def get_response(prompt, model="gpt-4", temperature=0.1):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Chat model name passed to the API; defaults to "gpt-4".
        temperature: Sampling temperature passed to the API; defaults to 0.1.

    Returns:
        The content of the first choice with surrounding whitespace stripped,
        or an empty string when the reply carries no text content.
    """
    # Local import so the function does not depend on cell execution order.
    import os

    messages = [{"role": "user", "content": prompt}]

    # Prefer the key from the environment (the notebook loads a .env file)
    # so the secret does not have to live in the source; fall back to the
    # original literal to stay backward-compatible.
    api_key = os.environ.get("OPENAI_API_KEY") or 'MY_OPENAI_API_KEY'
    client = OpenAI(api_key=api_key)

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )

    # Keep only the assistant's reply; content may be None, so guard before
    # stripping.
    content = response.choices[0].message.content
    return (content or "").strip()

In [13]:
# Reload the list of valid URLs that was saved to CSV earlier.

# `with` guarantees the file is closed (the handle used to be left open);
# newline='' is the mode the csv module expects for its file objects.
with open('doc_and_postdoc/doc/valid_page.csv', newline='') as f:
    csv_reader = csv.reader(f)
    next(csv_reader, None)  # skip the header row, tolerate an empty file
    # First cell of every non-empty row is the URL; blank rows are ignored.
    valid_page = [line[0] for line in csv_reader if line]

In [14]:
# Show the URLs that were read back from the CSV file.
print(valid_page)

['https://kth.varbi.com/en/what:job/jobID:714021/type:job/where:51/apply:1', 'https://www.ncl.ac.uk/postgraduate/fees-funding/search-funding/?code=ph044', 'https://www.bcu.ac.uk/research/our-phds/phd-opportunities/the-impact-of-alternative-active-travel-on-peoples-sport-health-and-mobile-lives', 'https://efzu.fa.em2.oraclecloud.com/hcmUI/CandidateExperience/en/sites/CX_1/job/3173/', 'https://kth.varbi.com/en/what:job/jobID:714397/type:job/where:51/apply:1', 'https://vacatures.uva.nl/UvA/job/PhD-Biodiverse-Urban-Waterfronts/792378702/', 'https://www.kth.se/lediga-jobb/706790?l=en', 'https://www.ru.nl/en/working-at/job-opportunities/phd-candidate-spatial-planning-and-flood-risk-management', 'https://www.ncl.ac.uk/postgraduate/fees-funding/search-funding/?code=ph047', 'https://www.academictransfer.com/en/338098/phd-position-in-public-economics-of-housing-10-fte/?utm_source=ATemailalert&utm_medium=email&utm_campaign=job_click', 'https://www.uu.nl/en/organisation/working-at-utrecht-universi

In [15]:
# Load the text that was scraped from every page.

folder_path = "doc_and_postdoc/doc"

txt_contents = []
txt_summarized_list = []

# os.listdir gives no ordering guarantee. Sort the names so the texts come
# back in page-number order (page_0000, page_0001, ...), which is the order
# the valid URL list was built in and is later indexed by position.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".txt"):
        filepath = os.path.join(folder_path, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            txt_contents.append(file.read())

print(len(txt_contents))

18


In [16]:
# Print the character count of every page; a count that is far too small or
# far too large suggests the scrape of that page went wrong.
for page_text in txt_contents:
    print(len(page_text))

7648
4323
5095
8097
7570
9955
7894
10442
3520
4217
7482
3893
8675
9259
10041
9161
3784
4073


In [19]:

# Summarize every scraped page with GPT and save one summary file per page.
summary_dir = 'doc_and_postdoc/doc/summary'
# No visible cell creates the summary folder, so make sure it exists before
# the first write instead of failing after an API call has been made.
os.makedirs(summary_dir, exist_ok=True)

total = len(txt_contents)
if total != len(valid_page):
    # Texts and URLs are paired by position; a count mismatch means the
    # links appended below may not belong to their summaries.
    print(f"Warning: {total} text files but {len(valid_page)} valid URLs; "
          "page links may not match their summaries.")

for i, page_text in enumerate(txt_contents):
    prompt = f"""
    
    You are helping your chinese clients to quickly extract information from PhD positions from website. 
    Please summarize them with the following format below:
    ```
    岗位名称：
    岗位类型：
    岗位信息概述：
    雇主：
    所属系/学院/研究组：
    研究关键词：
    相关背景与技能要求：
    岗位时长与薪资：
    申请截止日期：
    ```
    And here is the example: 
    ```
    岗位名称：城市交通规划方向博士
    
    岗位类型：博士
    
    岗位信息概述： 此博士职位旨在通过设计创新方法和算法来提高共享出行系统的操作效率。研究重点包括匹配、定价、车辆重定位和路线规划等关键优化决策，以应对这些系统中供需的随机性、非稳定性，以及操作决策的时间和地点依赖性。成功的研究将有助于实现更高效、可持续和公平的交通未来。

    雇主： Delft University of Technology（代尔夫特理工大学）
    所属系/学院/研究组： Faculty Civil Engineering & Geosciences（土木工程与地球科学学院）
    研究关键词： 共享出行系统，操作效率优化，匹配算法，定价策略，车辆重定位，路线规划

    相关背景与技能要求：
    应用数学、运筹学、计算机科学、工业工程或交通工程等领域的硕士学位，具备强大的优化和数学建模背景。
    对技术思维、与实践密切合作的科学研究、学习新工具充满热情，精通编程语言（首选Python）。
    岗位时长与薪资： 四年，工资首年为每月EUR 2,770（税前），并逐年增长，将在最后一年增至EUR 3,539（税前）。此外还提供福利待遇，包括8%的假期津贴和8.3%的年终奖金。

    申请截止日期： 2023/11/12
    ```
 
    Be aware:
    Do not copy the example.
    For the "岗位名称", you should summarize it into around 10-15 Chinese words, and the position name in original language if avaliable.
    For the "雇主" and the "所属系/学院/研究组" , you should provide the both original language and Chinese.
    For the "岗位信息概述" and the "相关背景与技能要求" part, it should be less than around 150 Chinese charactors.
    
    The information is download from website, and there are many not related information will disturb and confuse you.
    Please distinguish what are useful information for PhD/Post-Doc positions, and what are the useless infromation and element from website.
    Please note that, Some websites are invalid, please do not make the information up.

    Here is the PhD/Post-Doc positions information ```{page_text}```
    """

    print(f"processing:{i + 1}/{total}")

    # Resolve the page link before calling the API, so a missing URL fails
    # before a request is spent on this page.
    page_link = valid_page[i]

    response = get_response(prompt)
    head_text = f"[{i + 1}]\n\n"
    end_text = "\n\n" + "网页链接： " + page_link + "\n\n\n\n"

    # Numbered header + model summary + source link.
    this_Text = head_text + response + end_text

    txt_summarized_list.append(this_Text)

    # One output file per page, named after the zero-based index.
    file_name = f'gpt_summarized_{i}.txt'
    file_path = os.path.join(summary_dir, file_name)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(this_Text)

    print("----done")

print("All done")

processing:1/18
----done
processing:2/18
----done
processing:3/18
----done
processing:4/18
----done
processing:5/18
----done
processing:6/18
----done
processing:7/18
----done
processing:8/18
----done
processing:9/18
----done
processing:10/18
----done
processing:11/18
----done
processing:12/18
----done
processing:13/18
----done
processing:14/18
----done
processing:15/18
----done
processing:16/18
----done
processing:17/18
----done
processing:18/18
----done
All done
