In [1]:
import re
import json

# First - Find Non related Urls

In [2]:
def convert_glassdoor_url(url):
    # 检查是否为需要转换的Reviews格式
    if '/Reviews/' in url:
        # 提取公司名称
        # 从URL中找到公司名称部分
        company_pattern = r'/Reviews/(.*?)-Reviews-E(\d+)\.htm'
        match = re.search(company_pattern, url)
        
        if match:
            company_name = match.group(1)
            company_id = match.group(2)
            # 构建新的URL格式
            new_url = f'https://www.glassdoor.com/Overview/Working-at-{company_name}-EI_IE{company_id}.11.htm'
            return new_url
    # 如果已经是正确的格式，直接返回
    return url

def process_url_list(company_url_list):
    processed_list = []
    
    for item in company_url_list:
        company = item.get('company', '')
        original_url = item.get('url', '')
        
        # 转换URL
        converted_url = convert_glassdoor_url(original_url)
        
        # 添加到新列表
        processed_list.append({
            'company': company,
            'url': converted_url
        })
        
        # 打印转换信息（可选）
        if original_url != converted_url:
            print(f"转换: {company}")
            print(f"  原始: {original_url}")
            print(f"  转换后: {converted_url}")
            print()
    
    return processed_list

In [3]:
# 示例数据
# 1. 从JSON文件读取数据
input_file = 'C:\\Users\\Pratt\\Desktop\\HKUST-RA\\[-Ongoing-] Text analysis in Glassdoor data\\Url-Scrap-Method-1-Tansfer\\Urls-1.1.json'  # 输入文件名

try:
    # 读取JSON文件
    with open(input_file, 'r', encoding='utf-8') as f:
        company_url_list = json.load(f)
except FileNotFoundError:
    print(f"错误: 找不到文件 {input_file}")
    # 如果文件不存在，可以退出程序或使用默认数据
    raise

In [4]:
# 处理URL列表
processed_list = process_url_list(company_url_list)

# 输出结果
print("处理后的列表:")
for item in processed_list:
    print(f"{{'company': '{item['company']}', 'url': '{item['url']}'}}")

转换: 1-800-Pack-Rat LLC
  原始: https://www.glassdoor.com/Reviews/1-800-PACK-RAT-Reviews-E400743.htm
  转换后: https://www.glassdoor.com/Overview/Working-at-1-800-PACK-RAT-EI_IE400743.11.htm

处理后的列表:
{'company': '"Sales Manager" Software BV', 'url': 'https://www.glassdoor.com/Overview/Working-at-Sales-Manager-EI_IE5041627.11,24.htm'}
{'company': '& Other Stories AB & Co. KG', 'url': 'https://www.glassdoor.com/Overview/Working-at--and-Other-Stories-EI_IE1102105.11,29.htm'}
{'company': '&Pizza', 'url': 'https://www.glassdoor.com/Overview/Working-at--and-pizza-EI_IE1275069.11,21.htm'}
{'company': '1-800 Accountant LLC', 'url': 'https://www.glassdoor.com/Overview/Working-at-1-800Accountant-EI_IE784023.11,26.htm'}
{'company': '1-800 Contacts, Inc.', 'url': 'https://www.glassdoor.com/Overview/Working-at-1-800-Contacts-EI_IE7501.11,25.htm'}
{'company': '1-800 Radiator & A/C', 'url': 'https://www.glassdoor.com/Overview/Working-at-1-800-Radiator-EI_IE343140.11,25.htm'}
{'company': '1-800-FLOWERS.COM,

In [5]:
output_file = 'C:\\Users\\Pratt\\Desktop\\HKUST-RA\\[-Ongoing-] Text analysis in Glassdoor data\\Page-Source-Scraping\\Page_Url_Formatting_Trans\\Urls-1.1.json'  # 输出文件名
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(processed_list, f, ensure_ascii=False, indent=2)