In [12]:
import json

In [13]:
def clean_company_data(data):
    """Normalize scraped company records into a fixed set of fields.

    Every input record may carry ``company`` and ``url`` plus any number of
    free-form ``info-*`` entries. The ``info-*`` values are classified with
    simple text heuristics:

    * ``website``   - first value containing a domain/URL marker;
    * ``employees`` - first value containing "employees";
    * ``type``      - first value containing "type:";
    * ``revenue``   - first value containing "revenue:";
    * ``location``  - first value among the first three ``info-*`` entries
      that contains a comma and was not already classified above;
    * ``industry``  - the last (or, failing that, second-to-last) ``info-*``
      value that is unused and contains no known non-industry keyword.

    Args:
        data: Iterable of dicts (one per company). ``info-*`` values are
            converted with ``str``; ``None`` is treated as an empty string.

    Returns:
        A list with one dict per input record, each having the keys
        ``company``, ``url``, ``website``, ``location``, ``employees``,
        ``type``, ``revenue`` and ``industry``. Fields that could not be
        detected are left as empty strings.
    """
    # Loop-invariant marker lists, built once instead of once per value.
    website_markers = (".com", ".net", ".org", ".edu", "www.", "http")
    non_industry = ("revenue:", "founded", "type", "employees", "awards")
    classified_fields = ("website", "employees", "type", "revenue")

    cleaned = []

    for company in data:
        cleaned_company = {
            "company": company.get("company", ""),
            "url": company.get("url", ""),
            "website": "",
            "location": "",
            "employees": "",
            "type": "",
            "revenue": "",
            "industry": ""
        }

        # Collect every info-* value in record order. A null becomes "" so
        # positions are preserved and the literal text "None" can never be
        # reported as a field value.
        info_values = [
            "" if company[key] is None else str(company[key])
            for key in company
            if key.startswith("info-")
        ]

        # First pass: fields that can be recognized by explicit markers.
        # Each value fills at most one field, and each field keeps the first
        # value that matched it.
        for value in info_values:
            value_lower = value.lower()

            # website - contains a domain suffix or URL prefix
            if not cleaned_company["website"] and any(marker in value_lower for marker in website_markers):
                cleaned_company["website"] = value

            # employees - contains the word "employees"
            elif not cleaned_company["employees"] and "employees" in value_lower:
                cleaned_company["employees"] = value

            # type - contains "type:"
            elif not cleaned_company["type"] and "type:" in value_lower:
                cleaned_company["type"] = value

            # revenue - contains "revenue:"
            elif not cleaned_company["revenue"] and "revenue:" in value_lower:
                cleaned_company["revenue"] = value

        # Location: first comma-containing value among the first three
        # entries. Values already classified above are excluded so that, for
        # example, "1,001 to 5,000 Employees" is not mistaken for a place.
        claimed = {cleaned_company[field] for field in classified_fields}
        for value in info_values[:3]:
            if "," in value and value not in claimed:
                cleaned_company["location"] = value
                break

        # Industry: only the last two entries are candidates, checked from
        # the end. Skip anything already used for another field (this also
        # skips empty strings) and anything that contains a keyword of a
        # known non-industry entry.
        for value in reversed(info_values[-2:]):
            if value in cleaned_company.values():
                continue

            value_lower = value.lower()
            if not any(keyword in value_lower for keyword in non_industry):
                cleaned_company["industry"] = value
                break

        cleaned.append(cleaned_company)

    return cleaned

In [14]:
# Raw scrape to read, and the file the cleaned records will be written to.
input_file = 'C:\\Users\\Pratt\\Desktop\\HKUST-RA\\[-Ongoing-] Text analysis in Glassdoor data\\Page-Source-Scraping\\Page_Scrape_Check_Trans\\Urls-1.1.json'
output_file = 'C:\\Users\\Pratt\\Desktop\\HKUST-RA\\[-Ongoing-] Text analysis in Glassdoor data\\Page-Source-Scraping\\Page_Scrape_Formatting_Trans\\Urls-1.1.json'

# Load the scraped company records from the input JSON file.
with open(input_file, mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

# Normalize every record into the fixed output schema.
cleaned_data = clean_company_data(data)

In [15]:
# Write the cleaned records to the output file as indented JSON, keeping
# non-ASCII characters as-is rather than escaping them.
with open(output_file, mode='w', encoding='utf-8') as f:
    f.write(json.dumps(cleaned_data, ensure_ascii=False, indent=2))