In [None]:
import json, ast
from collections import Counter

import pandas as pd

In [43]:
# Directory holding the three input CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = 'C:/Users/29400/Desktop/data'

# Load the full dataset plus the two subsets stored alongside it.
original_df = pd.read_csv(f'{DATA_DIR}/original_data.csv')
ai_df = pd.read_csv(f'{DATA_DIR}/ai.csv')
non_ai_df = pd.read_csv(f'{DATA_DIR}/non_ai.csv')

In [49]:
def parse_languages(languages):
    """Normalize one raw ``languages`` cell into a collection of language names.

    Parameters
    ----------
    languages : Any
        The raw cell value. Strings are parsed first as JSON and, if that
        fails, as a Python literal. Missing scalars (``None``, ``NaN``, ...)
        become an empty dict. Any other non-string value is returned unchanged.

    Returns
    -------
    dict, list, tuple or set for parsed strings (``{}`` when the string cannot
    be parsed or does not describe a collection); ``{}`` for missing values;
    the input itself for non-string, non-missing values.
    """
    # Only scalars can be "missing". pd.isna on a list/array returns an
    # element-wise array whose truth value is ambiguous and raises ValueError.
    if pd.api.types.is_scalar(languages) and pd.isna(languages):
        return {}

    # Already parsed (e.g. the cell was re-run): hand it back untouched.
    if not isinstance(languages, str):
        return languages

    try:
        parsed = json.loads(languages)  # strict JSON first
    except json.JSONDecodeError:
        try:
            # Fallback for Python-literal syntax such as single-quoted keys.
            parsed = ast.literal_eval(languages)
        except Exception as e:
            print(f"Skipping invalid data: {languages} - Error: {e}")
            return {}  # best effort: unparsable text counts as "no languages"

    # Inputs like "null", "123" or "'Python'" parse successfully but are not
    # collections of language names; iterating them later would either fail
    # or count individual characters.
    if not isinstance(parsed, (dict, list, tuple, set)):
        print(f"Skipping invalid data: {languages} - Error: parsed value is not a collection")
        return {}
    return parsed


# Replace the raw 'languages' column of every dataset with its parsed form.
for _frame in (original_df, ai_df, non_ai_df):
    _frame['languages'] = _frame['languages'].apply(parse_languages)

In [50]:
# Fraction of each dataset, ranked by stars, that is kept as "popular".
popular_threshold = 0.3

# Full dataset: rank by stars (highest first), then keep the leading 30%.
original_df_sorted = original_df.sort_values('stars', ascending=False)
top_30_percent_count_original = int(popular_threshold * len(original_df))
popular_original_df = original_df_sorted.head(top_30_percent_count_original)

# AI subset: same ranking and cut-off.
ai_df_sorted = ai_df.sort_values('stars', ascending=False)
top_30_percent_count_ai = int(popular_threshold * len(ai_df))
popular_ai_df = ai_df_sorted.head(top_30_percent_count_ai)

# Non-AI subset: same ranking and cut-off.
non_ai_df_sorted = non_ai_df.sort_values('stars', ascending=False)
top_30_percent_count_non_ai = int(popular_threshold * len(non_ai_df))
popular_non_ai_df = non_ai_df_sorted.head(top_30_percent_count_non_ai)

In [59]:
def calculate_languages(df: pd.DataFrame) -> dict:
    """Count in how many rows of ``df`` each language appears.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``'languages'`` column whose cells are iterables of
        language names. For dict cells only the keys are used; the values
        are ignored.

    Returns
    -------
    dict
        Maps each language name to the number of times it was yielded while
        iterating the ``'languages'`` cells, with keys in first-seen order.
    """
    # A frame without rows has nothing to count, even if the column is absent.
    if len(df) == 0:
        return {}

    # Iterating a dict yields its keys, so each row contributes one count per
    # language it lists. Do not pass the dicts to Counter.update directly:
    # that would add their values instead of counting keys.
    counts = Counter(
        language
        for languages in df['languages']
        for language in languages
    )
    return dict(counts)


# Per-language counts for the popular slice of each dataset, computed in the
# order: full dataset, AI subset, non-AI subset.
original_languages_count, ai_languages_count, non_ai_languages_count = (
    calculate_languages(popular_frame)
    for popular_frame in (popular_original_df, popular_ai_df, popular_non_ai_df)
)

In [60]:
# Turn each count mapping into a list of (language, count) pairs ordered from
# the most to the least frequent language.
sorted_original_languages_count, sorted_ai_languages_count, sorted_non_ai_languages_count = (
    sorted(language_counts.items(), key=lambda pair: pair[1], reverse=True)
    for language_counts in (original_languages_count, ai_languages_count, non_ai_languages_count)
)

[('Shell', 20610),
 ('JavaScript', 19531),
 ('Python', 16263),
 ('HTML', 15536),
 ('CSS', 11792),
 ('Makefile', 9640),
 ('Dockerfile', 8339),
 ('C', 7951),
 ('C++', 6905),
 ('Java', 6509),
 ('TypeScript', 6471),
 ('Ruby', 5008),
 ('Go', 4359),
 ('Batchfile', 4137),
 ('CMake', 3800),
 ('Objective-C', 3419),
 ('PHP', 2821),
 ('C#', 2752),
 ('SCSS', 2747),
 ('Swift', 2281),
 ('Rust', 2110),
 ('Jupyter Notebook', 2060),
 ('PowerShell', 1925),
 ('Kotlin', 1800),
 ('Vue', 1631),
 ('Perl', 1563),
 ('Assembly', 1371),
 ('Lua', 1304),
 ('Roff', 1260),
 ('Less', 916),
 ('M4', 887),
 ('Objective-C++', 816),
 ('Smarty', 808),
 ('GLSL', 804),
 ('TeX', 782),
 ('Vim Script', 703),
 ('Nix', 699),
 ('Cuda', 689),
 ('Starlark', 642),
 ('XSLT', 546),
 ('Dart', 532),
 ('Scala', 525),
 ('Groovy', 502),
 ('Emacs Lisp', 495),
 ('PLpgSQL', 468),
 ('Handlebars', 467),
 ('HCL', 454),
 ('MDX', 450),
 ('R', 450),
 ('CoffeeScript', 436),
 ('MATLAB', 412),
 ('Yacc', 412),
 ('Jinja', 392),
 ('EJS', 386),
 ('Procfile