In [None]:
import pandas as pd
import json, ast

In [43]:
# Single place that points the notebook at the dataset folder: change this one
# value when running on another machine instead of editing every read below.
DATA_DIR = 'C:/Users/29400/Desktop/data'

# Load the three CSV files used throughout the notebook.
original_df = pd.read_csv(f'{DATA_DIR}/original_data.csv')
ai_df = pd.read_csv(f'{DATA_DIR}/ai.csv')
non_ai_df = pd.read_csv(f'{DATA_DIR}/non_ai.csv')

In [49]:
def parse_languages(languages):
    """Convert one raw ``languages`` cell into a Python object.

    Parameters
    ----------
    languages : Any
        A single cell value: a string holding JSON or a Python literal,
        a missing value (None / NaN / pd.NA), or an already-parsed object.

    Returns
    -------
    object
        * for a string: the result of ``json.loads`` or, if that fails,
          of ``ast.literal_eval``; ``{}`` when neither can parse it
          (a message is printed in that case);
        * for a missing scalar: ``{}``;
        * for anything else (dict, list, ...): the input itself, unchanged.
    """
    if isinstance(languages, str):
        try:
            return json.loads(languages)  # try strict JSON first
        except json.JSONDecodeError:
            try:
                # Fall back to Python literal syntax (e.g. single-quoted keys).
                return ast.literal_eval(languages)
            except Exception as e:
                print(f"Skipping invalid data: {languages} - Error: {e}")
                return {}  # unparseable text is treated as "no languages"

    # Only ask pd.isna about scalars: for list-like input it returns an
    # element-wise array, and using that array in `if` raises ValueError.
    if pd.api.types.is_scalar(languages) and pd.isna(languages):
        return {}  # missing value -> empty dict

    return languages  # already parsed; hand it back untouched


# Replace the raw `languages` column of each frame with its parsed form.
for frame in (original_df, ai_df, non_ai_df):
    frame['languages'] = frame['languages'].apply(parse_languages)

In [50]:
# Fraction of each frame (ranked by stars) that is kept as the "popular" subset.
popular_threshold = 0.3

# Number of rows to keep per frame; int() truncates toward zero.
(
    top_30_percent_count_original,
    top_30_percent_count_ai,
    top_30_percent_count_non_ai,
) = (
    int(len(frame) * popular_threshold)
    for frame in (original_df, ai_df, non_ai_df)
)

# Rank every frame by its `stars` column, highest first.
original_df_sorted, ai_df_sorted, non_ai_df_sorted = (
    frame.sort_values(by='stars', ascending=False)
    for frame in (original_df, ai_df, non_ai_df)
)

# Keep only the leading rows of each ranking.
popular_original_df, popular_ai_df, popular_non_ai_df = (
    ranked.head(keep)
    for ranked, keep in (
        (original_df_sorted, top_30_percent_count_original),
        (ai_df_sorted, top_30_percent_count_ai),
        (non_ai_df_sorted, top_30_percent_count_non_ai),
    )
)

In [69]:
def calculate_languages(df) -> dict:
    """Count in how many rows each language appears.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``languages`` column whose cells are iterables of
        language names (iterating a dict yields its keys).

    Returns
    -------
    dict
        Maps each language to the number of times it was yielded while
        iterating the cells. Keys appear in order of first occurrence.
    """
    languages_count = {}
    # Iterate the column directly: iterrows() would build a full Series for
    # every row just to read this one field.
    for languages in df['languages']:
        for language in languages:
            languages_count[language] = languages_count.get(language, 0) + 1
    return languages_count


# Language tallies for the popular subsets first ...
top_original_languages_count, top_ai_languages_count, top_non_ai_languages_count = map(
    calculate_languages,
    (popular_original_df, popular_ai_df, popular_non_ai_df),
)
# ... then for the complete frames.
original_language_count, ai_language_count, non_ai_language_count = map(
    calculate_languages,
    (original_df, ai_df, non_ai_df),
)

In [84]:
def _ranked_by_count(counts):
    """Return the (language, count) pairs of `counts`, largest count first."""
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


# Full rankings, one per tally.
sorted_top_original_languages_count = _ranked_by_count(top_original_languages_count)
sorted_top_ai_languages_count = _ranked_by_count(top_ai_languages_count)
sorted_top_non_ai_languages_count = _ranked_by_count(top_non_ai_languages_count)
sorted_original_language_count = _ranked_by_count(original_language_count)
sorted_ai_language_count = _ranked_by_count(ai_language_count)
sorted_non_ai_languages_count = _ranked_by_count(non_ai_language_count)

# First ten entries of every ranking.
(
    top10_sorted_top_original_languages,
    top10_sorted_top_ai_languages,
    top10_sorted_top_non_ai_languages,
    top10_sorted_original_languages,
    top10_sorted_ai_languages,
    top10_sorted_non_ai_languages,
) = (
    ranking[:10]
    for ranking in (
        sorted_top_original_languages_count,
        sorted_top_ai_languages_count,
        sorted_top_non_ai_languages_count,
        sorted_original_language_count,
        sorted_ai_language_count,
        sorted_non_ai_languages_count,
    )
)

In [104]:
# Heading prefix for the sections that describe the popular subsets.
top_label = f"Top {popular_threshold * 100:.0f}%"

# One entry per report section: heading, the frame whose row count is the
# denominator for the percentages, and that frame's ten most frequent languages.
report_sections = [
    ("Original Data", original_df, top10_sorted_original_languages),
    ("AI Data", ai_df, top10_sorted_ai_languages),
    ("Non-AI Data", non_ai_df, top10_sorted_non_ai_languages),
    (f"{top_label} Original Data", popular_original_df, top10_sorted_top_original_languages),
    (f"{top_label} AI Data", popular_ai_df, top10_sorted_top_ai_languages),
    # The denominator must be the popular subset, not the whole non-AI frame,
    # so the total and percentages refer to the rows that were actually counted.
    (f"{top_label} Non-AI Data", popular_non_ai_df, top10_sorted_top_non_ai_languages),
]

for section_index, (heading, frame, top_languages) in enumerate(report_sections):
    # Sections are separated by a blank line; the first one has none before it.
    print(heading if section_index == 0 else f"\n{heading}")
    total_entries = len(frame)
    print(f"Total Data Entries: {total_entries}")
    for language, count in top_languages:
        # Columns: language name, number of rows listing it, share of all rows.
        print(f"{language:<20}{count:<10}{count / total_entries * 100:.2f}%")

Original Data
Total Data Entries: 185876
JavaScript          60263     32.42%
Shell               55741     29.99%
Python              49173     26.45%
HTML                45123     24.28%
CSS                 35014     18.84%
Makefile            26761     14.40%
C                   23646     12.72%
C++                 20895     11.24%
Java                20391     10.97%
Dockerfile          18267     9.83%

AI Data
Total Data Entries: 6545
Python              4855      74.18%
Shell               2356      36.00%
Jupyter Notebook    1531      23.39%
Dockerfile          1059      16.18%
HTML                972       14.85%
Makefile            914       13.96%
JavaScript          888       13.57%
C++                 875       13.37%
CSS                 742       11.34%
C                   568       8.68%

Non-AI Data
Total Data Entries: 179331
JavaScript          59375     33.11%
Shell               53385     29.77%
Python              44318     24.71%
HTML                44151     24.62%