In [1]:
import pandas as pd
import json, ast

In [2]:
# Directory that holds the three input CSV files. Change this single value to
# run the notebook against data stored somewhere else.
DATA_DIR = 'C:/Users/29400/Desktop/data'

original_df = pd.read_csv(f'{DATA_DIR}/original_data.csv')
ai_df = pd.read_csv(f'{DATA_DIR}/ai.csv')
non_ai_df = pd.read_csv(f'{DATA_DIR}/non_ai.csv')

In [3]:
def parse_languages(languages):
    """Convert one raw ``languages`` cell into a dict.

    Args:
        languages: The cell value. Strings are decoded first as JSON and, if
            that fails, as a Python literal. Dicts are returned untouched.
            Missing values (``None``, NaN, ``pd.NA``) become an empty dict.

    Returns:
        The decoded dict, or ``{}`` when the value is missing, cannot be
        parsed, or parses to something that is not a dict. Non-string,
        non-missing values of any other type are returned unchanged.
    """
    # Already-decoded values pass straight through, so applying this function
    # to a column a second time is harmless.
    if isinstance(languages, dict):
        return languages

    if isinstance(languages, str):
        try:
            parsed = json.loads(languages)  # try strict JSON first
        except json.JSONDecodeError:
            try:
                # Fall back to Python literal syntax (e.g. single-quoted keys).
                parsed = ast.literal_eval(languages)
            except Exception as e:
                print(f"Skipping invalid data: {languages} - Error: {e}")
                return {}  # unparseable -> empty dict
        # Inputs such as "null", "123", "NaN" or "[...]" parse successfully but
        # do not yield a dict; treat them like invalid data.
        if not isinstance(parsed, dict):
            print(
                f"Skipping invalid data: {languages} - "
                f"Error: expected a dict, got {type(parsed).__name__}"
            )
            return {}
        return parsed

    # pd.isna returns an array for list-like input, which cannot be used in a
    # boolean test, so only scalars are checked for missingness.
    if pd.api.types.is_scalar(languages) and pd.isna(languages):
        return {}  # missing value -> empty dict
    return languages


# Decode the 'languages' column of each dataset with parse_languages.
for frame in (original_df, ai_df, non_ai_df):
    frame['languages'] = frame['languages'].apply(parse_languages)

In [4]:
popular_threshold = 0.3

# For each dataset: order by stars (highest first), work out how many rows the
# threshold corresponds to, and keep that many rows from the top.

# Original dataset
original_df_sorted = original_df.sort_values('stars', ascending=False)
top_30_percent_count_original = int(len(original_df) * popular_threshold)
popular_original_df = original_df_sorted.head(top_30_percent_count_original)

# AI dataset
ai_df_sorted = ai_df.sort_values('stars', ascending=False)
top_30_percent_count_ai = int(len(ai_df) * popular_threshold)
popular_ai_df = ai_df_sorted.head(top_30_percent_count_ai)

# Non-AI dataset
non_ai_df_sorted = non_ai_df.sort_values('stars', ascending=False)
top_30_percent_count_non_ai = int(len(non_ai_df) * popular_threshold)
popular_non_ai_df = non_ai_df_sorted.head(top_30_percent_count_non_ai)

In [5]:
def calculate_languages(df: pd.DataFrame) -> dict:
    """Count how many rows of ``df`` mention each language.

    Args:
        df: Frame with a ``languages`` column whose cells are iterables of
            language names (iterating a dict yields its keys).

    Returns:
        A dict mapping each language to the number of rows it appears in.
        Keys are in first-seen order.
    """
    languages_count = {}
    # Walk the column itself; iterrows() would build a whole Series for every
    # row only to read this one cell.
    for languages in df['languages']:
        for language in languages:
            languages_count[language] = languages_count.get(language, 0) + 1
    return languages_count


# Run calculate_languages over the three popular subsets and then the three
# full datasets, binding the results in that same order.
(
    top_original_languages_count,
    top_ai_languages_count,
    top_non_ai_languages_count,
    original_language_count,
    ai_language_count,
    non_ai_language_count,
) = map(
    calculate_languages,
    (
        popular_original_df,
        popular_ai_df,
        popular_non_ai_df,
        original_df,
        ai_df,
        non_ai_df,
    ),
)

In [6]:
def _rank_by_count(language_counts):
    """Return the (language, count) pairs ordered from highest count to lowest."""
    return sorted(language_counts.items(), key=lambda pair: pair[1], reverse=True)


# Complete rankings for the popular subsets and for the full datasets.
sorted_top_original_languages_count = _rank_by_count(top_original_languages_count)
sorted_top_ai_languages_count = _rank_by_count(top_ai_languages_count)
sorted_top_non_ai_languages_count = _rank_by_count(top_non_ai_languages_count)
sorted_original_language_count = _rank_by_count(original_language_count)
sorted_ai_language_count = _rank_by_count(ai_language_count)
sorted_non_ai_languages_count = _rank_by_count(non_ai_language_count)

# First ten entries of each ranking.
top10_sorted_top_original_languages = sorted_top_original_languages_count[:10]
top10_sorted_top_ai_languages = sorted_top_ai_languages_count[:10]
top10_sorted_top_non_ai_languages = sorted_top_non_ai_languages_count[:10]
top10_sorted_original_languages = sorted_original_language_count[:10]
top10_sorted_ai_languages = sorted_ai_language_count[:10]
top10_sorted_non_ai_languages = sorted_non_ai_languages_count[:10]

In [7]:
def print_language_summary(title, df, top_languages):
    """Print a heading, the row count of ``df`` and one line per language.

    Args:
        title: Heading printed first (may begin with a newline to separate
            this section from the previous one).
        df: Frame whose length is reported and used as the denominator of
            the percentages.
        top_languages: Iterable of ``(language, count)`` pairs to list.
    """
    total = len(df)
    print(title)
    print(f"Total Data Entries: {total}")
    for language, count in top_languages:
        print(f"{language:<20}{count:<10}{count / total * 100:.2f}%")


# Full datasets.
print_language_summary("Original Data", original_df, top10_sorted_original_languages)
print_language_summary("\nAI Data", ai_df, top10_sorted_ai_languages)
print_language_summary("\nNon-AI Data", non_ai_df, top10_sorted_non_ai_languages)

# Popular subsets. Each section is measured against its own subset; the Non-AI
# section previously reported and divided by len(non_ai_df), the full dataset.
top_label = f"Top {popular_threshold * 100:.0f}%"
print_language_summary(
    f"\n{top_label} Original Data", popular_original_df, top10_sorted_top_original_languages
)
print_language_summary(
    f"\n{top_label} AI Data", popular_ai_df, top10_sorted_top_ai_languages
)
print_language_summary(
    f"\n{top_label} Non-AI Data", popular_non_ai_df, top10_sorted_top_non_ai_languages
)

Original Data
Total Data Entries: 185876
JavaScript          60263     32.42%
Shell               55741     29.99%
Python              49173     26.45%
HTML                45123     24.28%
CSS                 35014     18.84%
Makefile            26761     14.40%
C                   23646     12.72%
C++                 20895     11.24%
Java                20391     10.97%
Dockerfile          18267     9.83%

AI Data
Total Data Entries: 6545
Python              4855      74.18%
Shell               2356      36.00%
Jupyter Notebook    1531      23.39%
Dockerfile          1059      16.18%
HTML                972       14.85%
Makefile            914       13.96%
JavaScript          888       13.57%
C++                 875       13.37%
CSS                 742       11.34%
C                   568       8.68%

Non-AI Data
Total Data Entries: 179331
JavaScript          59375     33.11%
Shell               53385     29.77%
Python              44318     24.71%
HTML                44151     24.62%

In [8]:
# Show original_df as this cell's output; its 'languages' column was replaced
# with the parse_languages results in an earlier cell.
original_df

Unnamed: 0.1,Unnamed: 0,url,name,owner,forks,stars,languages,commits,creation_date,contributors,...,releases,pull_requests,readme_size,commits_freq,releases_freq,lines_of_codes,popularity_score_1,popularity_score_2,popularity_score_3,releases_count
0,0,https://api.github.com/repos/0-1-0/lightblue-0.4,lightblue-0.4,0-1-0,91,95,"{'Python': 415988, 'Objective-C': 169036, 'C':...","['2020-10-18T21:26:07Z', '2015-02-11T14:38:50Z...",2012-09-24T02:58:25Z,4,...,[],6,3582,8,0,686294,222,50,186,0
1,1,https://api.github.com/repos/0-8-4/miui-auto-t...,miui-auto-tasks,0-8-4,180,991,"{'Python': 61598, 'Shell': 1087, 'Dockerfile':...","['2025-01-27T05:35:37Z', '2025-01-25T14:24:35Z...",2021-08-16T10:04:15Z,17,...,"[{'name': 'v1.8.2-hotfix1', 'date': '2025-01-2...",113,3901,30,51,63304,13940,12776,1171,51
2,2,https://api.github.com/repos/00-Evan/shattered...,shattered-pixel-dungeon,00-Evan,1158,4936,"{'Java': 6219833, 'Shell': 1977}","['2024-10-09T16:48:17Z', '2024-10-08T18:41:17Z...",2014-07-31T21:56:31Z,2,...,"[{'name': 'v3.0.0-BETA-2.1', 'date': '2024-12-...",46,2467,30,49,6221810,8210,2222,6094,49
3,3,https://api.github.com/repos/0015/ThatProject,ThatProject,0015,947,1921,"{'C': 41821057, 'C++': 1327597, 'Dart': 510279...","['2024-07-10T06:52:50Z', '2024-06-09T07:27:22Z...",2019-07-04T08:16:56Z,2,...,[],34,27483,30,0,43896590,4024,1285,2868,0
4,4,https://api.github.com/repos/007revad/Synology...,Synology_HDD_db,007revad,195,2913,{'Shell': 86699},"['2025-01-23T06:12:05Z', '2025-01-23T06:11:28Z...",2023-02-26T02:59:13Z,11,...,"[{'name': 'v3.6.110', 'date': '2024-12-31T10:1...",226,16443,30,88,86699,54184,51127,3108,88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185871,185871,https://api.github.com/repos/zzzprojects/Syste...,System.Linq.Dynamic.Core,zzzprojects,230,1597,"{'C#': 1751766, 'HTML': 38859, 'CSS': 26919, '...","['2025-01-19T07:29:42Z', '2024-12-25T08:10:24Z...",2016-04-08T16:41:51Z,30,...,"[{'name': 'v.1.5.1', 'date': '2024-12-22T10:28...",318,6902,30,54,1819025,102951,101176,1827,54
185872,185872,https://api.github.com/repos/zzzprojects/Syste...,System.Linq.Dynamic,zzzprojects,194,424,{},"['2024-03-20T20:53:54Z', '2021-04-06T14:35:00Z...",2011-12-12T19:01:13Z,11,...,[],41,2342,30,0,0,2299,1738,618,0
185873,185873,https://api.github.com/repos/zzzprojects/Z.Ext...,Z.ExtensionMethods,zzzprojects,327,1623,"{'C#': 2614721, 'Visual Basic .NET': 1411898}","['2024-03-20T20:50:19Z', '2023-07-05T14:46:07Z...",2015-03-03T14:49:23Z,3,...,"[{'name': 'v2.1.1', 'date': '2019-01-12T18:26:...",15,3996,30,16,4026619,2175,333,1950,16
185874,185874,https://api.github.com/repos/zzzzbw/Fame,Fame,zzzzbw,72,201,"{'Java': 215298, 'Vue': 121055, 'JavaScript': ...","['2022-12-20T14:34:51Z', '2022-12-14T12:38:56Z...",2017-08-07T13:44:22Z,3,...,"[{'name': '', 'date': '2022-12-20T10:31:45Z'},...",14,2431,30,3,391989,469,210,273,3
