@@ -0,0 +1,129 @@
import pandas as pd
import numpy as np
import datetime
import time
from github import Github, RateLimitExceededException
from ShareCode.pygit_func import github_ratelimit

# Original URL: https://github.com/tue-mdse/genderComputer
from genderComputer import GenderComputer
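
# A quick illustration (hypothetical values) of the resolver used below:
# GenderComputer.resolveGender(name, country) returns a string label such as
# "female" or "male", or None when the name cannot be resolved, e.g.
#   gc = GenderComputer()
#   gc.resolveGender("Ada Lovelace", "UK")  # -> "female", assuming the bundled name lists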

def extract_python_files(repo, contents):
    # Recursively collect the paths of .py files in the repository,
    # skipping dunder files such as __init__.py. `repo` is passed in
    # explicitly (the original relied on a global).
    python_list = []

    while contents:
        file_content = contents.pop(0)
        if file_content.path.endswith(".py") and "__" not in file_content.path:
            python_list.append(file_content.path)
        if file_content.type == "dir":
            contents.extend(repo.get_contents(file_content.path))

    return python_list
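
# Illustrative usage (hypothetical repository), mirroring the call in
# process_repo below; a copy of the root contents is passed in because the
# function consumes the list it is given:
#   repo = g.get_repo("owner/name")
#   extract_python_files(repo, list(repo.get_contents("")))  # -> e.g. ["setup.py", "src/models.py"]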

def process_repo(repo, gc, already_inc, fem_dp, repo_sample):
    # Two API calls to get the contributor count and the root contents
    contrib = repo.get_contributors(anon=True)
    num_contrib = contrib.totalCount
    files = repo.get_contents("")
    num_files = len(files)
    owner_type = repo.owner.type
    repo_name = repo.name

    login = repo.owner.login
    name = repo.owner.name
    local = repo.owner.location

    if ((already_inc["Repo Name"] == repo_name) & (already_inc["Repo Owner Login"] == login)).any():
        print("\n\tDatapoint already exists in dataset.")
        return fem_dp

    # Single-contributor repositories owned by individual users only
    if num_contrib == 1 and owner_type == "User":
        try:
            gender_name = gc.resolveGender(name, local)
            gender_login = gc.resolveGender(login, local)

            if gender_name == "female" or gender_login == "female":
                fem_dp += 1
                print("\n\t{0} Feminine Data Point Identified.".format(fem_dp))

            repo_dict = {"Repo Name": repo_name, "Repo Owner Login": login, "Repo Owner Name": name,
                         "Repo Owner ID": repo.owner.id, "Repo Owner Gender (Name)": gender_name,
                         "Repo Owner Gender (Login)": gender_login, "Repo Owner Location": local,
                         "Repo Owner Type": owner_type, "Repo Owner Bio": repo.owner.bio,
                         "Repo Owner Email": repo.owner.email, "Repo Owner Collaborators": repo.owner.collaborators,
                         "Repo Owner Followers": repo.owner.followers, "Repo Owner Following": repo.owner.following,
                         "Repo Created At": repo.created_at.strftime("%Y-%m-%d"),
                         "Repo Updated At": repo.updated_at.strftime("%Y-%m-%d"),
                         "Repo Description": repo.description, "Repo Language": repo.language,
                         "Repo Is Fork": repo.fork, "Repo Forks Count": repo.forks,
                         "Repo Organization": repo.organization, "Repo Labels": [i.name for i in repo.get_labels()],
                         "Num Contributors": num_contrib, "Contributors": contrib,
                         "Collaborators": repo.owner.collaborators, "Num Files": num_files,
                         "Num Commits": repo.get_commits().totalCount, "Contents": files,
                         # Pass a copy: extract_python_files consumes the list it is given
                         "Python Files": extract_python_files(repo, list(files)), "Pylint Scores": np.nan}

            repo_sample.append(repo_dict)
            save_repo_data(repo_sample)

        except Exception as e:
            # Owner metadata can be missing or an API call can fail; log and skip
            print("\tSkipping repo:", e)

    return fem_dp

def save_repo_data(repo_sample):
    # Persist the running sample; `start_date` is the module-level start date
    temp_df = pd.DataFrame(repo_sample)
    temp_df.to_csv("GitHub_Data_{0}.csv".format(start_date.strftime("%Y-%m-%d")), index=False)
    print("\n\tPython GitHub Repos Collected:", len(repo_sample))

# Initialize GitHub API instance (credentials redacted)
user_name = ""
client_secret = ""
client_id = ""
g = Github(client_id, client_secret, per_page=100)

# Initialize GenderComputer
gc = GenderComputer()

# Initialize date variables
start_date = datetime.datetime.strptime("2019-01-01", "%Y-%m-%d")
days_count = (datetime.datetime.now() - start_date).days

# Load existing data
already_inc = pd.read_csv("Already_Included.csv", index_col="Unnamed: 0")
already_inc = already_inc.drop_duplicates(subset=["Repo Name", "Repo Owner Login"])

# Initialize feminine data point counter
fem_dp = 0

# Initialize repo sample
repo_sample = []
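
# For reference, one day's search query below expands to something like
# (illustrative dates):
#   language:python forks:10..250 size:1000..1000000 stars:10..250 fork:false created:2019-01-01..2019-01-02
# The GitHub Search API returns at most 1,000 results per query, which the
# one-day windows presumably keep each query under.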

# Main loop: query one day of repository creation dates at a time
for dc in range(days_count):
    date_from = start_date + datetime.timedelta(days=dc)
    date_to = date_from + datetime.timedelta(days=1)

    print("\n---------------------------------------------------------\n", dc,
          "Querying Date Range:", str(date_from)[:10], "to", str(date_to)[:10])

    query = f"language:python forks:10..250 size:1000..1000000 stars:10..250 fork:false created:{str(date_from)[:10]}..{str(date_to)[:10]}"

    try:
        repositories = g.search_repositories(query=query)

        for repo in repositories:
            fem_dp = process_repo(repo, gc, already_inc, fem_dp, repo_sample)

    except RateLimitExceededException:
        # Sleep until the limit resets; note the interrupted day is not retried
        github_ratelimit(g)

print("Time Finished:", time.strftime("%l:%M%p %Z on %b %d, %Y"))
@@ -0,0 +1,87 @@
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import pandas as pd
import os
import glob

# Original URL: https://github.com/tue-mdse/genderComputer
from genderComputer import GenderComputer

# Set the working directory
os.chdir("/Users/BrookeSJ/PycharmProjects/Leverhulme/Final Data/")

# Load all CSV files into a single DataFrame
all_data_files = glob.glob("*.csv")
github_repos_df = pd.concat([pd.read_csv(file, index_col=None, header=0, engine="python") for file in all_data_files])

# Drop index-artifact columns (if present) and duplicates
github_repos_df = (github_repos_df.drop(["Unnamed: 0", "Unnamed: 0.1", "level_0"], axis=1, errors="ignore")
                   .drop_duplicates().reset_index(drop=True))

# Display user count
count_users = len(github_repos_df)
print("User Count:", count_users)

# Define a dictionary for leetspeak/punctuation normalisation
text_leet = {"8": "B", "5": "S", "0": "O", "|": "l", "1": "l", "7": "T",
             "4": "A", "£": "E", "$": "S", "€": "E", "¥": "Y", "3": "E",
             ".": " ", ",": " ", "@": " ", "-": " ", "_": ""}

# Extract the email prefix first (as a 3rd name source), before the
# normalisation below strips the "@" and "." these regexes rely on
github_repos_df["Repo Owner Email Prefix"] = github_repos_df["Repo Owner Email"].str.replace(r"(\d+)(?=@)", "", regex=True)
github_repos_df["Repo Owner Email Prefix"] = github_repos_df["Repo Owner Email Prefix"].str.extract(r"(\S*)(?=@)", expand=False)

# Apply literal (non-regex) replacement to the name-source columns
for col in ["Repo Owner Email Prefix", "Repo Owner Login"]:
    for k, v in text_leet.items():
        github_repos_df[col] = github_repos_df[col].str.replace(k, v, regex=False)
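
# Illustrative effect of the normalisation above on a hypothetical login:
#   "br00ke.5j" -> "brOOke Sj"
# i.e. leetspeak digits/symbols map back to letters and separators to spaces,
# so logins and email prefixes become name-like strings for GenderComputer.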

# Initialize GenderComputer
gc = GenderComputer()

# Clean data from reading in
github_repos_df = github_repos_df[github_repos_df["Repo Name"] != "/appengine/db.py"]
github_repos_df = github_repos_df.loc[:, ~github_repos_df.columns.str.contains("^Unnamed")]

# Initialize the geocoder (named to avoid shadowing the geopy package),
# with the rate limiter built once rather than inside the loop
geolocator = Nominatim(user_agent="http")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Iterate over rows and process gender information
for i, row in github_repos_df.iterrows():
    print("[Gender] Processing User:", i + 1, "/", count_users)

    # Normalise the free-text location via Nominatim (rate-limited above)
    if pd.notna(row["Repo Owner Location"]):
        try:
            location = geocode(row["Repo Owner Location"])
            if location is not None:
                github_repos_df.at[i, "Repo Owner Location"] = location.address
        except Exception:
            pass  # geocoding failures leave the raw location in place

    # Get gender from each name source, if possible; the short labels match
    # the column names written by the collection script
    sources = {"Name": "Repo Owner Name", "Login": "Repo Owner Login",
               "Email Prefix": "Repo Owner Email Prefix"}
    for label, col in sources.items():
        try:
            github_repos_df.at[i, f"Repo Owner Gender ({label})"] = gc.resolveGender(row[col], row["Repo Owner Location"])
        except Exception:
            pass  # unresolvable names are left blank

    # Save as we go
    github_repos_df.to_csv("GitHub_All_Repos_Gender.csv", index=False)

# Read the saved CSV
github_repos_df = pd.read_csv("GitHub_All_Repos_Gender.csv")

# Tabulate gender frequencies per name source
gender_columns = ["Repo Owner Gender (Name)", "Repo Owner Gender (Login)", "Repo Owner Gender (Email Prefix)"]
gender_sample = (github_repos_df[gender_columns]
                 .apply(lambda x: x.value_counts(dropna=False))
                 .T.rename_axis("Source").reset_index())
gender_sample.to_csv("GitHub_Repos_Gender_Sample.csv", index=False)

# Create a "Gender ID" column: the name-based label, falling back to the login-based one
github_repos_df["Gender ID"] = github_repos_df["Repo Owner Gender (Name)"].combine_first(github_repos_df["Repo Owner Gender (Login)"])
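
# combine_first fills the gaps row-wise, e.g. (illustrative):
#   Gender (Name):  ["female", NaN,    NaN]
#   Gender (Login): [NaN,      "male", NaN]
#   Gender ID:      ["female", "male", NaN]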

# Re-resolve gender from the email prefix for any rows still missing it
for i, row in github_repos_df.iterrows():
    if pd.notna(row["Repo Owner Email Prefix"]):
        try:
            print(row["Repo Owner Email Prefix"])
            github_repos_df.at[i, "Repo Owner Gender (Email Prefix)"] = gc.resolveGender(row["Repo Owner Email Prefix"], row["Repo Owner Location"])
        except Exception:
            pass

# Persist the email-prefix pass (the loop above otherwise leaves it in memory only)
github_repos_df.to_csv("GitHub_All_Repos_Gender.csv", index=False)
@@ -0,0 +1,76 @@
# Import libraries
from github import Github, GithubException
import pandas as pd
from ast import literal_eval
import time
from pygit_func_stats import pylint_test, github_ratelimit


# GitHub Credentials (redacted)
user_name = ""
client_secret = ""
client_id = ""

g = Github(client_id, client_secret, per_page=100)

github_repos_df = pd.read_csv("GitHub_All_Repos_Gender.csv")

# Only process repos below the 0.99 quantile of "Num Files" (42 for the whole dataset)
ceiling_file_num = github_repos_df["Num Files"].quantile(0.99)
print("--- The 0.99 Quantile for Number of Files in Repo is:", ceiling_file_num, "---")

user_count = len(github_repos_df)

for i, row in github_repos_df.iterrows():
    print("\n", i + 1, "/", user_count, "Github Repository")

    # "Python Files" was saved as a stringified list; parse it back
    py_file_list = literal_eval(row["Python Files"])
    num_py_files = len(py_file_list)
    print("The number of files is", num_py_files)
    repo_path = row["Repo Path"]  # expected as "owner/name" for g.get_repo

    pylint_output = []

    if num_py_files > ceiling_file_num:
        print("\tFile ceiling exceeded:", num_py_files, ".py files")
        continue

    for ix, py_file in enumerate(py_file_list):

        # Up to three attempts per file, mainly to ride out rate limits
        for _ in range(3):

            try:
                # Access python file using REST API
                py_data = g.get_repo(repo_path).get_contents(py_file)

                try:
                    # Decode and write python file locally
                    file_content = py_data.decoded_content.decode()
                    with open("python_file.py", "w") as python_file:
                        python_file.write(file_content)

                    # Run pylint over the .py
                    py_file_results = pylint_test("python_file.py")
                    py_res = {py_file: py_file_results}

                    # Add to list
                    pylint_output.append(py_res)

                    print("\t", ix + 1, "/", num_py_files, "Pylinting Complete.")

                except AssertionError:
                    print("Assert Error")

                except Exception as e:
                    print(f"Error encountered: {e}")
                    continue  # retry this file

                break  # success (or non-retryable failure): stop retrying this file

            except GithubException:
                # Most likely a rate limit; wait for the reset and retry
                github_ratelimit(g)

    # Save to DataFrame.
    github_repos_df.loc[i, "Pylint Scores"] = str(pylint_output)
    github_repos_df.to_csv("GitHub_Files_Data.csv", index=False)

print("Time Finished:", time.strftime("%l:%M%p %Z on %b %d, %Y"))
@@ -0,0 +1,53 @@
from io import StringIO
import re
import time
from datetime import datetime
from pylint.lint import Run
from pylint.reporters import text
from astroid import MANAGER

def pylint_test(testfile):
    """
    Runs the pylint assessment on a python file.
    :param testfile: .py filename as string
    :return: Dictionary of the report
    """

    # Clear astroid's cache so repeated runs don't reuse stale modules
    MANAGER.astroid_cache.clear()

    # Get Assessment of Code
    pylint_output = StringIO()
    reporter = text.TextReporter(pylint_output)
    Run([testfile, "--reports=y", "--enable=all"], reporter=reporter, do_exit=False)
    pylint_output.seek(0)
    pylint_report = pylint_output.read()

    # Regex to extract the row labels and values from the pylint report tables
    labels = [i.strip() for i in re.findall(r"(?<=\|)[a-z- ]+(?=\s+\|[0-9+])", pylint_report)]
    label_nums = [i.strip() for i in re.findall(r"(?<=\s\|)([0-9.= |]+)(?=\|\n)", pylint_report)]
    res = [char.replace(" ", "").split("|") for char in label_nums]

    # Dictionary of Vars
    report_dict = dict(zip(labels, res))
    report_dict["statements analysed"] = re.findall(r"([0-9]+)(?=\sstatement)", pylint_report)
    report_dict["pylint rating"] = re.findall(r"([0-9].+)(?=/10\s)", pylint_report)
    # Match messages for the file actually analysed, rather than a hardcoded name
    report_dict["pylint verbose"] = re.findall(rf"({re.escape(testfile)}.*)", pylint_report)

    # Output as dictionary
    return report_dict
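
# Example usage on a hypothetical local file:
#   report = pylint_test("example.py")
#   report.get("pylint rating")  # -> e.g. ['7.50']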

def github_ratelimit(g):
    """Sleep until the core rate limit resets (plus a small buffer)."""
    print("\n***** Rate Limit Reached *****")
    print(time.strftime("\t%l:%M%p %Z on %b %d, %Y"))

    core_rate_limit = g.get_rate_limit().core
    # PyGithub returns the reset time as a naive UTC datetime, so compare
    # against utcnow() and clamp at zero in case the limit has already reset
    pause = core_rate_limit.reset - datetime.utcnow()
    sleep_time = max(pause.total_seconds(), 0) + 5
    print("\tPausing for:", round(sleep_time / 60), "mins\n")

    time.sleep(sleep_time)