129 changes: 129 additions & 0 deletions github_api_getdata.py
import pandas as pd
import numpy as np
import datetime
import time
from github import Github, RateLimitExceededException
from ShareCode.pygit_func import github_ratelimit

# Original Url: https://github.com/tue-mdse/genderComputer
from genderComputer import GenderComputer


def extract_python_files(repo, contents):
    # Recursively collect repository-relative paths of .py files, skipping dunder files.
    python_list = []
    contents = list(contents)  # Work on a copy so the caller's list is not consumed.

    while contents:
        file_content = contents.pop(0)
        if file_content.path.endswith(".py") and "__" not in file_content.path:
            python_list.append(file_content.path)
        if file_content.type == "dir":
            contents.extend(repo.get_contents(file_content.path))

    return python_list
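
# Illustrative usage with hypothetical paths:
#   extract_python_files(repo, repo.get_contents(""))  ->  ["src/app.py", "utils/io.py"]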


def process_repo(repo, gc, already_inc, fem_dp, repo_sample):
    # Two calls to the API to get the number of contributors
    contrib = repo.get_contributors(anon=True)
    num_contrib = contrib.totalCount
    files = repo.get_contents("")
    num_files = len(list(files))
    owner_type = repo.owner.type
    repo_name = repo.name

    login = repo.owner.login
    name = repo.owner.name
    local = repo.owner.location

    if ((already_inc["Repo Name"] == repo_name) & (already_inc["Repo Owner Login"] == login)).any():
        print("\n\tDatapoint already exists in dataset.")
        return fem_dp

    # Single-contributor repositories owned by individual (non-organization) users only.
    if num_contrib == 1 and owner_type == "User":
        try:
            gender_name = gc.resolveGender(name, local)
            gender_login = gc.resolveGender(login, local)

            if gender_name == "female" or gender_login == "female":
                fem_dp += 1
                print("\n\t{0} Feminine Data Point Identified.".format(fem_dp))

                repo_dict = {"Repo Name": repo_name, "Repo Owner Login": login, "Repo Owner Name": name,
                             "Repo Owner ID": repo.owner.id, "Repo Owner Gender (Name)": gender_name,
                             "Repo Owner Gender (Login)": gender_login, "Repo Owner Location": local,
                             "Repo Owner Type": repo.owner.type, "Repo Owner Bio": repo.owner.bio,
                             "Repo Owner Email": repo.owner.email, "Repo Owner Collaborators": repo.owner.collaborators,
                             "Repo Owner Followers": repo.owner.followers, "Repo Owner Following": repo.owner.following,
                             "Repo Created At": repo.created_at.strftime("%Y-%m-%d"),
                             "Repo Updated At": repo.updated_at.strftime("%Y-%m-%d"),
                             "Repo Description": repo.description, "Repo Language": repo.language,
                             "Repo Is Fork": repo.fork, "Repo Forks Counts": repo.forks,
                             "Repo Organization": repo.organization, "Repo Labels": [i.name for i in repo.get_labels()],
                             "Num Contributors": num_contrib, "Contributors": contrib,
                             "Collaborators": repo.owner.collaborators, "Num Files": num_files,
                             "Num Commits": repo.get_commits().totalCount, "Contents": files,
                             "Python Files": extract_python_files(repo, files), "Pylint Scores": np.nan}

                repo_sample.append(repo_dict)
                save_repo_data(repo_sample)

        except Exception as e:
            # Skip repos that raise (missing profile fields, deleted owners, etc.).
            print("\tSkipping repo after error:", e)

    return fem_dp


def save_repo_data(repo_sample):
    temp_df = pd.DataFrame(repo_sample)
    temp_df.to_csv("GitHub_Data_{0}.csv".format(start_date.strftime("%Y-%m-%d")), index=False)
    print("\n\t Python GitHub Repos Collected:", len(repo_sample))


# Initialize GitHub API instance
user_name = ""
client_secret = ""
client_id = ""
g = Github(client_id, client_secret, per_page=100)
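# NOTE: passing the OAuth app's client id/secret positionally treats them as
# basic-auth credentials (the legacy PyGithub pattern for a higher rate limit);
# newer PyGithub releases expect an explicit auth object instead.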

# Initialize GenderComputer
gc = GenderComputer()
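# resolveGender(name, location) matches the name against country-specific name
# lists and typically returns "male", "female", "unisex", or None when no match
# is found (behaviour assumed from the genderComputer project linked above).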

# Initialize date variables
start_date = datetime.datetime.strptime("2019-01-01", "%Y-%m-%d")
days_count = (datetime.datetime.now() - start_date).days

# Load existing data
already_inc = pd.read_csv("Already_Included.csv", index_col="Unnamed: 0")
already_inc = already_inc.drop_duplicates(subset=["Repo Name", "Repo Owner Login"])

# Initialize feminine data point counter
fem_dp = 0

# Initialize repo sample
repo_sample = []

# Main loop
for dc in range(days_count):
    date_from = start_date + datetime.timedelta(days=dc)
    date_to = date_from + datetime.timedelta(days=1)

    print("\n---------------------------------------------------------\n", dc,
          "Querying Date Range:", str(date_from)[:10], "to", str(date_to)[:10])

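    # Search qualifiers (GitHub search syntax): Python repos created inside the
    # one-day window, with 10-250 forks, 10-250 stars, and 1,000-1,000,000 KB
    # on disk (the size qualifier is in kilobytes), excluding forks.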
query = f"language:python forks:10..250 size:1000..1000000 stars:10..250 fork:false created:{str(date_from)[:10]}..{str(date_to)[:10]}"

try:
repositories = g.search_repositories(query=query)

for repo in repositories:
fem_dp = process_repo(repo, gc, already_inc, fem_dp, repo_sample)

except RateLimitExceededException:
github_ratelimit(g)

print("Time Finished:", time.strftime("%l:%M%p %Z on %b %d, %Y"))
87 changes: 87 additions & 0 deletions github_gender_id.py
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import pandas as pd
import os
import glob

# Original Url: https://github.com/tue-mdse/genderComputer
from genderComputer import GenderComputer

# Set the working directory
os.chdir("/Users/BrookeSJ/PycharmProjects/Leverhulme/Final Data/")

# Load all CSV files into a single DataFrame
all_data_files = glob.glob("*.csv")
github_repos_df = pd.concat([pd.read_csv(file, index_col=None, header=0, engine="python") for file in all_data_files])

# Drop leftover index columns (if present) and duplicates
github_repos_df = github_repos_df.drop(["Unnamed: 0", "Unnamed: 0.1", "level_0"], axis=1, errors="ignore").drop_duplicates().reset_index(drop=True)

# Display user count
count_users = len(github_repos_df)
print("User Count:", count_users)

# Clean the email address prefix for a third name source. This must happen
# before the leetspeak normalisation below, which strips the "@" separator.
github_repos_df["Repo Owner Email Prefix"] = github_repos_df["Repo Owner Email"].str.replace(r"(\d+)(?=@)", "", regex=True)
github_repos_df["Repo Owner Email Prefix"] = github_repos_df["Repo Owner Email Prefix"].str.extract(r"(\S*)(?=@)", expand=False)

# Define a dictionary for leetspeak and punctuation replacement
text_leet = {"8": "B", "5": "S", "0": "O", "|": "l", "1": "l", "7": "T",
             "4": "A", "£": "E", "$": "S", "€": "E", "¥": "Y", "3": "E",
             ".": " ", ",": " ", "@": " ", "-": " ", "_": ""}

# Apply the replacements literally (regex=False: ".", "|" and "$" are regex metacharacters)
for col in ["Repo Owner Email Prefix", "Repo Owner Login"]:
    for k, v in text_leet.items():
        github_repos_df[col] = github_repos_df[col].str.replace(k, v, regex=False)

# Initialize GenderComputer
gc = GenderComputer()

# Clean data from reading in
github_repos_df = github_repos_df[github_repos_df["Repo Name"] != "/appengine/db.py""]
github_repos_df = github_repos_df.loc[:, ~github_repos_df.columns.str.contains("^Unnamed")]

# Initialize geopy (one shared RateLimiter so Nominatim is queried at most once per second)
geopy = Nominatim(user_agent="http")
geocode = RateLimiter(geopy.geocode, min_delay_seconds=1)

# Iterate over rows and process gender information
for i, row in github_repos_df.iterrows():
    print("[Gender] Processing User:", i + 1, "/", count_users)

    # Clean Location of Users
    if pd.notna(row["Repo Owner Location"]):
        try:
            github_repos_df.at[i, "Repo Owner Location"] = str(geocode(row["Repo Owner Location"])[0])
        except Exception:
            # Unresolvable locations keep their raw value.
            pass

    # Get Gender from Name, if possible.
    for col in ["Repo Owner Name", "Repo Owner Login", "Repo Owner Email Prefix"]:
        # Label the output column "(Name)", "(Login)" or "(Email Prefix)".
        out_col = "Repo Owner Gender ({0})".format(col.replace("Repo Owner ", ""))
        try:
            github_repos_df.at[i, out_col] = gc.resolveGender(row[col], row["Repo Owner Location"])
        except Exception:
            pass

    # Save as we go
    github_repos_df.to_csv("GitHub_All_Repos_Gender.csv", index=False)

# Read the saved CSV
github_repos_df = pd.read_csv("GitHub_All_Repos_Gender.csv")

# Display gender sample: frequency of each gender label per name source
gender_columns = ["Repo Owner Gender (Name)", "Repo Owner Gender (Login)", "Repo Owner Gender (Email Prefix)"]
gender_sample = github_repos_df[gender_columns].apply(lambda x: x.value_counts(dropna=False)).rename_axis("Gender").reset_index()
gender_sample.to_csv("GitHub_Repos_Gender_Sample.csv", index=False)

# Create a "Gender ID" column
github_repos_df["Gender ID"] = github_repos_df["Repo Owner Gender (Name)"].combine_first(github_repos_df["Repo Owner Gender (Login)"])

# Iterate over rows and fill in gender from the email prefix where available
for i, row in github_repos_df.iterrows():
    if pd.notna(row["Repo Owner Email Prefix"]):
        try:
            print(row["Repo Owner Email Prefix"])
            github_repos_df.at[i, "Repo Owner Gender (Email Prefix)"] = gc.resolveGender(row["Repo Owner Email Prefix"], row["Repo Owner Location"])
        except Exception:
            pass

# Persist the email-prefix genders (otherwise this loop's results are lost)
github_repos_df.to_csv("GitHub_All_Repos_Gender.csv", index=False)
76 changes: 76 additions & 0 deletions github_pylint.py
# Import libraries
import warnings
from github import Github, GithubException
import pandas as pd
from ast import literal_eval
import time
from pygit_func_stats import pylint_test, github_ratelimit


# GitHub Credentials
user_name = ""
client_secret = ""
client_id = ""

g = Github(client_id, client_secret, per_page=100)

github_repos_df = pd.read_csv("GitHub_All_Repos_Gender.csv")

# Keep only repos below the 0.99 quantile of the number of files (42 for the full dataset)
ceiling_file_num = github_repos_df["Num Files"].quantile(0.99)
print("--- The 0.99 Quantile for Number of Files in Repo is:", ceiling_file_num, "---")

user_count = len(github_repos_df)

for i, row in github_repos_df.iterrows():
    print("\n", i + 1, "/", user_count, "GitHub Repository")

    py_file_list = literal_eval(row["Python Files"])
    num_py_files = len(py_file_list)
    print("The number of files is", num_py_files)
    # Build the "owner/repo" path PyGithub expects from the stored columns.
    repo_path = "{0}/{1}".format(row["Repo Owner Login"], row["Repo Name"])

    pylint_output = []

    if num_py_files > ceiling_file_num:
        print("\t File ceiling exceeded:", num_py_files, ".py files")
        continue

    for ix, py_file in enumerate(py_file_list):

        # Up to three attempts per file, pausing whenever the rate limit is hit.
        for _ in range(3):

            try:
                # Access python file using REST API
                py_data = g.get_repo(repo_path).get_contents(py_file)

                try:
                    # Decode and write python file
                    file_content = py_data.decoded_content.decode()
                    with open("python_file.py", "w") as python_file:
                        python_file.write(file_content)

                    # Run pylint over .py
                    py_file_results = pylint_test("python_file.py")
                    py_res = {py_file: py_file_results}

                    # Add to list
                    pylint_output.append(py_res)

                    print("\t", ix + 1, "/", num_py_files, "Pylinting Complete.")

                except AssertionError:
                    print("Assert Error")

                except Exception as e:
                    print(f"Error encountered: {e}")
                    continue

                # Success (or a non-retryable error): stop retrying this file.
                break

            except GithubException:
                github_ratelimit(g)

    # Save to DataFrame.
    github_repos_df.loc[i, "Pylint Scores"] = str(pylint_output)
    github_repos_df.to_csv("GitHub_Files_Data.csv", index=False)

print("Time Finished:", time.strftime("%l:%M%p %Z on %b %d, %Y"))
53 changes: 53 additions & 0 deletions pygit_func.py
from io import StringIO
import re
import time
from datetime import datetime
from pylint.lint import Run
from pylint.reporters import text
from astroid import MANAGER


def pylint_test(testfile):
    """
    Runs the pylint assessment on a python file.
    :param testfile: .py filename as string
    :return: Dictionary of the report
    """

    # Clear cache
    MANAGER.astroid_cache.clear()

    # Get Assessment of Code
    pylint_output = StringIO()
    reporter = text.TextReporter(pylint_output)
    Run([testfile, "--reports=y", "--enable=all"], reporter=reporter, do_exit=False)
    pylint_output.seek(0)
    pylint_report = pylint_output.read()
    # Regex to extract data from the pylint report tables
    labels = [i.strip() for i in re.findall(r"(?<=\|)[a-z- ]+(?=\s+\|[0-9+])", pylint_report)]
    label_nums = [i.strip() for i in re.findall(r"(?<=\s\|)([0-9.= |]+)(?=\|\n)", pylint_report)]
    res = [char.replace(" ", "").split("|") for char in label_nums]

    # Dictionary of Vars
    report_dict = dict(zip(labels, res))
    report_dict["statements analysed"] = re.findall(r"([0-9]+)(?=\sstatements)", pylint_report)
    report_dict["pylint rating"] = re.findall(r"rated at ([0-9.]+)/10", pylint_report)
    report_dict["pylint verbose"] = re.findall(r"({0}.*)".format(re.escape(testfile)), pylint_report)

    # Output as dictionary
    return report_dict
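

# Illustrative usage (hypothetical file and score):
#   report = pylint_test("python_file.py")
#   report.get("pylint rating")  # -> ["8.75"]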


def github_ratelimit(g):
    print("\n***** Rate Limit Reached *****")
    print(time.strftime("\t%l:%M%p %Z on %b %d, %Y"))

    core_rate_limit = g.get_rate_limit().core
    # PyGithub reports the reset time as a naive UTC datetime, so compare
    # against utcnow() rather than local time.
    now = datetime.utcnow()
    pause = core_rate_limit.reset - now
    sleep_time = max(pause.total_seconds(), 0) + 5
    print("\tPausing for:", round(sleep_time / 60), "mins\n")

    time.sleep(sleep_time)
time.sleep(sleep_time)