In [1]:
import matplotlib as plt
import pandas as pd
import numpy as np
import itertools
import requests
import bs4
import os


# folder
folder = 'datasets/'

lis_datasets = os.listdir(folder)
lis_paths = [folder + dataset for dataset in lis_datasets]
lis_paths

['datasets/committs.csv',
 'datasets/licenses.csv',
 'datasets/sample_commits.csv',
 'datasets/sample_repos_1.csv',
 'datasets/sample_repos_2.csv']

In [2]:
# Define the web-scrapping function to get the languages from a git repository
def _scrap_technologies_from_GitHub(languages: list, source):
    """Web Scrapping to Fetch Languages used in a GitHub repository and save them in the languages input list.

    Args:
        languages: A list with empty or existing Languages values.
        source: raw response content from HTTP Request.

    Returns:
        None
    """

    soup = bs4.BeautifulSoup(source.text, "html.parser")
    divs = soup.find_all("div", class_="BorderGrid-row")
    for div in divs:

        section = div.div
        if section.h2 is not None:
            if section.h2.text == "Languages":
                # print(section.h2.text)
                for span in section.div.span:

                    if type(span) is bs4.element.Tag:
                        languageAndPercent = span.get("aria-label")
                        # print(languageAndPercent.split()[0])
                        languages.append(languageAndPercent.split()[0])
                # print()
    return None

In [3]:
# Read the datasets
committs            = pd.read_csv(lis_paths[0])
licenses            = pd.read_csv(lis_paths[1])
sample_commits = pd.read_csv(lis_paths[2])
sample_repos_1 = pd.read_csv(lis_paths[3])
sample_repos_2 = pd.read_csv(lis_paths[4])

In [4]:
# Append sample_repos and delete duplicates
sample_repos = pd.concat([sample_repos_1, sample_repos_2], ignore_index=True).drop_duplicates(subset=['repo_name'])

# Split repo_name and author
sample_repos['author'] = [item[0] for item in sample_repos['repo_name'].str.split('/')]
sample_repos['repo'] = [item[1] for item in sample_repos['repo_name'].str.split('/')]
sample_repos

Unnamed: 0,repo_name,watch_count,author,repo
0,0----0/Terra-3d-Experiments,3,0----0,Terra-3d-Experiments
1,0-1-0/lightblue-0.4,6,0-1-0,lightblue-0.4
2,0-14N/NDroid,4,0-14N,NDroid
3,0-Eclipse-0/Eclipse,3,0-Eclipse-0,Eclipse
4,0-Eclipse-0/Messenger,4,0-Eclipse-0,Messenger
...,...,...,...,...
399995,openstack/vmtp,7,openstack,vmtp
399996,openstack/vitrage-specs,7,openstack,vitrage-specs
399997,openstack/vitrage-dashboard,9,openstack,vitrage-dashboard
399998,openstack/vitrage,15,openstack,vitrage


In [5]:
# Add github url column
sample_repos['url'] = 'https://github.com/' + sample_repos['repo_name']

# Add license column
sample_repos = pd.merge(sample_repos, licenses, how='inner', on='repo_name')

# Re-order
sample_repos = sample_repos[['url', 'repo', 'author', 'license', 'watch_count']]

In [6]:
# Add languages column from web_scrapping function
global_lis = []

for i, url in enumerate(sample_repos['url']):
    lis = []
    
    if i < 10:
        try:
            source = requests.get(url.strip())
            if not source.ok:
                # print(source.reason)
                # print("There could be an error in the URL\n")
                lis = [None]
            else:
                _scrap_technologies_from_GitHub(lis, source)
        except Exception as e:
            lis = [None]
    else:
        lis=[None]
    global_lis.append(lis)
    # print(global_lis)

In [7]:
# Add global_lis of languages to a new column in asmple_repos
sample_repos['languages'] = global_lis
sample_repos

Unnamed: 0,url,repo,author,license,watch_count,languages
0,https://github.com/01miru/HomeSense,HomeSense,01miru,mit,3,"[Swift, Other]"
1,https://github.com/01org/yask,yask,01org,mit,6,"[C++, Perl, Makefile, Python, Other]"
2,https://github.com/06wj/FL,FL,06wj,mit,3,"[JavaScript, Python]"
3,https://github.com/0legAdamov/AOIntroViewContr...,AOIntroViewController,0legAdamov,mit,3,[None]
4,https://github.com/0of/Promise2,Promise2,0of,mit,4,"[C++, CMake, Other]"
...,...,...,...,...,...,...
15346,https://github.com/opentok/opentok-ios-sdk-sam...,opentok-ios-sdk-samples,opentok,mit,32,[None]
15347,https://github.com/opentok/opentok-hardware-se...,opentok-hardware-setup.js,opentok,mit,3,[None]
15348,https://github.com/opentable/explicitobjectmap...,explicitobjectmap-node,opentable,mit,5,[None]
15349,https://github.com/openstreetmap/iD,iD,openstreetmap,isc,118,[None]


In [25]:
# Get rid of rows without a language
for i, item in enumerate(sample_repos['languages']):
    if None not in item:
        sample_repos.drop(i)
        print(item)
sample_repos

['Swift', 'Other']
['C++', 'Perl', 'Makefile', 'Python', 'Other']
['JavaScript', 'Python']
['C++', 'CMake', 'Other']
['Other']
['Go']
['C++']
['C++', 'Python', 'Makefile', 'Batchfile']
['C', 'Makefile']


Unnamed: 0,url,repo,author,license,watch_count,languages
0,https://github.com/01miru/HomeSense,HomeSense,01miru,mit,3,"[Swift, Other]"
1,https://github.com/01org/yask,yask,01org,mit,6,"[C++, Perl, Makefile, Python, Other]"
2,https://github.com/06wj/FL,FL,06wj,mit,3,"[JavaScript, Python]"
3,https://github.com/0legAdamov/AOIntroViewContr...,AOIntroViewController,0legAdamov,mit,3,[None]
4,https://github.com/0of/Promise2,Promise2,0of,mit,4,"[C++, CMake, Other]"
...,...,...,...,...,...,...
15346,https://github.com/opentok/opentok-ios-sdk-sam...,opentok-ios-sdk-samples,opentok,mit,32,[None]
15347,https://github.com/opentok/opentok-hardware-se...,opentok-hardware-setup.js,opentok,mit,3,[None]
15348,https://github.com/opentable/explicitobjectmap...,explicitobjectmap-node,opentable,mit,5,[None]
15349,https://github.com/openstreetmap/iD,iD,openstreetmap,isc,118,[None]


In [9]:
# a = sample_repos['languages'][3:4].str
# a == "[-]"

In [10]:
# sample_repos.to_csv('github_repos_dataset.csv')

In [11]:
# sample_repos