## 1. To get the names of all the packages present in the NPM Registry using API

In [3]:
import requests

def get_npm_packages():
    # Define the URL and parameters for the API request
    url = 'https://skimdb.npmjs.com/registry/_all_docs'
    params = {'include_docs': 'false'}
    # params = {'include_docs': 'false', 'limit': limit}
    
    # Send a GET request to the NPM API
    response = requests.get(url, params=params)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Extract the JSON data from the response
        data = response.json()
        # Return the list of packages
        return data['rows']
    else:
        # Print an error message if the request failed
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return []

# Initialize a list to store package names
package_names = []

# Get the list of all packages from the NPM repository
all_packages = get_npm_packages()

# Iterate over each package and extract its name
for pkg in all_packages:
    package_names.append(pkg['id'])

print("All the package_names are fetched successfully!")

All the package_names are fetched successfully!


### 1.1 Storing all the fetched package_names in a CSV File

In [25]:
#####

import pandas as pd

# Create a DataFrame with package names
df = pd.DataFrame(package_names, columns=['package_name'])

# Write DataFrame to CSV
df.to_csv('All_NPM_Package_names.csv', index=False)

In [28]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('All_NPM_Package_names.csv')

# Print the number of package names
print("Number of package names found in the NPM ecosystem:", len(df))


Number of package names found in the NPM ecosystem: 2606001


In [19]:
# Print the first few rows of the "package_name" column
print("First few package names:")
df.head(20)

First few package names:


Unnamed: 0,package_name
0,-
1,-
2,-
3,-
4,-
5,-
6,-
7,-
8,--hoodmane-test-pyodide
9,--ignore-scripts


In [29]:
# Printing all the package_names fetched
print("All the package names fetched:")
df

All the package names fetched:


Unnamed: 0,package_name
0,-
1,-
2,-
3,-
4,-
...,...
2605996,zzzzzzz
2605997,zzzzzzzz-pppppp
2605998,zzzzzzzzzz
2605999,zzzzzzzzzzzzzzzzz


### 1.2 Filtering out the subpackages to get the primary package names

In [23]:
import csv

# Open the All_NPM_Package_names CSV file
with open('All_NPM_Package_names.csv', 'r', newline='') as infile:
    reader = csv.reader(infile)
    # Count total number of total packages in the CSV
    input_row_count = sum(1 for row in reader)

# Open the All_NPM_Package_names CSV file again to filter rows
with open('All_NPM_Package_names.csv', 'r', newline='') as infile:
    reader = csv.reader(infile)
    # Filter the subpackages (containes '/', Example: '@webjunto/voyage-svg' is a subpackage of 'webjunto')
    filtered_rows = [row[0] for row in reader if '/' not in row[0]] 

# Write filtered rows to a new All_NPM_Primary_Package_names CSV file
with open('All_NPM_Primary_Package_names.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for value in filtered_rows:
        writer.writerow([value])

# Count total number of primary packages
output_row_count = len(filtered_rows)

print("Total number of NPM Packages fetched:", input_row_count)
print("Total number of Primary NPM Packages found (After filtering out the subpackages):", output_row_count)


Total number of NPM Packages fetched: 2606002
Total number of Primary NPM Packages found (After filtering out the subpackages): 1746722


In [30]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('All_NPM_Primary_Package_names.csv')

# Print the first few rows of the "package_name" column from the 'Primary' Packages
print("First few primary package names:")
df.head(20)

First few primary package names:


Unnamed: 0,package_name
0,-
1,-
2,-
3,-
4,-
5,-
6,-
7,-
8,--hoodmane-test-pyodide
9,--ignore-scripts


In [31]:
# Printing all the primary package_names fetched
print("All the primary package names fetched:")
df

All the primary package names fetched:


Unnamed: 0,package_name
0,-
1,-
2,-
3,-
4,-
...,...
1746716,zzzzzzz
1746717,zzzzzzzz-pppppp
1746718,zzzzzzzzzz
1746719,zzzzzzzzzzzzzzzzz


### 1.3 Randomly picking 20k package names from the primary packages list for the furthur analysis

In [33]:
import pandas as pd
import random

#read all the primary package names from the csv
df = pd.read_csv('All_NPM_Primary_Package_names.csv')
primary_packages = df.iloc[:, 0].tolist()

# Shuffle the primary_packages
random.shuffle(primary_packages)

#Select first 20000 primary_packages from the shuffled primary_packages 
selected_primary_packages = primary_packages[:20000]

# Create a DataFrame with list2 as a column
df_output = pd.DataFrame({'Package Name': selected_primary_packages})

# Write the DataFrame to a CSV file
df_output.to_csv('random_packages_20k.csv', index=False)

print("20k random primary package names are picked and stored in a CSV successfully!")

20k random primary package names are picked and stored in a CSV successfully!


In [35]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('random_packages_20k.csv')

# Print the first few rows of the "package_name" column from the 'Primary' Packages
print("First few randomy picked primary package names:")
df.head(20)

First few randomy picked primary package names:


Unnamed: 0,Package Name
0,tencentcloud-sdk-nodejs-bm
1,is-equal-shallow
2,rexponse
3,eslint-plugin-no-extension-in-require
4,fluids
5,electron-typescript-react-tailwind-redux
6,spotifyweb-api
7,sanity-plugin-tasks
8,xybot-utils
9,aurelia-bcf


In [36]:
# Printing all the 20k randomly picked primary package_names fetched
print("All the 20k randomly picked primary package_names:")
df

All the 20k randomly picked primary package_names:


Unnamed: 0,Package Name
0,tencentcloud-sdk-nodejs-bm
1,is-equal-shallow
2,rexponse
3,eslint-plugin-no-extension-in-require
4,fluids
...,...
19995,rcetoggle
19996,nitro-sg
19997,ecmatags
19998,react-light-pagination


## 2. Data Fetching & Cleaning of all the 20k packages

### 2.1 Fetching & Cleaning Data about the 20k packages from the NPM Registry using API

In [52]:
import pandas as pd
import requests
import csv
from urllib.parse import urlparse
import random
from datetime import datetime, timedelta

# Open the random_packages_20k CSV file in read mode
with open('random_packages_20k.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    
    # Handle potential header row
    is_header = True
    package_names = []
    for row in reader:
        if is_header:
            is_header = False
            continue
        package_names.append(row[0])

package_names = package_names

data_rows = []

for package_name in package_names[0:10]:
    try:
        url = f'https://registry.npmjs.org/{package_name}'
        headers = {'Accept': 'application/vnd.npm.install-v1+json'}

        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for non-200 status codes

        package_data = response.json()

        # Extracting required fields from metadata
        keywords = package_data.get('keywords', [])
        keywords = ", ".join(keywords)
        last_version_data = list(package_data.get('versions', {}).values())[-1] if package_data.get('versions') else None
        
        # Extracting git URL from last version data if available
        repository_url = None
        cleaned_repository_url = None
        if isinstance(last_version_data, dict):
            repository_data = last_version_data.get('repository')
            if isinstance(repository_data, dict):
                repository_url = repository_data.get('url', None)
                # Convert URLs to HTTPS for GitHub repositories
                if repository_url and 'github.com' in repository_url:
                    if repository_url.startswith(('git+', 'ssh://git@github.com', 'https://github.com', 'http://github.com', 'git://github.com', 'git@personal.github.com', 'git@github.com', 'github.com', '@personal.github.com')):
                        if 'git+ssh://git@github.com' in repository_url:
                            repository_url = repository_url.replace('git+ssh://git@github.com', 'https://github.com')
                        elif 'ssh://git@github.com' in repository_url:
                            repository_url = repository_url.replace('ssh://git@github.com', 'https://github.com')
                        elif 'git://github.com' in repository_url:
                            repository_url = repository_url.replace('git://github.com', 'https://github.com')
                        elif 'git@personal.github.com:' in repository_url:
                            repository_url = repository_url.replace('git@personal.github.com:', 'https://github.com/')
                        elif 'git@github.com:' in repository_url:   
                            repository_url = repository_url.replace('git@github.com:', 'https://github.com/')
                        elif '@personal.github.com:' in repository_url:
                            repository_url = repository_url.replace('@personal.github.com:', 'https://github.com/')

                        repository_url = repository_url[:-4] if repository_url.endswith('.git') else repository_url
                        
                    # Clean the git URL
                    if repository_url:
                        parsed_url = urlparse(repository_url)
                        if parsed_url.scheme == 'git':
                            cleaned_repository_url = parsed_url.netloc + parsed_url.path
                        else:
                            cleaned_repository_url = repository_url.lstrip('git+')

        # Fetching the abbreviated response
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        abbreviated_package_data = response.json()

        # Extracting dependencies
        last_version_data = list(abbreviated_package_data.get('versions', {}).values())[-1] if abbreviated_package_data.get('versions') else None
        dependencies = last_version_data.get('dependencies', {}) if last_version_data else {}
        dependencies_name = list(dependencies.keys())
        
       
        # Fetching download stats
        today = datetime.utcnow().date()
        three_months_ago = today - timedelta(days=91)
        yesterday = today - timedelta(days=1)
        date_range = f"{three_months_ago.strftime('%Y-%m-%d')}:{yesterday.strftime('%Y-%m-%d')}"
        download_stats_last_3_month = requests.get(f'https://api.npmjs.org/downloads/range/{date_range}/{package_name}').json()
        last_3_month_downloads = sum(day['downloads'] for day in download_stats_last_3_month['downloads'])

        if cleaned_repository_url:
            data_row = {
                'package_name': package_name,
                'git_repository': cleaned_repository_url,
                'dependencies_name': dependencies_name,
                'keywords': keywords, 
                'last_3_month_downloads':last_3_month_downloads
            }
            data_rows.append(data_row)

    except Exception as e:
        print(f"Error fetching data for {package_name}: {e}")

# Create a DataFrame from the list of rows
data = pd.DataFrame(data_rows)

# Save the DataFrame to a new CSV file
data.to_csv('npmjs_data_20k.csv', index=False, escapechar='\\')

print("npmjs data for 20k packages is retrived and stored successfully!")

  today = datetime.utcnow().date()


npmjs data for 20k packages is retrived and stored successfully!


In [54]:
print("npmjs data fecthed for first few rows:")
data.head()

npmjs data fecthed for first few rows:


Unnamed: 0,package_name,git_repository,dependencies_name,keywords,last_3_month_downloads
0,tencentcloud-sdk-nodejs-bm,https://github.com/tencentcloud/tencentcloud-s...,[tencentcloud-sdk-nodejs-common],"tencentcloudapi, tencentcloud, qcloud, sdk, js...",11656
1,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,[is-primitive],"compare, comparison, equal, equals, is, is-equ...",22711666
2,rexponse,https://github.com/elchinzadeh/rexponse,[],"express, response, rest",11
3,eslint-plugin-no-extension-in-require,https://github.com/pdubroy/eslint-plugin-no-ex...,[],"eslint, plugin, node, require, path, extension",16302
4,fluids,https://github.com/alloc/fluids,[],"reactive, reactivity, glue, observable, watch",333188


### 2.2 Fetching & Cleaning Data about the 20k packages from their respective Github repo using webscrapping

In [71]:
import csv
from collections import defaultdict
import lxml.html as lx
import pandas as pd
from urllib.parse import urlparse
from git import Repo
import json
import requests

def git_request_stars(git_repo_url):
    result=requests.get(git_repo_url)
    html = lx.fromstring(result.text)
    try:
        star_element = html.xpath('//*[@id="repo-stars-counter-star"]')[0]
    
        try:
            #converting "332" to 332 stars
            stars = int(star_element.text)
        except:
            #Converting '3.3k' to 3300 
            if(star_element.text and star_element.text[-1] == 'k'): 
                stars = int(float(star_element.text[:-1])*1000)
            else:
                stars = star_element.text
        finally:
            return stars
    except Exception as e:
        return None          


def fetch_stars(git_repos, output_csv):
    data = []
  
    for git_repo in git_repos:
        # Example:- git_repo: ('is-equal-shallow', 'https://github.com/jonschlinkert/is-equal-shallow')
        package_name = git_repo[0]
        repo_url = git_repo[1]
        if repo_url: #if stars data is able to be fetched
            result = git_request_stars(repo_url)
            if result is not None:
                data.append({
                    "package_name": package_name,
                    "git_repository": repo_url,
                    "stars": result
                })

    with open(output_csv, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["package_name", "git_repository", "stars"])
        writer.writeheader()
        writer.writerows(data)

input_file = 'npmjs_data_20k.csv'
with open(input_file, 'r',encoding='utf-8', errors='ignore') as file:
    reader = csv.DictReader(file)
    # Handle potential header row
    is_header = True
    git_repos = []
    for row in reader:
        if is_header:
            is_header = False
            continue
        git_repos.append((row['package_name'],row['git_repository'])) 

        
fetch_stars(git_repos, 'github_data_20k.csv')

print("github data for 20k packages is retrived and stored successfully!")

github data for 20k packages is retrived and stored successfully!


In [74]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv("github_data_20k.csv")

df

Unnamed: 0,package_name,git_repository,stars
0,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,29
1,rexponse,https://github.com/elchinzadeh/rexponse,2
2,eslint-plugin-no-extension-in-require,https://github.com/pdubroy/eslint-plugin-no-ex...,2
3,fluids,https://github.com/alloc/fluids,6
4,spotifyweb-api,https://github.com/gabrieldarezzo/js-tdd-course,1
...,...,...,...
8075,insomnia-plugin-customtimestamp,https://github.com/Gabb1995/insomnia-plugin-cu...,30
8076,project-showcaser,https://github.com/Oichacha/projects-showcaser,0
8077,pkginfo-json5,http://github.com/mathieumg/pkginfo-json5,0
8078,nitro-sg,https://github.com/powerhome/nitro-storybook,0


#### Out of 20k packages, github stars data has been only fetched for only "8080" packages. For all the remaining 11920 packages, stars data could't be fetched due to various reasons like:
##### (1) Improper repo url
##### (2) Broken repo url
##### (3) 404 error for few repos
##### (4) Few packages have repo at other places and not github (our study is only confined to github)
##### So, we continue all the future study with only these "8080" packages data

### 2.3 Merging npmjs_data_20k & github_data_20k datasets

In [76]:
import pandas as pd

# Load the CSV files into DataFrames
npmjs_df = pd.read_csv("npmjs_data_20k.csv")
github_df = pd.read_csv("github_data_20k.csv")

# Merge the two DataFrames based on the 'package_name' column
merged_df = pd.merge(npmjs_df, github_df[['package_name', 'stars']], on='package_name', how='left')

# Save the merged DataFrame to a new CSV file
merged_df.to_csv("npmjs_github_data_8k.csv", index=False)


In [80]:
# Load the CSV file into a DataFrame
df = pd.read_csv("npmjs_github_data_8k.csv")

df

Unnamed: 0,package_name,git_repository,dependencies_name,keywords,last_3_month_downloads,stars
0,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,['is-primitive'],"compare, comparison, equal, equals, is, is-equ...",22674275.0,29
1,rexponse,https://github.com/elchinzadeh/rexponse,[],"express, response, rest",14.0,2
2,eslint-plugin-no-extension-in-require,https://github.com/pdubroy/eslint-plugin-no-ex...,[],"eslint, plugin, node, require, path, extension",18644.0,2
3,fluids,https://github.com/alloc/fluids,[],"reactive, reactivity, glue, observable, watch",324158.0,6
4,spotifyweb-api,https://github.com/gabrieldarezzo/js-tdd-course,[],"js, tdd, library",8.0,1
...,...,...,...,...,...,...
8075,insomnia-plugin-customtimestamp,https://github.com/Gabb1995/insomnia-plugin-cu...,['moment'],"insomnia, plugin, custom, timestamp, date, tim...",288.0,30
8076,project-showcaser,https://github.com/Oichacha/projects-showcaser,"['@testing-library/jest-dom', '@testing-librar...",,7.0,0
8077,pkginfo-json5,http://github.com/mathieumg/pkginfo-json5,['json5'],"info, tools, package.json",22.0,0
8078,nitro-sg,https://github.com/powerhome/nitro-storybook,"['classnames', 'es5-shim', 'flow-runtime', 'lo...",,19.0,0


### 2.4 Fetching & Cleaning Data about the 8k packages from their Synk.io vulerability page using webscrapping (BeautifulSoup & Selenium) 

In [87]:
import csv
import urllib
import time
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
import random
import re

# Open the CSV file in read mode
with open('npmjs_github_data_8k.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    # Handle potential header row
    is_header = True
    package_names = []
    for row in reader:
        if is_header:
            is_header = False  # Skip the header row if present
            continue
        package_names.append(row[0]) 

packages = package_names

def get_package_data(package):
    retry_attempts = 3  # Number of retry attempts
    for _ in range(retry_attempts):
        try:
            url = 'https://snyk.io/advisor/npm-package/{}'.format(urllib.parse.quote_plus(package))

            options = webdriver.ChromeOptions()
            options.add_argument('--headless')
            driver = webdriver.Chrome(options=options)
            driver.get(url)

            # Wait for the page to fully load
            time.sleep(0.38)

            # Extract HTML content after waiting
            html_content = driver.page_source
            soup = BeautifulSoup(html_content, "html.parser")

            health_score = popularity = None

            #getting heath score
            number_div = soup.find('div', class_='number')
            if number_div:
                health_score = number_div.find('span').text.strip()[:3].rstrip()

            #getting popularity
            ul_element = soup.find('ul', class_='scores')
            if ul_element:
                spans = ul_element.find_all('span', class_='vue--pill__body')
                popularity = spans[1].text.strip()

            driver.quit()

            if any(value == 'pending...' or value.strip() == '?' for value in [health_score, popularity]):
                continue  # Retry for this package
            else:
                return [package, popularity, health_score]

        except Exception as e:
            return None

    return None

output_filename = "synk_data_8k.csv"

with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['package_name', 'Popularity', 'Health Score']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for package in packages:
        package_data = get_package_data(package)
        if package_data:
            writer.writerow(dict(zip(fieldnames, package_data)))

print("Synk.io vulnerabilty data for 8k packages is retrived and stored successfully!")

Synk.io vulnerabilty data for 8k packages is retrived and stored successfully!


In [89]:
# Load the CSV file into a DataFrame
df = pd.read_csv("synk_data_8k.csv")

df

Unnamed: 0,package_name,Popularity,Health Score
0,is-equal-shallow,Influential project,68
1,rexponse,Limited,42
2,eslint-plugin-no-extension-in-require,Small,45
3,fluids,Recognized,47
4,spotifyweb-api,Limited,42
...,...,...,...
8075,insomnia-plugin-customtimestamp,Limited,42
8076,project-showcaser,Limited,10
8077,pkginfo-json5,Limited,36
8078,nitro-sg,Limited,36


### 2.5 Merging npmjs_github_data_8k.csv & synk_data_8k datasets

In [92]:
import pandas as pd

# Load the CSV files into DataFrames
npmjs_github_df = pd.read_csv("npmjs_github_data_8k.csv")
synk_df = pd.read_csv("synk_data_8k.csv")

# Merge the two DataFrames based on the 'package_name' column
merged_df = pd.merge(npmjs_github_df, synk_df, on='package_name', how='left')

# Save the merged DataFrame to a new CSV file
merged_df.to_csv("package_details.csv", index=False)


In [93]:
# Load the CSV file into a DataFrame
df = pd.read_csv("package_details.csv")

df

Unnamed: 0,package_name,git_repository,dependencies_name,keywords,last_3_month_downloads,stars,Popularity,Health Score
0,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,['is-primitive'],"compare, comparison, equal, equals, is, is-equ...",22674275.0,29,Influential project,68
1,rexponse,https://github.com/elchinzadeh/rexponse,[],"express, response, rest",14.0,2,Limited,42
2,eslint-plugin-no-extension-in-require,https://github.com/pdubroy/eslint-plugin-no-ex...,[],"eslint, plugin, node, require, path, extension",18644.0,2,Small,45
3,fluids,https://github.com/alloc/fluids,[],"reactive, reactivity, glue, observable, watch",324158.0,6,Recognized,47
4,spotifyweb-api,https://github.com/gabrieldarezzo/js-tdd-course,[],"js, tdd, library",8.0,1,Limited,42
...,...,...,...,...,...,...,...,...
8075,insomnia-plugin-customtimestamp,https://github.com/Gabb1995/insomnia-plugin-cu...,['moment'],"insomnia, plugin, custom, timestamp, date, tim...",288.0,30,Limited,42
8076,project-showcaser,https://github.com/Oichacha/projects-showcaser,"['@testing-library/jest-dom', '@testing-librar...",,7.0,0,Limited,10
8077,pkginfo-json5,http://github.com/mathieumg/pkginfo-json5,['json5'],"info, tools, package.json",22.0,0,Limited,36
8078,nitro-sg,https://github.com/powerhome/nitro-storybook,"['classnames', 'es5-shim', 'flow-runtime', 'lo...",,19.0,0,Limited,36


## 3. Labelling every package into various vulnerability levels & categorising every package as popular/non-popular

### 3.1 Categorising every package as popular/non-pouplar

In [95]:
import pandas as pd

# Read the CSV file and create a DataFrame
df = pd.read_csv("package_details.csv")

# function to determine the 'Popular' status
def is_popular(row):
    if (row['Popularity'] != 'Limited' and row['Popularity'] != 'Small') or (row['stars'] >= 100) or (row['last_3_month_downloads'] >= 1000):
        return 'Yes'
    else:
        return 'No'

# Apply the function to create the 'Popular' column
df['Popular'] = df.apply(is_popular, axis=1)

# Save the modified DataFrame to a new CSV file
df.to_csv("package_details_with_popular.csv", index=False)

print("Every package is categorised as popular/non-pouplar, and the respective data is stored in the CSV file successfully")


Every package is categorised as popular/non-pouplar, and the respective data is stored in the CSV file successfully


In [96]:
# Load the CSV file into a DataFrame
df = pd.read_csv("package_details_with_popular.csv")

df

Unnamed: 0,package_name,git_repository,dependencies_name,keywords,last_3_month_downloads,stars,Popularity,Health Score,Popular
0,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,['is-primitive'],"compare, comparison, equal, equals, is, is-equ...",22674275.0,29,Influential project,68,Yes
1,rexponse,https://github.com/elchinzadeh/rexponse,[],"express, response, rest",14.0,2,Limited,42,No
2,eslint-plugin-no-extension-in-require,https://github.com/pdubroy/eslint-plugin-no-ex...,[],"eslint, plugin, node, require, path, extension",18644.0,2,Small,45,Yes
3,fluids,https://github.com/alloc/fluids,[],"reactive, reactivity, glue, observable, watch",324158.0,6,Recognized,47,Yes
4,spotifyweb-api,https://github.com/gabrieldarezzo/js-tdd-course,[],"js, tdd, library",8.0,1,Limited,42,No
...,...,...,...,...,...,...,...,...,...
8075,insomnia-plugin-customtimestamp,https://github.com/Gabb1995/insomnia-plugin-cu...,['moment'],"insomnia, plugin, custom, timestamp, date, tim...",288.0,30,Limited,42,No
8076,project-showcaser,https://github.com/Oichacha/projects-showcaser,"['@testing-library/jest-dom', '@testing-librar...",,7.0,0,Limited,10,No
8077,pkginfo-json5,http://github.com/mathieumg/pkginfo-json5,['json5'],"info, tools, package.json",22.0,0,Limited,36,No
8078,nitro-sg,https://github.com/powerhome/nitro-storybook,"['classnames', 'es5-shim', 'flow-runtime', 'lo...",,19.0,0,Limited,36,No


### 3.2 Labelling every package into various vulnerability levels 

In [97]:
import pandas as pd

# Read the CSV file
df = pd.read_csv("package_details_with_popular.csv")

# function to map Health_Score values to vulnerability levels
def get_vulnerability_level(score):
    if score < 40:
        return "Very High"
    elif 40 <= score < 60:
        return "High"
    elif 60 <= score < 75:
        return "Medium"
    else:
        return "Low"

# Apply the custom function to create the new column
df['vulnerability_level'] = df['Health Score'].apply(get_vulnerability_level)

# Save the modified DataFrame to a new CSV file
df.to_csv("package_details_with_popular_and_vulnerability.csv", index=False)

print("Every package is labelled into various vulnerability levels, and the respective data is stored in the CSV file successfully")


Every package is labelled into various vulnerability levels, and the respective data is stored in the CSV file successfully


In [98]:
# Load the CSV file into a DataFrame
df = pd.read_csv("package_details_with_popular_and_vulnerability.csv")

df

Unnamed: 0,package_name,git_repository,dependencies_name,keywords,last_3_month_downloads,stars,Popularity,Health Score,Popular,vulnerability_level
0,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,['is-primitive'],"compare, comparison, equal, equals, is, is-equ...",22674275.0,29,Influential project,68,Yes,Medium
1,rexponse,https://github.com/elchinzadeh/rexponse,[],"express, response, rest",14.0,2,Limited,42,No,High
2,eslint-plugin-no-extension-in-require,https://github.com/pdubroy/eslint-plugin-no-ex...,[],"eslint, plugin, node, require, path, extension",18644.0,2,Small,45,Yes,High
3,fluids,https://github.com/alloc/fluids,[],"reactive, reactivity, glue, observable, watch",324158.0,6,Recognized,47,Yes,High
4,spotifyweb-api,https://github.com/gabrieldarezzo/js-tdd-course,[],"js, tdd, library",8.0,1,Limited,42,No,High
...,...,...,...,...,...,...,...,...,...,...
8075,insomnia-plugin-customtimestamp,https://github.com/Gabb1995/insomnia-plugin-cu...,['moment'],"insomnia, plugin, custom, timestamp, date, tim...",288.0,30,Limited,42,No,High
8076,project-showcaser,https://github.com/Oichacha/projects-showcaser,"['@testing-library/jest-dom', '@testing-librar...",,7.0,0,Limited,10,No,Very High
8077,pkginfo-json5,http://github.com/mathieumg/pkginfo-json5,['json5'],"info, tools, package.json",22.0,0,Limited,36,No,Very High
8078,nitro-sg,https://github.com/powerhome/nitro-storybook,"['classnames', 'es5-shim', 'flow-runtime', 'lo...",,19.0,0,Limited,36,No,Very High


#### All the required data for analysis for 8080 packages is stored in a dataset and is now ready!
#### Now, we need to fetch the data of contributors of the github repos' of these 8080 packages...

## 4. Fetching Data of all the Package contibutors (git repos contributors) from Github

In [132]:
import requests_cache
requests_cache.install_cache("STA220Project")

### 4.1 Fetching Contributors usernames from a git repo using Webscrapping (Selenium) 

#### Attempt-1: Data of contributors is being fetched before the page loads itself (Fail)

In [104]:
#function that takes a github_repo_url and returns the list of contributors of it

from pydriller import Repository
import requests
import lxml.html as lx
import time
from tqdm import tqdm

def contributors_old(github_repo_url):
    github_repo_contributors_page = f"{github_repo_url}/graphs/contributors"
    result = requests.get(github_repo_contributors_page)     
    # Wait for 20 seconds to allow the page to load completely (Method Didn't work!!!)
    time.sleep(20)
    html = lx.fromstring(result.text)
    contributors_profiles = html.xpath('//a[contains(@class, "d-inline-block")]/@href')
    
    return list(set(contributors_profiles))
     

#### Attempt-2: Selenium (Too slow)

In [105]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import lxml.html as lx
import time
from tqdm import tqdm

#function that takes a github_repo_url and returns the list of contributors of it
def contributors(github_repo_url):
    github_repo_contributors_page = f"{github_repo_url}/graphs/contributors"
    
    # Create a WebDriver instance
    driver = webdriver.Chrome()
    
    try:
        # Open the contributors page
        driver.get(github_repo_contributors_page)
        time.sleep(15)  # Wait for 15 seconds to allow the page to load completely 
        html = lx.fromstring(driver.page_source)
        contributors_profiles = html.xpath('//a[contains(@class, "d-inline-block")]/@href')
    finally:
        # Close the WebDriver instance
        driver.quit()
    if(len(contributors_profiles) > 2):    
        contributors_profiles = contributors_profiles[2:] #removing first 2 data points as they are noisy
    else: #retry after 10 secs delay
        try:
            # Create a WebDriver instance
            driver = webdriver.Chrome()
            # Open the contributors page
            driver.get(github_repo_contributors_page)
            time.sleep(20)  # Wait for 20 seconds to allow the page to load completely 
            html = lx.fromstring(driver.page_source)
            contributors_profiles = html.xpath('//a[contains(@class, "d-inline-block")]/@href')
        finally:
            # Close the WebDriver instance
            driver.quit()
        if(len(contributors_profiles) > 2):    
            contributors_profiles = contributors_profiles[2:] #removing first 2 data points as they are noisy
        else:
            return []
        
    return contributors_profiles

#### Attempt-3: Selenium with headless mode (Still slow)

In [106]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import requests
import lxml.html as lx
import time
from tqdm import tqdm

#function that takes a github_repo_url and returns the list of contributors of it
def contributors(github_repo_url):
    github_repo_contributors_page = f"{github_repo_url}/graphs/contributors"

    # Set Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Enable headless mode
    
    # Create a WebDriver instance
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(github_repo_contributors_page)
    
    contributors_profiles = []
    wait_time = 0
    while len(contributors_profiles) <= 2 and wait_time < 40:
        # Open the contributors page
        time.sleep(5)  # Wait for 5 seconds to allow the page to load completely
        wait_time = wait_time + 5
        html = lx.fromstring(driver.page_source)
        contributors_profiles = html.xpath('//a[contains(@class, "d-inline-block")]/@href')
    
    # Close the WebDriver instance
    driver.quit()    
    
    if(len(contributors_profiles) > 2):    
        contributors_profiles = contributors_profiles[2:] #removing first 2 data points as they are noisy
    else:
        return []
        
    return contributors_profiles


#### Attempt-4: Selenium with improving "network_conditions" (Still somewhat slower)

In [107]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import requests
import lxml.html as lx
import time
from tqdm import tqdm


#function that takes a github_repo_url and returns the list of contributors of it
def contributors(github_repo_url):
    github_repo_contributors_page = f"{github_repo_url}/graphs/contributors"

    # #Set Chrome options for headless mode
    # chrome_options = Options()
    # chrome_options.add_argument("--headless")  # Enable headless mode

    # Create ChromeOptions object
    chrome_options = webdriver.ChromeOptions()
    
    # Enable Chrome DevTools Protocol
    chrome_options.add_experimental_option('w3c', False)
    
    # Set network throttling conditions
    network_conditions = {
        "offline": False,  # Set to True to simulate offline conditions
        "latency": 100,    # Latency in milliseconds
        "download_throughput": 1024 * 1024,  # Download throughput in bytes/second
        "upload_throughput": 512 * 1024      # Upload throughput in bytes/second
    }
    
    # Add network conditions to ChromeOptions
    chrome_options.add_argument(f"--window-size=1920,1080 --remote-debugging-port=9222")
    your_chrome_user_data_dir = r"C:\Users\kotha\AppData\Local\Google\Chrome\User Data"
    chrome_options.add_argument(f"--user-data-dir={your_chrome_user_data_dir}")
    chrome_options.add_argument("--start-maximized")
    
    # Use CDP to set network conditions
    chrome_options.add_argument("--remote-debugging-port=9222")  # Set the port for Chrome DevTools Protocol
    chrome_options.add_argument("--disable-background-timer-throttling")
    chrome_options.add_argument("--disable-backgrounding-occluded-windows")
    chrome_options.add_argument("--disable-renderer-backgrounding")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    
    # Create a ChromeDriver instance with the modified ChromeOptions
    driver = webdriver.Chrome(options=chrome_options)

    driver.get(github_repo_contributors_page)
    
    contributors_profiles = []
    wait_time = 0
    while len(contributors_profiles) <= 2 and wait_time < 40:
        # Open the contributors page
        time.sleep(5)  # Wait for 5 seconds to allow the page to load completely
        wait_time = wait_time + 5
        html = lx.fromstring(driver.page_source)
        contributors_profiles = html.xpath('//a[contains(@class, "d-inline-block")]/@href')
    
    # Close the WebDriver instance
    driver.quit()    
    
    if(len(contributors_profiles) > 2):    
        contributors_profiles = contributors_profiles[2:] #removing first 2 data points as they are noisy
    else:
        return []
        
    return contributors_profiles


#### Attempt-5: Selenium (creating 'Driver' only once) with diff page-load-strategy & improved "network_conditions" (Worked!)

In [120]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import requests
import lxml.html as lx
import time
from tqdm import tqdm

# Create ChromeOptions and set the page load strategy
chrome_options = Options()
chrome_options.add_argument("--page-load-strategy=normal")

# Initialize the WebDriver with ChromeOptions
driver = webdriver.Chrome(options=chrome_options)

# Enable Chrome DevTools Protocol
devtools_address = driver.command_executor._url
session_id = driver.session_id

# Set network conditions using Chrome DevTools Protocol
devtools_session = driver.execute_cdp_cmd('Network.enable', {})
driver.execute_cdp_cmd(
    'Network.emulateNetworkConditions',
    {
        'offline': False,
        'latency': 100,  # Latency in milliseconds
        'downloadThroughput': 1024 * 1024,  # Download throughput in bytes/second
        'uploadThroughput': 512 * 1024  # Upload throughput in bytes/second
    }
)

#function that takes a github_repo_url and returns the list of contributors of it
def contributors(github_repo_url):
    github_repo_contributors_page = f"{github_repo_url}/graphs/contributors"

    driver.get(github_repo_contributors_page)
    
    contributors_profiles = []
    wait_time = 0
    while len(contributors_profiles) <= 2 and wait_time < 60:
        # Open the contributors page
        time.sleep(5)  # Wait for 5 seconds to allow the page to load completely
        wait_time = wait_time + 5

        html = lx.fromstring(driver.page_source)
        contributors_profiles = html.xpath('//a[contains(@class, "d-inline-block")]/@href')
    
    # # Close the WebDriver instance
    # driver.quit()    
    
    if(len(contributors_profiles) > 2):    
        contributors_profiles = contributors_profiles[2:] #removing first 2 data points as they are noisy
    else:
        return []
        
    return contributors_profiles


In [112]:
#Sample Testcase
contributors("https://github.com/tue-mdse/genderComputer")

['/bvasiles',
 '/aserebrenik',
 '/brianczapiga',
 '/10b14224cc',
 '/michaelmior',
 '/scribblemaniac',
 '/NimmiW',
 '/ArnoVanLumig',
 '/mazieres']

### 4.2 Fetching all the Contributors fullnames & locations of a github repo from each contributors' profile page using Webscrapping

In [None]:
import requests
import lxml.html as lx

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import requests
import lxml.html as lx
import time
from tqdm import tqdm

# Create ChromeOptions and set the page load strategy
chrome_options = Options()
chrome_options.add_argument("--page-load-strategy=normal")

# Initialize the WebDriver with ChromeOptions
driver = webdriver.Chrome(options=chrome_options)

# Enable Chrome DevTools Protocol
devtools_address = driver.command_executor._url
session_id = driver.session_id

# Set network conditions using Chrome DevTools Protocol
devtools_session = driver.execute_cdp_cmd('Network.enable', {})
driver.execute_cdp_cmd(
    'Network.emulateNetworkConditions',
    {
        'offline': False,
        'latency': 100,  # Latency in milliseconds
        'downloadThroughput': 1024 * 1024,  # Download throughput in bytes/second
        'uploadThroughput': 512 * 1024  # Upload throughput in bytes/second
    }
)

#function that takes a github_repo_url and returns the list of fullnames and their respective location of the contributors of it

def Contributors_fullnames_locations(github_repo_url, access_token = "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"):
    Contributors_fullnames_locations = {}
    contributors_profiles = contributors(github_repo_url)
    for contributors_profile in (contributors_profiles):
        # removing [Bot] profiles (Example: removing "https://github.com/apps/github-actions(github-actions[bot])" from 
        #"https://github.com/chakra-ui/chakra-ui/graphs/contributors")
        # if("[bot]" in contributors_profiles):
        #     continue
        if(contributors_profile.startswith("/apps/")): 
            continue
        else:    
            contributors_profile_page = f"https://github.com{contributors_profile}" #From relative URL ('/brianczapiga') to absolute URL
            result = requests.get(contributors_profile_page)        
            html = lx.fromstring(result.text)
            
            contributor_fullname = html.xpath('//span[contains(@class, "fullname")]/text()')
            if(contributor_fullname):
                contributor_fullname = contributor_fullname[0].strip()
            else:
                contributor_fullname = ""
                
            contributor_location = html.xpath('//svg[contains(@class, "octicon-location")]/following-sibling::span[1]/text()')
            if contributor_location:
                contributor_location = contributor_location[0].strip()
            else:
                contributor_location = ""
            Contributors_fullnames_locations[contributors_profile[1:]] = [contributor_fullname, contributor_location]
    return Contributors_fullnames_locations    

In [127]:
#Sample Testcase
Contributors_fullnames_locations("https://github.com/tue-mdse/genderComputer")

{'bvasiles': ['Bogdan Vasilescu', 'Pittsburgh, PA'],
 'aserebrenik': ['Alexander Serebrenik', 'Netherlands'],
 'brianczapiga': ['Brian Czapiga', ''],
 '10b14224cc': ['', ''],
 'michaelmior': ['Michael Mior', 'Rochester, New York'],
 'scribblemaniac': ['', ''],
 'NimmiW': ['Nimmi Rashinika', 'Waterloo, Canada'],
 'ArnoVanLumig': ['Arno van Lumig', 'Nijmegen'],
 'mazieres': ['Antoine Mazières', 'Land']}

### 4.3 Fetching all the Contributors fullnames & locations of a github repo from each contributors' profile page using Webscrapping, and then pass this unstructured location string to "Nominatim" API, to fecth the country name & country ISO code of each of the contributor

In [136]:
import requests
import json
import pycountry # pip install pycountry
import lxml.html as lx
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
from tqdm import tqdm

# Create ChromeOptions and set the page load strategy
chrome_options = Options()
chrome_options.add_argument("--page-load-strategy=normal")

# Initialize the WebDriver with ChromeOptions
driver = webdriver.Chrome(options=chrome_options)

# Enable Chrome DevTools Protocol
devtools_address = driver.command_executor._url
session_id = driver.session_id

# Set network conditions using Chrome DevTools Protocol
devtools_session = driver.execute_cdp_cmd('Network.enable', {})
driver.execute_cdp_cmd(
    'Network.emulateNetworkConditions',
    {
        'offline': False,
        'latency': 100,  # Latency in milliseconds
        'downloadThroughput': 1024 * 1024,  # Download throughput in bytes/second
        'uploadThroughput': 512 * 1024  # Upload throughput in bytes/second
    }
)

#function that takes a country's ISO A2 Code (Example: IN) and returns the respective ISO A3 Code (Example: IND)
def ISO_A2_to_A3(A2_code):
    try:
        country = pycountry.countries.get(alpha_2=A2_code)
        if country:
            return country.alpha_3
        else:
            return ""
    except LookupError:
        return ""

"""
function that takes a github_repo_url and returns the list of fullnames and their respective countries (Country name & ISO Code) 
of the contributors of it
"""

def Contributors_fullnames_countries(github_repo_url):
    Contributors_fullnames_countries = {}
    contributors_fullnames_locations = Contributors_fullnames_locations(github_repo_url)
    for contributor in (contributors_fullnames_locations):
        contributor_fullname = contributors_fullnames_locations[contributor][0] #full name at index [0]
        contributor_location = contributors_fullnames_locations[contributor][1] #location at index [1]
        if contributor_location: #location is not empty
            api_url = f"https://nominatim.openstreetmap.org/search.php?q={contributor_location}&format=jsonv2&addressdetails=1&limit=1&accept-language=en"
            # Make a request to the API
            response = requests.get(api_url)
            # Parse the JSON response
            data = json.loads(response.text)
            # Extract the 'address' from the JSON response 
            # 'address' contains country_code name, need to fecth that 
            # Example: "address":{"city_district":"Nijmegen",...,"country":"Netherlands","country_code":"nl"}
            if data:
                address = data[0]['address']
                try:
                    country = address['country'] #'nl'
                    country_code = address['country_code'] #'nl'
                    country_code = country_code.upper() #'NL'
                    country_code = ISO_A2_to_A3(country_code) #from 'NL' to 'NLD'
                except:
                    country = ""
                    country_code = ""
            else:
                country = ""
                country_code = ""
        else:
            country = ""
            country_code = ""
        Contributors_fullnames_countries[contributor] = [contributor_fullname, [country,country_code]]
    return Contributors_fullnames_countries
                

In [137]:
#Sample Testcase
Contributors_fullnames_countries("https://github.com/tue-mdse/genderComputer")

{'bvasiles': ['Bogdan Vasilescu', ['United States', 'USA']],
 'aserebrenik': ['Alexander Serebrenik', ['Netherlands', 'NLD']],
 'brianczapiga': ['Brian Czapiga', ['', '']],
 '10b14224cc': ['', ['', '']],
 'michaelmior': ['Michael Mior', ['United States', 'USA']],
 'scribblemaniac': ['', ['', '']],
 'NimmiW': ['Nimmi Rashinika', ['Canada', 'CAN']],
 'ArnoVanLumig': ['Arno van Lumig', ['Netherlands', 'NLD']],
 'mazieres': ['Antoine Mazières', ['Pakistan', 'PAK']]}

### 4.4 Predicting the gender of every contributor, and forming the contributor dataset 

In [138]:
from genderComputer.genderComputer import GenderComputer #install from "https://github.com/tue-mdse/genderComputer"
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import requests
import lxml.html as lx
import time
from tqdm import tqdm

# Create ChromeOptions and set the page load strategy
chrome_options = Options()
chrome_options.add_argument("--page-load-strategy=normal")

# Initialize the WebDriver with ChromeOptions
driver = webdriver.Chrome(options=chrome_options)

# Enable Chrome DevTools Protocol
devtools_address = driver.command_executor._url
session_id = driver.session_id

# Set network conditions using Chrome DevTools Protocol
devtools_session = driver.execute_cdp_cmd('Network.enable', {})
driver.execute_cdp_cmd(
    'Network.emulateNetworkConditions',
    {
        'offline': False,
        'latency': 100,  # Latency in milliseconds
        'downloadThroughput': 1024 * 1024,  # Download throughput in bytes/second
        'uploadThroughput': 512 * 1024  # Upload throughput in bytes/second
    }
)

gc = GenderComputer()

# Function that takes a github_repo_url and package_name and returns the predicted genders of the contributors
def Contributors_genders(github_repo_url, package_name):
    contributors_fullnames_countries = Contributors_fullnames_countries(github_repo_url)
    data = []

    for contributor in contributors_fullnames_countries:
        contributor_fullname = contributors_fullnames_countries[contributor][0]  # full name at index [0]
        contributor_country_details = contributors_fullnames_countries[contributor][1]  # country_details at index [1]
        contributor_country_name = contributor_country_details[0]  # Country name at index [0]
        contributor_country_ISO_code = contributor_country_details[1]  # Country code at index [1]

        if contributor_fullname:  # perform gender prediction only if contributor fullname is found on their profile
            contributor_predicted_gender = gc.resolveGender(contributor_fullname, contributor_country_name)
            if contributor_predicted_gender:  # if some gender is predicted (not 'None')
                data.append({
                    'contributor': contributor,
                    'package_name': package_name,
                    'github_repo_url': github_repo_url,
                    'contributor_fullname': contributor_fullname,
                    'contributor_country_name': contributor_country_name,
                    'contributor_country_ISO_code': contributor_country_ISO_code,
                    'contributor_predicted_gender': contributor_predicted_gender
                })

    # Create DataFrame from collected data
    df = pd.DataFrame(data)
    return df


Loaded dictionary from C:\Users\kotha\OneDrive\Desktop\DAs\UC Davis\Courses\Winter 2024\STA220\STA220 Project\genderComputer\..\nameLists\gender.dict
Finished initialization


In [142]:
# Read the package details from package_details.csv
package_details_df = pd.read_csv("package_details_with_popular_and_vulnerability.csv")

# Initialize an empty DataFrame to store contributors' info
final_df = pd.DataFrame()

# Iterate over each row in package_details_df
for index, row in tqdm(package_details_df.iterrows()):
    # Extract package_name and github_repo_url from the current row
    package_name = row['package_name']
    github_repo_url = row['git_repository']
    
    # Execute the Contributors_genders function and append the results to final_df
    final_df = pd.concat([final_df, Contributors_genders(github_repo_url, package_name)])

# Reset the index of the final DataFrame
final_df.reset_index(drop=True, inplace=True)


In [143]:
# Write the DataFrame to a CSV file
final_df.to_csv('all_contributors_info.csv', index=False)

print("Contributors info of the 8k packages is fetched and stored in a CSV successfully!")

Contributors info of the 8k packages is fetched and stored in a CSV successfully!


In [144]:
# Read the CSV file
df = pd.read_csv("all_contributors_info.csv")
df

Unnamed: 0,contributor,package_name,github_repo_url,contributor_fullname,contributor_country_name,contributor_country_ISO_code,contributor_predicted_gender
0,jonschlinkert,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,Jon Schlinkert,United States,USA,male
1,doowb,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,Brian Woodward,United States,USA,male
2,roderickhsiao,is-equal-shallow,https://github.com/jonschlinkert/is-equal-shallow,Roderick Hsiao,,,male
3,elchinzadeh,rexponse,https://github.com/elchinzadeh/rexponse,Elchin Zakizadeh,Azerbaijan,AZE,male
4,pdubroy,eslint-plugin-no-extension-in-require,https://github.com/pdubroy/eslint-plugin-no-ex...,Patrick Dubroy,Germany,DEU,male
...,...,...,...,...,...,...,...
31758,mauro-oto,nitro-sg,https://github.com/powerhome/nitro-storybook,Mauro Otonelli,Argentina,ARG,male
31759,rruiz85,nitro-sg,https://github.com/powerhome/nitro-storybook,Ramon Ruiz,,,male
31760,web-kat,nitro-sg,https://github.com/powerhome/nitro-storybook,Katie Edgar,United States,USA,female
31761,nickwohnhas,nitro-sg,https://github.com/powerhome/nitro-storybook,Nick Wohnhas,United States,USA,male


#### Hence, the dataset containing the required attributes of each and every contributor of the 8k packages is formed and ready for the analysis now!

#### Two datasets formed are:
##### (1) package_details_with_popular_and_vulnerability.csv
##### (2) all_contributors_info.csv

#### As now both the datasets are ready, we can move forward to the analysis part of our study!

## Acknowledgment:

#### Both of the teammates, V V S Aakash Kotha & Nikita Bhrugumaharshi Emberi, contributed equally to the development of this project. While most of the work was done independently, we utilized Language Models, specifically ChatGPT[1], whenever necessary to confirm details and enhance our understanding. We extend our sincere thanks to it for its valuable assistance throughout the project. Additionally, we would like to thank our prof. Peter Kramlinger & TA. Sophia Sun for their guidance and support. Their insights and feedback were invaluable in shaping the direction of our work.

#### [1] ChatGPT (https://chat.openai.com/)