In [2]:
import pandas as pd
import requests
import csv
from urllib.parse import urlparse

# Read the package names from the first column of the CSV file.
# newline='' is what the csv module expects from the file object so that
# quoted fields containing line breaks are parsed correctly.
with open('/Users/tahers/Documents/SE_NPM_packages/npm_package_names.csv', 'r',
          newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)

    # The first row is always treated as a header and discarded; the default
    # of None keeps an empty file from raising StopIteration.
    next(reader, None)

    # Blank lines come back from csv.reader as empty lists; skip them (and
    # rows whose first cell is empty) instead of failing on row[0].
    package_names = [row[0] for row in reader if row and row[0].strip()]

# Only this slice of the full list is processed in the current run
package_names = package_names[1000:1200]

# Column order of every output row. Also used to build the placeholder row
# for a package whose lookup fails, so that the error path never depends on
# a previous iteration having succeeded.
ROW_COLUMNS = (
    'package_name',
    'description',
    'maintainers_length',
    'readmeFilename_exists',
    'created',
    'modified',
    'latest_version',
    'latest_version_git_repo',
    'dependencies_name',
    'dependencies_count',
    'dev_dependencies_name',
    'dev_dependencies_count',
    'deprecated',
    'last_day_downloads',
    'last_week_downloads',
    'last_month_downloads',
    'last_3_month_downloads',
)

# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection blocks the whole run.
REQUEST_TIMEOUT = 30

# Fixed date window used for the "last 3 months" download total
THREE_MONTH_RANGE = '2023-11-17:2024-02-16'

# URL prefixes that qualify a github.com repository URL for normalisation
GITHUB_URL_PREFIXES = (
    'git+',
    'ssh://git@github.com',
    'https://github.com',
    'git://github.com',
)


def _fetch_json(url, headers=None):
    """Return the decoded JSON body of a GET request to *url*.

    Raises a requests exception on timeout, connection failure, or an
    HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()


def _last_version(metadata):
    """Return the last entry of metadata['versions'], or None if there is none.

    NOTE(review): "last" is the final value in the mapping as returned by the
    registry; confirm this is the version you want rather than the one named
    by the 'latest' dist-tag.
    """
    versions = metadata.get('versions')
    return list(versions.values())[-1] if versions else None


def _clean_repository_url(repository_url):
    """Normalise a repository URL taken from package metadata.

    GitHub URLs in ssh:// or git:// form are rewritten to https:// and lose a
    trailing '.git'. Any other URL is reported as skipped but still cleaned:
    a git:// URL is reduced to host + path, and a leading 'git+' is removed.
    Returns None when the URL is empty or missing.
    """
    if (repository_url and 'github.com' in repository_url
            and repository_url.startswith(GITHUB_URL_PREFIXES)):
        # Order matters: the 'git+ssh://' form must be replaced before the
        # plain 'ssh://' form it contains.
        for old_prefix in ('git+ssh://git@github.com',
                           'ssh://git@github.com',
                           'git://github.com'):
            repository_url = repository_url.replace(old_prefix, 'https://github.com')
        if repository_url.endswith('.git'):
            repository_url = repository_url[:-4]
    else:
        print(f'skipped: {repository_url}')

    if not repository_url:
        return None

    parsed_url = urlparse(repository_url)
    if parsed_url.scheme == 'git':
        return parsed_url.netloc + parsed_url.path
    # Remove the literal 'git+' prefix only. str.lstrip('git+') would strip
    # any leading run of the characters g, i, t and +, mangling URLs such as
    # 'git@github.com:user/repo'.
    if repository_url.startswith('git+'):
        return repository_url[len('git+'):]
    return repository_url


def _range_downloads(period, package_name):
    """Return the summed daily downloads of *package_name* over *period*."""
    stats = _fetch_json(f'https://api.npmjs.org/downloads/range/{period}/{package_name}')
    return sum(day['downloads'] for day in stats['downloads'])


# Create an empty list to store the retrieved data
data_rows = []

# Iterate through each package name and fetch data
for package_name in package_names:
    try:
        url = f'https://registry.npmjs.org/{package_name}'
        headers = {'Accept': 'application/vnd.npm.install-v1+json'}

        # Fetching the full metadata response
        package_data = _fetch_json(url)

        # Extracting required fields from metadata
        description = package_data.get('description', None)
        maintainers_length = len(package_data.get('maintainers', []))
        readmeFilename_exists = 'readmeFilename' in package_data
        created = package_data.get('time', {}).get('created', None)
        modified = package_data.get('time', {}).get('modified', None)
        last_version_data = _last_version(package_data)

        # Extracting version and git URL from last version data if available
        latest_version = None
        cleaned_repository_url = None
        if isinstance(last_version_data, dict):
            latest_version = last_version_data.get('version', None)
            # 'repository' is only used when it is a dictionary
            repository_data = last_version_data.get('repository')
            if isinstance(repository_data, dict):
                cleaned_repository_url = _clean_repository_url(repository_data.get('url', None))

        # Fetching the abbreviated response
        abbreviated_package_data = _fetch_json(url, headers=headers)

        # Extracting dependencies, devDependencies, and deprecated from abbreviated response
        last_version_data = _last_version(abbreviated_package_data)
        dependencies = last_version_data.get('dependencies', {}) if last_version_data else {}
        dependencies_name = list(dependencies.keys())
        dependencies_count = len(dependencies)

        dev_dependencies = last_version_data.get('devDependencies', {}) if last_version_data else {}
        dev_dependencies_name = list(dev_dependencies.keys())
        dev_dependencies_count = len(dev_dependencies)

        # Extracting deprecated status
        deprecated = last_version_data.get('deprecated') if last_version_data else None
        if isinstance(deprecated, str):
            deprecated = deprecated.strip()
            # Treat empty string or 'false' as not deprecated
            if deprecated == '' or deprecated.lower() == 'false':
                deprecated = None
        deprecated_status = 'Yes' if deprecated else 'No'

        # Fetching download stats; HTTP errors are raised by _fetch_json
        # rather than surfacing later as a missing 'downloads' key.
        download_stats_last_day = _fetch_json(
            f'https://api.npmjs.org/downloads/point/last-day/{package_name}')
        last_day_downloads = download_stats_last_day['downloads']
        last_week_downloads = _range_downloads('last-week', package_name)
        last_month_downloads = _range_downloads('last-month', package_name)
        last_3_month_downloads = _range_downloads(THREE_MONTH_RANGE, package_name)

        # Create a row for the DataFrame
        data_row = {
            'package_name': package_name,
            'description': description,
            'maintainers_length': maintainers_length,
            'readmeFilename_exists': readmeFilename_exists,
            'created': created,
            'modified': modified,
            'latest_version': latest_version,
            'latest_version_git_repo': cleaned_repository_url,
            'dependencies_name': dependencies_name,
            'dependencies_count': dependencies_count,
            'dev_dependencies_name': dev_dependencies_name,
            'dev_dependencies_count': dev_dependencies_count,
            'deprecated': deprecated_status,
            'last_day_downloads': last_day_downloads,
            'last_week_downloads': last_week_downloads,
            'last_month_downloads': last_month_downloads,
            'last_3_month_downloads': last_3_month_downloads,
        }

        # Append the row to the list
        data_rows.append(data_row)

    except Exception as e:
        # Deliberately broad: one bad package must not stop the batch.
        print(f"Error fetching data for {package_name}: {e}")
        # Create a row with all values as 'NA', built from the fixed column
        # list so it works even when the very first package fails.
        error_row = dict.fromkeys(ROW_COLUMNS, 'NA')
        error_row['package_name'] = package_name
        data_rows.append(error_row)

# Build the output table: one DataFrame row per collected dict
data = pd.DataFrame(data=data_rows)

# Write the table to npmjs_data.csv, leaving out the index column
data.to_csv(path_or_buf='npmjs_data.csv', index=False)

skipped: https://gitee.com/codingdream123123


KeyboardInterrupt: 