In [1]:
import pandas as pd
import requests
import csv
from urllib.parse import urlparse
import random
from datetime import datetime, timedelta


def change_timestamp(timestamp_string):
    """Convert an ISO-8601 UTC timestamp string to a ``YYYY-MM-DD`` date string.

    Accepts timestamps with or without fractional seconds, e.g.
    ``2020-01-02T03:04:05.678Z`` or ``2020-01-02T03:04:05Z``.

    Args:
        timestamp_string: The timestamp text, or ``None``/empty when the
            source record carries no timestamp.

    Returns:
        The date part formatted as ``YYYY-MM-DD``, or ``None`` when
        ``timestamp_string`` is ``None`` or empty.

    Raises:
        ValueError: If the string matches neither supported format.
    """
    # A missing timestamp is passed in as None by callers that use
    # dict.get(..., None); treat it as "no date" rather than crashing.
    if not timestamp_string:
        return None

    for timestamp_format in ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ'):
        try:
            timestamp = datetime.strptime(timestamp_string, timestamp_format)
        except ValueError:
            continue
        return timestamp.strftime('%Y-%m-%d')

    raise ValueError(f'Unrecognised timestamp format: {timestamp_string!r}')

# Load the package names (first column) from the sampled npm package list.
# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for files passed to csv.reader.
with open('/kaggle/input/random-70k-npm/Dataset/random_packages_70k.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)

    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)

    # csv.reader yields an empty list for a blank line, so drop those rows
    # before indexing the first column.
    package_names = [row[0] for row in reader if row]

# Only this slice of the full list is processed in this run.
package_names = package_names[45000:50000]
print(len(package_names))

# Collect registry metadata and download counts for every package in
# `package_names`, keep the ones that resolve to a GitHub repository, and
# write the result to 'npmjs_batch6.csv'.

# Seconds to wait on any single HTTP call, so one stalled connection cannot
# hang the whole run.
_REQUEST_TIMEOUT_SECONDS = 30

# URL prefixes that are recognised as GitHub repository references.
_GITHUB_URL_PREFIXES = ('git+', 'ssh://git@github.com', 'https://github.com', 'http://github.com', 'git://github.com', 'git@personal.github.com', 'git@github.com', 'github.com', '@personal.github.com')

# (needle, replacement) pairs used to rewrite a recognised URL towards its
# https://github.com form. Order matters: only the first matching pair is
# applied.
_GITHUB_URL_REWRITES = (
    ('git+ssh://git@github.com', 'https://github.com'),
    ('ssh://git@github.com', 'https://github.com'),
    ('git://github.com', 'https://github.com'),
    ('git@personal.github.com:', 'https://github.com/'),
    ('git@github.com:', 'https://github.com/'),
    ('@personal.github.com:', 'https://github.com/'),
)


def _clean_github_url(repository_url):
    """Return a cleaned GitHub URL for `repository_url`, or None if it is not one.

    A URL that mentions github.com but does not start with a recognised
    prefix is reported with a 'skipped' message and then cleaned as-is.
    """
    if not repository_url or 'github.com' not in repository_url:
        return None

    if repository_url.startswith(_GITHUB_URL_PREFIXES):
        for needle, replacement in _GITHUB_URL_REWRITES:
            if needle in repository_url:
                repository_url = repository_url.replace(needle, replacement)
                break
        if repository_url.endswith('.git'):
            repository_url = repository_url[:-4]
    else:
        print(f'skipped: {repository_url}')

    parsed_url = urlparse(repository_url)
    if parsed_url.scheme == 'git':
        return parsed_url.netloc + parsed_url.path
    # Remove only a literal 'git+' prefix. str.lstrip('git+') would strip any
    # leading run of the characters g/i/t/+ and turn 'github.com/...' into
    # 'hub.com/...'.
    if repository_url.startswith('git+'):
        return repository_url[len('git+'):]
    return repository_url


data_rows = []
cnt = 0

# The download window and request headers are the same for every package,
# so build them once instead of on every iteration.
today = datetime.utcnow().date()
three_months_ago = today - timedelta(days=91)
yesterday = today - timedelta(days=1)
date_range = f"{three_months_ago.strftime('%Y-%m-%d')}:{yesterday.strftime('%Y-%m-%d')}"
print(date_range)

headers = {'Accept': 'application/vnd.npm.install-v1+json'}

for package_name in package_names:
    try:
        url = f'https://registry.npmjs.org/{package_name}'

        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for non-200 status codes

        package_data = response.json()

        # Extracting required fields from metadata
        description = package_data.get('description', None)
        maintainers_length = len(package_data.get('maintainers', []))
        readmeFilename_exists = 'readmeFilename' in package_data
        created = package_data.get('time', {}).get('created', None)
        created_date = change_timestamp(created)
        modified = package_data.get('time', {}).get('modified', None)
        modified_date = change_timestamp(modified)
        # NOTE(review): this takes the last entry of 'versions', which is
        # assumed to be the newest release — confirm against 'dist-tags'.
        last_version_data = list(package_data.get('versions', {}).values())[-1] if package_data.get('versions') else None

        # Extracting version and git URL from last version data if available
        latest_version = None
        cleaned_repository_url = None
        if isinstance(last_version_data, dict):
            latest_version = last_version_data.get('version', None)
            repository_data = last_version_data.get('repository')
            if isinstance(repository_data, dict):
                cleaned_repository_url = _clean_github_url(repository_data.get('url', None))

        # Only packages with a GitHub repository are recorded, so skip the
        # remaining three HTTP calls for everything else.
        if not cleaned_repository_url:
            continue

        # Fetching the abbreviated response
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for non-200 status codes
        abbreviated_package_data = response.json()

        # Extracting dependencies, devDependencies, and deprecated from abbreviated response
        last_version_data = list(abbreviated_package_data.get('versions', {}).values())[-1] if abbreviated_package_data.get('versions') else None
        # `or {}` also covers a key that is present but null.
        dependencies = (last_version_data.get('dependencies') or {}) if last_version_data else {}
        dependencies_name = list(dependencies.keys())
        dependencies_count = len(dependencies)
        dev_dependencies = (last_version_data.get('devDependencies') or {}) if last_version_data else {}
        dev_dependencies_name = list(dev_dependencies.keys())
        dev_dependencies_count = len(dev_dependencies)

        # Extracting deprecated status
        if last_version_data:
            deprecated = last_version_data.get('deprecated')
            if isinstance(deprecated, str):
                deprecated = deprecated.strip()  # Remove leading and trailing whitespace
                if deprecated == '' or deprecated.lower() == 'false':
                    deprecated = None  # Treat empty string or 'false' as not deprecated
        else:
            deprecated = None
        deprecated_status = bool(deprecated)

        # Fetching download stats; check the status first so a failed lookup
        # is reported as an HTTP error rather than a KeyError on 'downloads'.
        last_month_response = requests.get(
            f'https://api.npmjs.org/downloads/range/last-month/{package_name}',
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        last_month_response.raise_for_status()
        download_stats_last_month = last_month_response.json()

        last_3_month_response = requests.get(
            f'https://api.npmjs.org/downloads/range/{date_range}/{package_name}',
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        last_3_month_response.raise_for_status()
        download_stats_last_3_month = last_3_month_response.json()

        last_month_downloads = sum(day['downloads'] for day in download_stats_last_month['downloads'])
        last_3_month_downloads = sum(day['downloads'] for day in download_stats_last_3_month['downloads'])

        data_row = {
            'package_name': package_name,
            'description': description,
            'package_maintainers': maintainers_length,
            'readmeFilename_exists': readmeFilename_exists,
            'package_created': created_date,
            'package_modified': modified_date,
            'latest_version': latest_version,
            'github_repository': cleaned_repository_url,
            'dependencies_list': dependencies_name,
            'dependencies_count': dependencies_count,
            'devdependencies_list': dev_dependencies_name,
            'devdependencies_count': dev_dependencies_count,
            'deprecated': deprecated_status,
            'last_month_downloads': last_month_downloads,
            'last_3_month_downloads': last_3_month_downloads,
        }
        data_rows.append(data_row)
        print(cnt)
        cnt += 1

    except Exception as e:
        # Best-effort scrape: report the failure and move on to the next package.
        print(f"Error fetching data for {package_name}: {e}")

# Create a DataFrame from the list of rows
data = pd.DataFrame(data_rows)

# Save the DataFrame to a new CSV file
data.to_csv('npmjs_batch6.csv', index=False, escapechar='\\')

50
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
