In [None]:
import csv
import json
import shutil
import subprocess
import tempfile
import time
from collections import defaultdict
from datetime import datetime, timedelta

import lxml.html as lx
import requests
from pydriller import Repository
from pydriller.metrics.process.lines_count import LinesCount

def pull_request_frequency(repo_link):
    """Count open and merged pull requests through the GitHub REST API.

    Args:
        repo_link: Repository URL whose last two path segments are the
            owner and the repository name.

    Returns:
        A ``(open_prs_count, merged_prs_count)`` tuple.

    Raises:
        requests.HTTPError: If either API call returns an error status.

    NOTE(review): neither request paginates, so each count is capped at one
    page of API results — confirm whether that is acceptable.
    """
    # A trailing slash would otherwise make the repository name empty.
    username, repo_name = repo_link.rstrip('/').split('/')[-2:]
    pulls_url = f'https://api.github.com/repos/{username}/{repo_name}/pulls'

    open_prs_response = requests.get(f'{pulls_url}?state=open')
    # An error payload (e.g. rate limiting) is a JSON object; without this
    # check len() would silently count its keys as pull requests.
    open_prs_response.raise_for_status()
    open_prs_count = len(open_prs_response.json())

    closed_prs_response = requests.get(
        f'{pulls_url}?state=closed&sort=updated&direction=desc'
    )
    closed_prs_response.raise_for_status()
    # "closed" also contains rejected pull requests; only entries carrying a
    # merge timestamp were actually merged.
    merged_prs_count = sum(
        1 for pr in closed_prs_response.json() if pr.get('merged_at')
    )

    return open_prs_count, merged_prs_count

def fetch_forks_stars(url):
    """Scrape the fork and star counters from a repository web page.

    Args:
        url: URL of the repository page to download.

    Returns:
        A ``(forks, stars)`` tuple of integers.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If a counter element is missing or cannot be parsed.
    """
    response = requests.get(url)
    response.raise_for_status()
    html = lx.fromstring(response.text)

    def read_counter(element_id):
        """Return the integer held by the element with the given id."""
        matches = html.xpath(f'//*[@id="{element_id}"]')
        if not matches:
            raise ValueError(f"counter element '{element_id}' not found on {url}")
        element = matches[0]
        # NOTE(review): the page appears to abbreviate large counts in the
        # element text (e.g. "1.2k") and keep the exact figure in the
        # ``title`` attribute — confirm against the live markup.
        raw = (element.get('title') or element.text or '').strip()
        # Thousands separators are stripped the same way the issue scrapers
        # in this file do.
        raw = raw.replace(',', '')
        multipliers = {'k': 1000, 'm': 1000000}
        suffix = raw[-1:].lower()
        if suffix in multipliers:
            # round() avoids float truncation such as 2.3 * 1000 -> 2299.
            return round(float(raw[:-1]) * multipliers[suffix])
        return int(raw)

    forks = read_counter('repo-network-counter')
    stars = read_counter('repo-stars-counter-star')
    return forks, stars

def issues_pending(git_repo_url):
    """Scrape the number of open issues from a repository's web pages.

    Args:
        git_repo_url: URL of the repository page.

    Returns:
        The open-issue count, or 0 when the page has no issues tab or the
        open-issues link (or its text) cannot be found.

    Raises:
        requests.HTTPError: If either page request returns an error status.
    """
    repo_page = requests.get(git_repo_url)
    # Fail loudly on HTTP errors, as fetch_forks_stars does, instead of
    # parsing an error page and reporting a misleading 0.
    repo_page.raise_for_status()
    if not lx.fromstring(repo_page.text).xpath('//a[@id="issues-tab"]'):
        return 0

    # Strip a trailing slash so the URL does not become ".../repo//issues".
    issues_page = requests.get(git_repo_url.rstrip('/') + "/issues")
    issues_page.raise_for_status()
    open_issues = lx.fromstring(issues_page.text).xpath(
        '//a[@data-ga-click="Issues, Table state, Open"]'
    )
    if not open_issues:
        return 0

    # The count is taken as the first whitespace-separated token of the link text.
    tokens = open_issues[0].text_content().split()
    if not tokens:
        return 0
    return int(tokens[0].replace(',', ''))
    
def issues_resolved(git_repo_url):
    """Scrape the number of closed issues from a repository's web pages.

    Args:
        git_repo_url: URL of the repository page.

    Returns:
        The closed-issue count, or 0 when the page has no issues tab or the
        closed-issues link (or its text) cannot be found.

    Raises:
        requests.HTTPError: If either page request returns an error status.
    """
    repo_page = requests.get(git_repo_url)
    # Fail loudly on HTTP errors, as fetch_forks_stars does, instead of
    # parsing an error page and reporting a misleading 0.
    repo_page.raise_for_status()
    if not lx.fromstring(repo_page.text).xpath('//a[@id="issues-tab"]'):
        return 0

    # Strip a trailing slash so the URL does not become ".../repo//issues".
    issues_page = requests.get(git_repo_url.rstrip('/') + "/issues")
    issues_page.raise_for_status()
    closed_issues = lx.fromstring(issues_page.text).xpath(
        '//a[@data-ga-click="Issues, Table state, Closed"]'
    )
    if not closed_issues:
        return 0

    # The count is taken as the first whitespace-separated token of the link text.
    tokens = closed_issues[0].text_content().split()
    if not tokens:
        return 0
    return int(tokens[0].replace(',', ''))
    
# def make_github_request(url):
#     while True:
#         token = ""  # Add your GitHub personal access token here
#         headers = {'Authorization': f'Bearer {token}'}
#         response = requests.get(url, headers=headers)
#         if response.status_code == 200:
#             return response
#         elif response.status_code == 403 and 'rate limit exceeded' in response.text.lower():
#             sleep_time = get_rate_limit_reset_time(response) + 5  # Adding 5 seconds to be safe
#             print(f"Rate limit exceeded. Waiting for {sleep_time} seconds before retrying.")
#             time.sleep(sleep_time)
#         else:
#             response.raise_for_status()

# def get_rate_limit_reset_time(response):
#     rate_limit_reset = int(response.headers.get('X-RateLimit-Reset', 0))
#     return max(rate_limit_reset - time.time(), 0)

def pr_freq(repo_link, total_days):
    """Return the number of pull requests per day of repository activity.

    Args:
        repo_link: Repository URL whose last two path segments are the
            owner and the repository name.
        total_days: Length of the observation window in days.

    Returns:
        Pull requests divided by ``total_days``; 0 when ``total_days`` is
        zero, negative or missing (no request is made in that case).

    Raises:
        requests.HTTPError: If the API call returns an error status.

    NOTE(review): the request does not paginate, so the count is capped at
    one page of API results — confirm whether that is acceptable.
    """
    # Guard first: there is nothing to divide by, so skip the API call too.
    if not total_days or total_days <= 0:
        return 0

    # A trailing slash would otherwise make the repository name empty.
    username, repo_name = repo_link.rstrip('/').split('/')[-2:]

    pr_url = f'https://api.github.com/repos/{username}/{repo_name}/pulls?state=all'
    response = requests.get(pr_url)
    # An error payload is a JSON object; without this check len() would
    # count its keys as pull requests.
    response.raise_for_status()

    total_pull_requests = len(response.json())
    return total_pull_requests / total_days

def release_frequency(github_repo_url):
    """Compute how often the repository publishes releases.

    The dates are the committer dates of the commits pydriller yields with
    ``only_releases=True``.

    Args:
        github_repo_url: Repository location passed to pydriller.

    Returns:
        A ``(per_day, per_month, per_year)`` tuple of release rates. The
        tuple is ``(0, 0, 0)`` when there are fewer than two releases or
        when all releases fall within the same day, because no meaningful
        time span exists to divide by.
    """
    release_dates = [
        commit.committer_date
        for commit in Repository(github_repo_url, only_releases=True).traverse_commits()
    ]

    if len(release_dates) < 2:
        return 0, 0, 0

    # min/max rather than first/last: the span must not depend on the order
    # in which commits are traversed.
    number_of_days = (max(release_dates) - min(release_dates)).days
    if number_of_days <= 0:
        # Every release landed within one day; dividing would raise
        # ZeroDivisionError.
        return 0, 0, 0

    number_of_releases = len(release_dates)
    release_frequency_days = number_of_releases / number_of_days
    # Months and years are approximated as 30 and 365 days respectively.
    release_frequency_months = number_of_releases / (number_of_days / 30)
    release_frequency_years = number_of_releases / (number_of_days / 365)

    return release_frequency_days, release_frequency_months, release_frequency_years


def get_loc(github_repo_url):
    """Estimate the JavaScript lines of code of a repository.

    For every file, lines added minus lines removed over the commit history
    are summed. Only ``.js`` files outside test/spec and vendor/build
    locations count.

    Args:
        github_repo_url: Repository location passed to pydriller.

    Returns:
        The net line count of the retained files, or 0 when the repository
        has no commits.
    """
    commit_dates = [
        commit.committer_date
        for commit in Repository(github_repo_url).traverse_commits()
    ]
    if not commit_dates:
        return 0

    lines_count = LinesCount(
        path_to_repo=github_repo_url,
        since=commit_dates[0],
        to=commit_dates[-1],
    )
    total_added = lines_count.count_added()
    total_deleted = lines_count.count_removed()

    excluded_dirs = {"node_modules", "public", "build", "test"}
    net_lines = 0
    for path, added in total_added.items():
        # The per-file mapping may hold a None path (the README check in
        # this file guards against it too); slicing None would raise
        # TypeError.
        if path is None or not path.endswith(".js"):
            continue
        # Substring match kept from the original filter. It already covers
        # "tests", ".test.js" and ".spec.js".
        # NOTE(review): it also drops names such as "latest.js" — confirm intent.
        if "test" in path or "spec" in path:
            continue
        # Normalise separators so the directory exclusions work on POSIX as
        # well as Windows paths.
        directories = path.replace("\\", "/").split("/")[:-1]
        if any(directory in excluded_dirs for directory in directories):
            continue
        net_lines += added - total_deleted.get(path, 0)

    return net_lines


def get_total_lines_added_deleted(github_repo_url):
    """Return total lines added and removed across the commit history.

    Args:
        github_repo_url: Repository location passed to pydriller.

    Returns:
        An ``(added, removed)`` tuple summed over all files, or ``(0, 0)``
        when the repository yields no commits.
    """
    dates = [
        commit.committer_date
        for commit in Repository(github_repo_url).traverse_commits()
    ]
    if not dates:
        return 0, 0

    # The metric window runs from the first to the last traversed commit.
    churn = LinesCount(path_to_repo=github_repo_url, since=dates[0], to=dates[-1])
    added_per_file = churn.count_added()
    removed_per_file = churn.count_removed()

    return sum(added_per_file.values()), sum(removed_per_file.values())


def get_lines_added_deleted_last_one_and_half_years(github_repo_url):
    """Return lines added and removed during the last 547 days.

    Args:
        github_repo_url: Repository location passed to pydriller.

    Returns:
        An ``(added, removed)`` tuple summed over all files in the window.
    """
    now = datetime.now()
    # 547 days approximates one and a half 365-day years.
    cutoff = now - timedelta(days=547)
    # Drop the time-of-day so the window starts at midnight of the cutoff date.
    window_start = datetime(cutoff.year, cutoff.month, cutoff.day)

    churn = LinesCount(path_to_repo=github_repo_url, since=window_start, to=now)
    added_per_file = churn.count_added()
    removed_per_file = churn.count_removed()

    return sum(added_per_file.values()), sum(removed_per_file.values())

def is_readme_updated_in_last_one_and_half_years(github_repo_url):
    """Tell whether ``README.md`` changed during the last 547 days.

    A path counts as the README when it equals ``readme.md`` ignoring case.

    Args:
        github_repo_url: Repository location passed to pydriller.

    Returns:
        True when at least one line of a matching file was added or removed
        in the window, False otherwise (including when no such file was
        touched).
    """
    current_date = datetime.now()
    # 547 days approximates one and a half 365-day years.
    cutoff = current_date - timedelta(days=547)
    # Drop the time-of-day so the window starts at midnight of the cutoff date.
    window_start = datetime(cutoff.year, cutoff.month, cutoff.day)

    lines_count = LinesCount(
        path_to_repo=github_repo_url, since=window_start, to=current_date
    )
    total_added = lines_count.count_added()
    total_deleted = lines_count.count_removed()

    # Sum over every matching path rather than building lower-cased dicts:
    # that way two paths differing only in case cannot overwrite each other,
    # and a missing README needs no exception handling.
    readme_churn = 0
    for per_file in (total_added, total_deleted):
        for path, changed_lines in per_file.items():
            if path is not None and path.lower() == 'readme.md':
                readme_churn += changed_lines

    return readme_churn != 0

def get_github_repo_sloc(github_url):
    """Clone a repository and return its JavaScript source-line count.

    The repository is cloned into a fresh temporary directory, measured with
    ``npx cloc --json`` and the clone is always removed afterwards.

    Args:
        github_url: Location handed to ``git clone``.

    Returns:
        The ``code`` figure cloc reports for JavaScript. It is 0 when cloning
        or cloc fails, a required executable is missing, the cloc output is
        not valid JSON, or no JavaScript was found.
    """
    # A unique directory avoids colliding with an existing "temp_repo".
    clone_dir = tempfile.mkdtemp(prefix="temp_repo_")
    try:
        # "--" stops git from treating an odd-looking URL as an option.
        subprocess.run(['git', 'clone', '--', github_url, clone_dir], check=True)

        result = subprocess.run(
            [
                'npx', 'cloc', clone_dir, '--json',
                # No space inside the list: a stray one would make cloc
                # look for a directory literally named " test".
                '--exclude-dir=node_modules,public,build,test',
                '--not-match-f=test|tests|\\.spec\\.js|\\.test\\.js|\\.spec\\.ts|\\.test\\.ts|spec',
            ],
            capture_output=True,
            text=True,
        )

        if result.returncode != 0:
            print("Error occurred while running cloc command:")
            print(result.stderr)
            return 0

        try:
            loc_data = json.loads(result.stdout)
        except json.JSONDecodeError:
            # cloc produced no parsable report; treat it as "nothing counted".
            return 0

        javascript = loc_data.get('JavaScript') if isinstance(loc_data, dict) else None
        if not isinstance(javascript, dict):
            return 0
        # The per-language entry is a dict of counters; "code" is the
        # source-line figure, matching the 0 returned on every other path.
        return javascript.get('code', 0)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing git/npx executable.
        print(f"Error: {e}")
        return 0
    finally:
        # Portable replacement for "rm -rf"; never masks the real outcome.
        shutil.rmtree(clone_dir, ignore_errors=True)


def analyze_repository(url):
    """Collect every repository metric produced by this script for one URL.

    Args:
        url: Repository URL handed to the scrapers, the API helpers and
            pydriller.

    Returns:
        A 21-tuple, in this order: total_commits, avg_commits_per_day,
        last_commit_date, unique_contributors, forks, stars, open_prs_count,
        merged_prs_count, open_issues, resolved_issues, pr_frequency,
        version_release_frequency_days, version_release_frequency_months,
        version_release_frequency_years, lines_of_codes,
        source_lines_of_code, total_added, total_deleted,
        lines_added_one_and_half_year, lines_deleted_one_and_half_year,
        readme_updated. When any step raises, the error is printed and a
        tuple of 21 ``None`` values is returned instead.
    """
    try:
        forks, stars = fetch_forks_stars(url)
        open_issues = issues_pending(url)
        resolved_issues = issues_resolved(url)

        commit_dates = []
        contributors = set()
        for commit in Repository(url).traverse_commits():
            commit_dates.append(commit.committer_date)
            contributors.add(commit.author.email)

        total_commits = len(commit_dates)
        avg_commits_per_day, last_commit_date, unique_contributors = 0, None, 0
        # Initialised up front: a repository without commits used to leave
        # total_days unbound, failing the whole analysis with an
        # UnboundLocalError further down.
        total_days = 0

        if total_commits > 0:
            first_commit_date = min(commit_dates)
            last_commit_date = max(commit_dates)
            unique_contributors = len(contributors)
            # +1 so a single-day history counts as one day.
            total_days = (last_commit_date - first_commit_date).days + 1
            avg_commits_per_day = total_commits / total_days if total_days > 0 else 0

        open_prs_count, merged_prs_count = pull_request_frequency(url)
        # Without an observation window there is no rate to compute.
        pr_frequency = round(pr_freq(url, total_days), 2) if total_days > 0 else 0
        (
            version_release_frequency_days,
            version_release_frequency_months,
            version_release_frequency_years,
        ) = release_frequency(url)
        lines_of_codes = get_loc(url)
        # source_lines_of_code = get_github_repo_sloc(url)
        source_lines_of_code = None
        total_added, total_deleted = get_total_lines_added_deleted(url)
        (
            lines_added_one_and_half_year,
            lines_deleted_one_and_half_year,
        ) = get_lines_added_deleted_last_one_and_half_years(url)
        readme_updated = is_readme_updated_in_last_one_and_half_years(url)

        return (
            total_commits,
            avg_commits_per_day,
            last_commit_date,
            unique_contributors,
            forks,
            stars,
            open_prs_count,
            merged_prs_count,
            open_issues,
            resolved_issues,
            pr_frequency,
            version_release_frequency_days,
            version_release_frequency_months,
            version_release_frequency_years,
            lines_of_codes,
            source_lines_of_code,
            total_added,
            total_deleted,
            lines_added_one_and_half_year,
            lines_deleted_one_and_half_year,
            readme_updated,
        )
    except Exception as e:
        # Top-level boundary for one repository: report and let the batch
        # continue with the next one.
        print(f"Error analyzing repository {url}: {e}")
        return (None,) * 21

def fetch_and_analyze_repositories(input_csv, output_csv):
    """Analyze every distinct repository listed in a CSV and write the stats.

    Repository URLs are read from the ``latest_version_git_repo`` column of
    ``input_csv``. Each distinct URL is analyzed once; rows whose analysis
    failed (no commit count returned) are left out of the output.

    Args:
        input_csv: Path of the CSV file to read.
        output_csv: Path of the CSV file to create or overwrite.
    """
    # Output columns for the values returned by analyze_repository, in the
    # order of its return tuple.
    metric_columns = [
        'total_commits',
        'avg_commits_per_day',
        'last_commit_date',
        'unique_contributors',
        'forks',
        'stars',
        'open_PRs',
        'merged_PRs',
        'open_issues',
        'resolved_issues',
        'pr_frequency',
        'version_release_frequency_days',
        'version_release_frequency_months',
        # The comma after the next entry was missing, which fused it with
        # 'LOC' into one bogus column and made DictWriter reject every row.
        'version_release_frequency_years',
        'LOC',
        'SLOC',
        'total_lines_added',
        'total_lines_deleted',
        'lines_added_one_and_half_year',
        'lines_deleted_one_and_half_year',
        'readme_updated',
    ]
    fieldnames = ['git_repo'] + metric_columns

    unique_repos = set()
    data = []

    # newline='' lets the csv module handle line endings itself.
    with open(input_csv, 'r', newline='') as file:
        for row in csv.DictReader(file):
            repo_url = row.get('latest_version_git_repo')
            if not repo_url:
                print("Skipping repository: value not present")
                continue
            if repo_url in unique_repos:
                print(f"Skipping duplicate repository: {repo_url}")
                continue

            print(f"Analyzing repository: {repo_url}")
            unique_repos.add(repo_url)
            metrics = analyze_repository(repo_url)
            # The first value is the commit count; None marks a failed analysis.
            if metrics[0] is None:
                continue
            record = dict(zip(metric_columns, metrics))
            record['git_repo'] = repo_url
            data.append(record)

    with open(output_csv, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

# Script entry: analyze the repositories listed in the input CSV and write
# the collected statistics to git_stats.csv in the working directory.
input_file = '/kaggle/input/pydriller-test-2/npm_package_data_final.csv'
fetch_and_analyze_repositories(input_csv=input_file, output_csv='git_stats.csv')


# SLOC
# complexity