# In [None]:
import csv
from pydriller import Repository
from pydriller.metrics.process.lines_count import LinesCount
from collections import defaultdict
import requests
import lxml.html as lx
from datetime import datetime, timedelta
import time

def pull_request_frequency(repo_link):
    """Count open and merged pull requests through the GitHub REST API.

    Args:
        repo_link: Repository URL whose last two path segments are the
            owner and the repository name, e.g.
            ``https://github.com/owner/repo``.  A trailing slash or a
            ``.git`` suffix is tolerated.

    Returns:
        A ``(open_prs_count, merged_prs_count)`` tuple of ints.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.  Without
            this check the JSON error object (a dict) would be measured
            with ``len()`` and reported as a pull-request count.
        requests.RequestException: On connection problems or timeout.

    NOTE(review): no pagination is performed, so each count is capped at
    the size of the first result page (GitHub's default is believed to be
    30 entries) -- confirm whether that cap is acceptable.
    """
    # Normalise the link so "owner/repo/" and "owner/repo.git" still split
    # into the expected (owner, repo) pair.
    cleaned_link = repo_link.rstrip('/')
    if cleaned_link.endswith('.git'):
        cleaned_link = cleaned_link[:-len('.git')]
    username, repo_name = cleaned_link.split('/')[-2:]

    open_prs_url = f'https://api.github.com/repos/{username}/{repo_name}/pulls?state=open'
    open_prs_response = requests.get(open_prs_url, timeout=30)
    open_prs_response.raise_for_status()
    open_prs_count = len(open_prs_response.json())

    merged_prs_url = f'https://api.github.com/repos/{username}/{repo_name}/pulls?state=closed&sort=updated&direction=desc'
    merged_prs_response = requests.get(merged_prs_url, timeout=30)
    merged_prs_response.raise_for_status()
    # "closed" also contains PRs that were rejected; only entries carrying
    # a merge timestamp were actually merged.
    merged_prs_count = sum(
        1 for pull in merged_prs_response.json() if pull.get('merged_at')
    )

    return open_prs_count, merged_prs_count

def _parse_counter_text(raw_text):
    """Convert a counter string such as ``"987"``, ``"1,234"`` or ``"1.2k"`` to int."""
    cleaned = raw_text.strip().lower().replace(',', '')
    multipliers = {'k': 1000, 'm': 1000000, 'b': 1000000000}
    suffix = cleaned[-1:]
    if suffix in multipliers:
        return int(round(float(cleaned[:-1]) * multipliers[suffix]))
    return int(cleaned)


def fetch_forks_stars(url):
    """Scrape the fork and star counters from a GitHub repository page.

    Args:
        url: URL of the repository's HTML page.

    Returns:
        A ``(forks, stars)`` tuple of ints.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If a counter element is missing or its content cannot
            be interpreted as a number.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = lx.fromstring(response.text)

    def read_counter(element_id):
        matches = html.xpath(f'//*[@id="{element_id}"]')
        if not matches:
            raise ValueError(f'Counter element "{element_id}" not found on {url}')
        element = matches[0]
        # NOTE(review): GitHub appears to render abbreviated text ("1.2k")
        # and keep the exact figure in the `title` attribute -- confirm.
        # Prefer the attribute when present; the parser copes with either.
        return _parse_counter_text(element.get('title') or element.text or '')

    forks = read_counter('repo-network-counter')
    stars = read_counter('repo-stars-counter-star')
    return forks, stars

def issues_pending(git_repo_url):
    """Return the number of open issues scraped from the repository pages.

    The repository page is fetched first; if it has no link with id
    ``issues-tab`` the result is 0.  Otherwise the ``/issues`` page is
    fetched and the leading number of the "Open" state link is parsed.

    Args:
        git_repo_url: Base URL of the repository (no trailing slash is
            added or removed before ``/issues`` is appended).

    Returns:
        The open-issue count as an int, or 0 when the tab or the state
        link cannot be found.
    """
    repo_page = lx.fromstring(requests.get(git_repo_url).text)
    if not repo_page.xpath('//a[@id="issues-tab"]'):
        return 0

    issues_page = lx.fromstring(requests.get(git_repo_url + "/issues").text)
    open_links = issues_page.xpath('//a[@data-ga-click="Issues, Table state, Open"]')
    if not open_links:
        return 0

    # The link text starts with the count, possibly with thousands separators.
    label = open_links[0].text_content()
    return int(label.split()[0].replace(',', ''))
    
def issues_resolved(git_repo_url):
    """Return the number of closed issues scraped from the repository pages.

    The repository page is fetched first; if it has no link with id
    ``issues-tab`` the result is 0.  Otherwise the ``/issues`` page is
    fetched and the leading number of the "Closed" state link is parsed.

    Args:
        git_repo_url: Base URL of the repository (no trailing slash is
            added or removed before ``/issues`` is appended).

    Returns:
        The closed-issue count as an int, or 0 when the tab or the state
        link cannot be found.
    """
    repo_page = lx.fromstring(requests.get(git_repo_url).text)
    if not repo_page.xpath('//a[@id="issues-tab"]'):
        return 0

    issues_page = lx.fromstring(requests.get(git_repo_url + "/issues").text)
    closed_links = issues_page.xpath('//a[@data-ga-click="Issues, Table state, Closed"]')
    if not closed_links:
        return 0

    # The link text starts with the count, possibly with thousands separators.
    label = closed_links[0].text_content()
    return int(label.split()[0].replace(',', ''))
    
# def make_github_request(url):
#     while True:
#         token = ""  # Add your GitHub personal access token here
#         headers = {'Authorization': f'Bearer {token}'}
#         response = requests.get(url, headers=headers)
#         if response.status_code == 200:
#             return response
#         elif response.status_code == 403 and 'rate limit exceeded' in response.text.lower():
#             sleep_time = get_rate_limit_reset_time(response) + 5  # Adding 5 seconds to be safe
#             print(f"Rate limit exceeded. Waiting for {sleep_time} seconds before retrying.")
#             time.sleep(sleep_time)
#         else:
#             response.raise_for_status()

# def get_rate_limit_reset_time(response):
#     rate_limit_reset = int(response.headers.get('X-RateLimit-Reset', 0))
#     return max(rate_limit_reset - time.time(), 0)

def pr_freq(repo_link, total_days):
    """Return the number of pull requests per day of repository lifetime.

    Args:
        repo_link: Repository URL whose last two path segments are the
            owner and the repository name.
        total_days: Length of the observation window in days.

    Returns:
        Pull requests divided by ``total_days`` as a float.  ``0.0`` is
        returned without contacting GitHub when ``total_days`` is zero or
        negative, because no meaningful rate exists for an empty window.

    Raises:
        requests.HTTPError: If GitHub answers with an error status, so an
            error payload is never mistaken for a list of pull requests.

    NOTE(review): only the first page of API results is counted (GitHub's
    default page size is believed to be 30) -- confirm this is intended.
    """
    if total_days <= 0:
        return 0.0

    username, repo_name = repo_link.split('/')[-2:]

    pr_url = f'https://api.github.com/repos/{username}/{repo_name}/pulls?state=all'
    response = requests.get(pr_url, timeout=30)
    response.raise_for_status()
    all_prs = response.json()

    return len(all_prs) / total_days

# def release_history(repo_link):
#     username, repo_name = repo_link.split('/')[-2:]

#     releases_url = f'https://api.github.com/repos/{username}/{repo_name}/releases'
#     response = requests.get(releases_url)
#     releases = response.json()
    
#     if releases:
#         # Extracting published dates
#         published_dates = [release['published_at'] for release in releases]
#         # Sorting dates chronologically
#         sorted_dates = sorted(published_dates)
#         # Converting dates from string to datetime format
#         datetime_dates = [datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ') for date in sorted_dates]
#         # Calculate frequency between the first and last release
#         first_release_date = datetime_dates[0]
#         last_release_date = datetime_dates[-1]
#         total_days = (last_release_date - first_release_date).days + 1
#         release_frequency = len(releases) / total_days if total_days > 0 else 0
#         return release_frequency.round(release_frequency,2)
#     else:
#         return 0

def get_loc(github_repo_url):
    """Sum ``commit.lines`` over every commit of the repository.

    Args:
        github_repo_url: Repository location passed to PyDriller's
            ``Repository``.

    Returns:
        The accumulated ``lines`` value of all traversed commits.

    NOTE(review): PyDriller describes ``Commit.lines`` as insertions plus
    deletions, so this looks like cumulative churn rather than the current
    size of the code base -- confirm that this is the intended "LOC".
    """
    history = Repository(github_repo_url).traverse_commits()
    return sum(commit.lines for commit in history)

def get_total_lines_added_deleted(github_repo_url):
    """Return total lines added and deleted across the whole history.

    Args:
        github_repo_url: Repository location passed to PyDriller.

    Returns:
        A ``(total_added, total_deleted)`` tuple of ints; ``(0, 0)`` when
        the repository has no commits.
    """
    # One pass is enough to learn both ends of the history.  The hashes are
    # captured inside the loop so nothing relies on commit objects after
    # the traversal has finished.
    first_commit_hash = None
    last_commit_hash = None
    for commit in Repository(github_repo_url).traverse_commits():
        if first_commit_hash is None:
            first_commit_hash = commit.hash
        last_commit_hash = commit.hash

    if first_commit_hash is None:
        # Empty history: there is no commit range to measure.
        return 0, 0

    lines_count = LinesCount(
        path_to_repo=github_repo_url,
        from_commit=first_commit_hash,
        to_commit=last_commit_hash,
    )
    # Both calls return a mapping whose values are per-file line counts.
    total_added = lines_count.count_added()
    total_deleted = lines_count.count_removed()

    return sum(total_added.values()), sum(total_deleted.values())

def get_lines_added_deleted_last_one_and_half_years(github_repo_url):
    """Return lines added and deleted during the last 547 days.

    The window starts at midnight of the day 547 days before now
    (1.5 years at 365 days per year) and ends at the current time.

    Args:
        github_repo_url: Repository location passed to PyDriller.

    Returns:
        A ``(lines_added, lines_deleted)`` tuple of ints.
    """
    now = datetime.now()
    # Truncate the start of the window to the beginning of that day.
    window_start = (now - timedelta(days=547)).replace(
        hour=0, minute=0, second=0, microsecond=0
    )

    metric = LinesCount(path_to_repo=github_repo_url, since=window_start, to=now)
    added_per_file = metric.count_added()
    removed_per_file = metric.count_removed()
    return sum(added_per_file.values()), sum(removed_per_file.values())

def is_readme_updated_in_last_one_and_half_years(github_repo_url):
    """Tell whether ``README.md`` changed during the last 547 days.

    The window starts at midnight of the day 547 days before now
    (1.5 years at 365 days per year) and ends at the current time.  Only
    entries whose key equals ``readme.md`` case-insensitively are
    considered; other README names or extensions are not matched.

    Args:
        github_repo_url: Repository location passed to PyDriller.

    Returns:
        ``True`` if any lines were added to or removed from such a file in
        the window, otherwise ``False``.
    """
    current_date = datetime.now()
    cutoff = current_date - timedelta(days=547)
    one_and_a_half_years_ago_date = datetime(cutoff.year, cutoff.month, cutoff.day)

    lines_count = LinesCount(
        path_to_repo=github_repo_url,
        since=one_and_a_half_years_ago_date,
        to=current_date,
    )

    def readme_lines(per_file_counts):
        # Summing instead of building a lower-cased dict keeps case
        # variants of the name from overwriting each other, and a missing
        # entry simply contributes 0 (no exception needed).  Keys may be
        # None, so those are skipped before calling .lower().
        return sum(
            count
            for path, count in per_file_counts.items()
            if path is not None and path.lower() == 'readme.md'
        )

    readme_added = readme_lines(lines_count.count_added())
    readme_deleted = readme_lines(lines_count.count_removed())
    return readme_added + readme_deleted != 0


def analyze_repository(url):
    """Collect every statistic for one repository.

    Args:
        url: Repository URL; used both for the GitHub page/API requests
            and as the PyDriller repository location.

    Returns:
        A 17-tuple, in this order: total_commits, avg_commits_per_day,
        last_commit_date, unique_contributors, forks, stars,
        open_prs_count, merged_prs_count, open_issues, resolved_issues,
        pr_frequency, lines_of_codes, total_added, total_deleted,
        lines_added_one_and_half_year, lines_deleted_one_and_half_year,
        readme_updated.  If any step raises, the error is printed and a
        tuple of 17 ``None`` values is returned instead.
    """
    try:
        forks, stars = fetch_forks_stars(url)
        open_issues = issues_pending(url)
        resolved_issues = issues_resolved(url)

        commit_dates = []
        contributors = set()
        for commit in Repository(url).traverse_commits():
            commit_dates.append(commit.committer_date)
            contributors.add(commit.author.email)

        total_commits = len(commit_dates)
        unique_contributors = len(contributors)

        # Defaults for a history without commits.  total_days must exist
        # even then, because it is used below for the PR frequency.
        avg_commits_per_day, last_commit_date, total_days = 0, None, 0
        if commit_dates:
            first_commit_date = min(commit_dates)
            last_commit_date = max(commit_dates)
            # +1 so a repository whose commits fall on one day spans 1 day.
            total_days = (last_commit_date - first_commit_date).days + 1
            avg_commits_per_day = total_commits / total_days if total_days > 0 else 0

        open_prs_count, merged_prs_count = pull_request_frequency(url)
        # No time span means no meaningful rate; avoid dividing by zero.
        pr_frequency = round(pr_freq(url, total_days), 2) if total_days > 0 else 0
        # Release frequency is currently disabled (release_history is
        # commented out in this file).
        lines_of_codes = get_loc(url)
        total_added, total_deleted = get_total_lines_added_deleted(url)
        lines_added_one_and_half_year, lines_deleted_one_and_half_year = get_lines_added_deleted_last_one_and_half_years(url)
        readme_updated = is_readme_updated_in_last_one_and_half_years(url)

        return (
            total_commits, avg_commits_per_day, last_commit_date,
            unique_contributors, forks, stars, open_prs_count,
            merged_prs_count, open_issues, resolved_issues, pr_frequency,
            lines_of_codes, total_added, total_deleted,
            lines_added_one_and_half_year, lines_deleted_one_and_half_year,
            readme_updated,
        )
    except Exception as e:
        # Deliberate best-effort boundary: one failing repository must not
        # abort the whole batch, so report it and signal failure with Nones.
        print(f"Error analyzing repository {url}: {e}")
        return (None,) * 17

def fetch_and_analyze_repositories(input_csv, output_csv):
    """Analyze every distinct repository listed in a CSV and save the stats.

    Args:
        input_csv: Path of a CSV file with a ``latest_version_git_repo``
            column holding repository URLs.
        output_csv: Path of the CSV file to write; one row per repository
            that was analyzed successfully.

    Rows with an empty URL and URLs already seen are skipped with a
    message.  Repositories whose analysis failed (first statistic is
    ``None``) are omitted from the output.  The output file is written
    only after all repositories have been processed.
    """
    # Column names in the exact order analyze_repository returns its values.
    stat_columns = [
        'total_commits', 'avg_commits_per_day', 'last_commit_date',
        'unique_contributors', 'forks', 'stars', 'open_PRs', 'merged_PRs',
        'open_issues', 'resolved_issues', 'pr_frequency', 'LOC',
        'total_lines_added', 'total_lines_deleted',
        'lines_added_one_and_half_year', 'lines_deleted_one_and_half_year',
        'readme_updated',
    ]
    unique_repos = set()
    data = []

    # newline='' lets the csv module do its own line-ending handling, which
    # it needs to read quoted fields containing line breaks correctly.
    with open(input_csv, 'r', newline='') as file:
        for row in csv.DictReader(file):
            repo_url = row.get('latest_version_git_repo')
            if not repo_url:
                print("Skipping repository: value not present")
                continue
            if repo_url in unique_repos:
                print(f"Skipping duplicate repository: {repo_url}")
                continue

            print(f"Analyzing repository: {repo_url}")
            stats = analyze_repository(repo_url)
            # Remember the URL even on failure so it is not retried.
            unique_repos.add(repo_url)
            if stats[0] is not None:
                record = {'git_repo': repo_url}
                record.update(zip(stat_columns, stats))
                data.append(record)

    with open(output_csv, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['git_repo'] + stat_columns)
        writer.writeheader()
        writer.writerows(data)

# Script entry: analyze every repository listed in the Kaggle input CSV and
# write the collected statistics to git_stats.csv in the working directory.
input_file = '/kaggle/input/pydriller-test-2/npm_package_data_final.csv'
fetch_and_analyze_repositories(input_csv=input_file, output_csv='git_stats.csv')


# added lines & deleted lines -> done
# pr freq -> done
# release history -> implemented, but currently commented out (not part of the output)
# LOC -> done
# readme updated -> done