In [5]:
class Issue:
    def __init__(self, title, description, url, featured_answer):
        self.title = title
        self.description = description
        self.url = url
        self.featured_answer = featured_answer

issues_batch = []

In [1]:
# get repos
# iterate through repos' issues(using next/prev link headers)
# iterata through issues' comments and get featured comment by properties : ["author_association"]
# create list of issues objects
# save to parquet
# vecotrize issues and save to vector database

# TODO: test featured answers results based on author_association

import requests
import dotenv
import os
import pandas as pd
import time
from datetime import datetime, timedelta

dotenv.load_dotenv()
baseurl = "https://api.github.com/"
token = os.getenv("GH_PAT")
api_version = "2022-11-28"
headers = {
    "Accept": "application/vnd.github.v3+json",
    "Authorization": f"Bearer {token}",
    "X-GitHub-Api-Version": api_version
}

api = "repositories"
repos_link = f"{baseurl}{api}"
iterations = 0
rate_limit = 5000
reset_time = None
num_of_issues_collected = 0
while repos_link:
    resp = requests.get(repos_link, headers=headers)
    if resp.status_code != 200:
        with open("./logs/issues-gh.log", "a") as f:
            f.write(f"Failed to execute request to {repos_link}. Status code: {resp.status_code}. Headers: {resp.headers.__dict__}\n") 
        time.sleep(5.0)
        continue
    public_repos = resp.json()
    link_header = resp.headers.get("Link")
    rate_limit = int(resp.headers.get("X-RateLimit-Remaining"))
    if rate_limit <= 5:
        reset_time = int(resp.headers.get("X-RateLimit-Reset"))
        reset_date = datetime.fromtimestamp(reset_time)
        print(f"Rate limit reached. Sleeping until {reset_date.strftime('%Y-%m-%d %H:%M:%S')}.")
        time.sleep(reset_date.timestamp() - datetime.now().timestamp() + 5.0)
    if "rel=\"next\"" in link_header: 
        repos_link = link_header.split(",")[0].split(";")[0].replace("<",'').replace(">",'')
    else:
        repos_link = None
    for repo in public_repos:
        print(f"Processing {repo['full_name']}")
        issues_link = f"{baseurl}repos/{repo['full_name']}/issues"
        while issues_link:
            resp = requests.get(issues_link, headers=headers)
            if resp.status_code != 200:
                with open("./logs/issues-gh.log", "a") as f:
                    f.write(f"Failed to execute request to {issues_link}. Status code: {resp.status_code}. Headers: {resp.headers.__dict__}\n")
                if resp.status_code == 403 or resp.status_code == 404 or resp.status_code == 401:
                    try:
                        link_header = resp.headers.get("Link")
                        issues_links = link_header.split(",")
                        issues_link = next((link.split(';')[0].replace("<",'').replace(">",'') for link in issues_links if "rel=\"next\"" in link), None)
                    except:
                        issues_link = None
                time.sleep(5.0)
                continue
            repo_issues = resp.json()
            link_header = resp.headers.get("Link")
            rate_limit = int(resp.headers.get("X-RateLimit-Remaining"))
            if rate_limit <= 5:
                reset_time = int(resp.headers.get("X-RateLimit-Reset"))
                reset_date = datetime.fromtimestamp(reset_time)
                print(f"Rate limit reached. Sleeping until {reset_date.strftime('%Y-%m-%d %H:%M:%S')}.")
                time.sleep(reset_date.timestamp() - datetime.now().timestamp() + 5.0)
            try:
                issues_links = link_header.split(",")
                issues_link = next((link.split(';')[0].replace("<",'').replace(">",'') for link in issues_links if "rel=\"next\"" in link), None)
            except:
                issues_link = None
            for issue in repo_issues:
                link = f"{baseurl}repos/{repo['full_name']}/issues/{issue['number']}/comments?sort=created&direction=asc"
                try:
                    issue_comments = requests.get(link, headers=headers).json()
                except:
                    with open("./logs/issues-gh.log", "a") as f:
                        f.write(f"Failed to execute request to {link}. Status code: {resp.status_code}. Headers: {resp.headers.__dict__}\n")
                    continue
                featured_comment = None
                # should we just get one answer or contact multiple
                for comment in issue_comments:
                    if (comment["author_association"] == "OWNER"):
                        featured_comment = comment["body"]
                        break
                    elif (comment["author_association"] == "COLLABORATOR"):
                        featured_comment = comment["body"]
                        break
                    elif (comment["author_association"] == "MEMBER"):
                        featured_comment = comment["body"]
                        break
                    elif (comment["author_association"] == "CONTRIBUTOR"):
                        featured_comment = comment["body"]
                        break
                if featured_comment and issue["body"]:
                    issues_batch.append(Issue(issue["title"], issue["body"], issue["url"], featured_comment))
    iterations += 1
    num_of_issues_collected += len(issues_batch)
    with open("./logs/issues-gh.log", "a") as f:
        f.write(f"Next Page: {repos_link} .Processed {iterations} iterations. Request Quota left {rate_limit}. Collected issues: {num_of_issues_collected}\n")
    df = pd.DataFrame([vars(issue) for issue in issues_batch], columns=["title", "description", "url", "featured_answer"], dtype=object)
    df.to_parquet(f"./output/issues-gh-{iterations}.parquet")
    issues_batch = []
    time.sleep(1.5)


Processing mojombo/grit
Processing wycats/merb-core
Processing rubinius/rubinius


NameError: name 'issues_batch' is not defined

In [20]:
issues_batch = []

In [21]:
# search for poststs in stackoverflow
# get featured answer by properties : ["is_accepted", "score"]
# create list of issues objects
# save all to parquet
# vecotrize posts with is_answered: true and save to vector database

baseurl = "https://api.stackexchange.com/"
api_version = "2.3/"
api = "search/advanced"
batch = 100
page=1
rate_limit = 300
param_string_template = f"?pagesize={batch}&order=desc&sort=activity&site=stackoverflow&answers=1&filter=!*236eb_eL9rai)MOSNZ-6D3Q6ZKb0buI*IVotWaTb"
param_string = param_string_template + f"&page={page}"
posts_link = f"{baseurl}{api_version}{api}{param_string}"
iterations = 0
while posts_link:
    print(f"Processing {posts_link}")
    resp = requests.get(posts_link)
    if resp.status_code != 200:
        with open("./logs/issues-so.log", "a") as f:
            f.write(f"Failed to execute request to {posts_link}. Status code: {resp.status_code}. Headers: {resp.headers.__dict__}\n")
        time.sleep(5.0)
        continue
    posts = resp.json()
    rate_limit = int(posts["quota_remaining"])
    if rate_limit <= 5:
        reset_time = 3600 * 24
        reset_date = datetime.now() + timedelta(seconds=reset_time)
        print(f"Rate limit reached. Sleeping until {reset_date.strftime('%Y-%m-%d %H:%M:%S')}.")
        time.sleep(reset_time + 5)
    page += 1
    param_string = param_string_template + f"&page={page}"
    posts_link = f"{baseurl}{api_version}{api}{param_string}"
    for post in posts["items"]:
        description = post["body_markdown"]
        url = post["link"]
        title = post["title"]
        featured_answer = next((answer["body_markdown"] for answer in post["answers"] if answer["is_accepted"] and answer["score"] > 0), None)
        if not featured_answer:
            answers_sorted_by_score = sorted(post['answers'], key=lambda x: x['score'], reverse=True)
            if answers_sorted_by_score[0]['score'] > 0:
                featured_answer = answers_sorted_by_score[0]["body_markdown"]
        if featured_answer and description:
            issues_batch.append(Issue(title, description, url, featured_answer))
    iterations += 1
    with open("./logs/issues-so.log", "a") as f:
        f.write(f"Next Page: {page} .Processed {iterations} iterations\n")
    df = pd.DataFrame([vars(issue) for issue in issues_batch], columns=["title", "description", "url", "featured_answer"], dtype=object)
    print(df.head())
    df.to_parquet(f"./output/issues-so-{iterations}.parquet")
    

Processing https://api.stackexchange.com/2.3/search/advanced?pagesize=100&order=desc&sort=activity&site=stackoverflow&answers=1&filter=!*236eb_eL9rai)MOSNZ-6D3Q6ZKb0buI*IVotWaTb&page=1
                                               title  \
0  Piece of a functions stops working when put in...   
1  Power BI - feature to publish a dashboard loca...   
2  Vue: How do I call multiple functions with @cl...   
3              How to join table columns in gnuplot?   
4                              Get months from dates   

                                         description  \
0  EDIT:\r\nWorking on this for hours and hours h...   
1  In the publishing options, I&#39;m looking for...   
2  How can I call multiple functions in a single ...   
3  How can I create a column as a combination of ...   
4  I have this tibble with a bunch of dates, belo...   

                                                 url  \
0  https://stackoverflow.com/questions/77909638/p...   
1  https://stackoverflow.com/

KeyboardInterrupt: 

In [6]:
import pandas as pd
for i in range(1, 6):
    df = pd.read_parquet(f"./output/issues-so-{i}.parquet", columns=["title", "description", "featured_answer"])
    for index, row in df.iterrows():
        with open("./output/issues-so.txt", "a",encoding="utf-8") as f:
            f.write("--------------------------------------------------\n")
            f.write(f"{row['title']}\n{row['description']}\n||||||||||||||{row['featured_answer']}\n\n")