In [119]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from selenium import webdriver

In [120]:
def time_epoch(date):
    dt = datetime.strptime(date, "%d/%b/%y %H:%M").replace(tzinfo=timezone.utc)
    epoch = int(dt.timestamp())
    return epoch

In [121]:
def fetch_issue_body(issue_url):
    res = requests.get(issue_url)
    soup = BeautifulSoup(res.text, 'html.parser')

    type = soup.find('span', {'id': 'type-val', 'class': 'value'}).text.strip()
    assignee = soup.find('span', {'class':'user-hover'}).text.strip()
    created_at = soup.find('span', {'id': 'created-val', 'data-fieldtype': 'datetime'}).text.strip()
    created_at_epoch = time_epoch(created_at)
    description = soup.find('div', {'id':'description-val'}).get_text(strip=True)

    return {
        'type' : type,
        'assignee' : assignee,
        'createdAt' : created_at,
        'created_at_epoch' : created_at_epoch,
        'description' : description
    }

In [122]:
def fetch_issue_comments(issue_url):
    driver = webdriver.Chrome()
    driver.get(issue_url)
    html = driver.page_source
    driver.quit()
    soup = BeautifulSoup(html, 'html.parser')

    comments_sec = soup.find_all('div', {'class':'issue-data-block activity-comment twixi-block expanded'})
    comments = []

    for sec in comments_sec:
        comment_sec = sec.find('div', {'class':'twixi-wrap verbose actionContainer'})
            
        author = comment_sec.find('a',{'class' : 'user-hover user-avatar'}).text.strip()
        created_at = comment_sec.find('time').text.strip()
        created_at_epoch = time_epoch(created_at)
        text = comment_sec.find('div', {'class':'action-body flooded'}).get_text(strip=True)

        comment = {
            "author" : author,
            "created_at":created_at,
            "created_at_epoch":created_at_epoch,
            "text": text
        }
        print(comment)
        comments.append(comment)
    return comments

In [124]:
issue_no = 100
issues = []

while issue_no < 22080:
    issue_url = f'https://issues.apache.org/jira/browse/CAMEL-{issue_no}'

    issue = fetch_issue_body(issue_url)
    comments = fetch_issue_comments(issue_url)
    issue['comments'] = comments
    issues.append(issue)
    print(f"issue done: {issue_no}")
    issue_no +=1

{'author': 'James Strachan', 'created_at': '13/Aug/07 19:36', 'created_at_epoch': 1187033760, 'text': "Any idea how to reproduce the ClassNotFoundException? e.g. the camel-core project doesn't use activemq yet it uses the AnnotationTypeConverterLoader OK?"}
{'author': 'Aaron Crickenberger', 'created_at': '13/Aug/07 21:10', 'created_at_epoch': 1187039400, 'text': 'In my case, I\'m using these jars in a jboss deployment.  So long as I include each individual camel jar except camel-activemq, things are OK.Add camel-activemq to camel-spring\'s pom.xml, and the run "mvn test" for camel-spring.  I couldn\'t get this to reproduce for camel-core for some reason.    I apologize that this isn\'t the most direct example, but the stack trace looks the same.'}
{'author': 'James Strachan', 'created_at': '04/Sep/07 08:36', 'created_at_epoch': 1188894960, 'text': "I've added a test case to try reproduce this issue...https://svn.apache.org/repos/asf/activemq/camel/trunk/tests/camel-partial-classpath-te

AttributeError: 'NoneType' object has no attribute 'get_text'

In [107]:
print(len(issues))
issues[0:2]

[{'type': 'Bug',
  'assignee': 'Claus Ibsen',
  'createdAt': '14/Dec/16 14:42',
  'created_at_epoch': 1481726520,
  'description': 'Assume I have rest pathrest("/test").get().type(ClassA.class).to("direct:someRoute");rest("/testSub").get().type(ClassB.class).to("direct:someOtherRoute");And in the type ClassA contains a reference to ClassB.Within the Swagger Doc the path for ClassA renders as expected:/test:\n    get:\n      responses:\n        200:\n          schema:\n            $ref:\'#/definitions/ClassA\'However ClassB gets a string parameter scheme/testSub:\n    get:\n      responses:\n        200:\n          schema:\n             type :\'string\'format :\'com.ClassB\'However I\'d expect it to be:/testSub:\n    get:\n      responses:\n        200:\n          schema:\n            $ref:\'#/definitions/ClassB\'',
  'comments': [{'author': 'ASF GitHub Bot',
    'created_at': '14/Dec/16 14:55',
    'created_at_epoch': 1481727300,
    'text': 'GitHub user bobpaulin opened a pull request

In [108]:
import json

with open('camel_issues.json', 'w', encoding='utf-8') as f:
    json.dump(issues, f, indent=2, ensure_ascii=False)
