In [2]:
import requests
from lxml import etree
import pandas as pd
from fake_useragent import UserAgent
from tqdm import tqdm

In [16]:
ARGUMAN_ROOT_URL = "https://en.arguman.org"
FALLACIES_PAGE_URL = ARGUMAN_ROOT_URL + "/fallacies"
ARGUMAN_API_URL = ARGUMAN_ROOT_URL + "/api/v1"

HTML_PARSER = etree.HTMLParser()

In [4]:
ua = UserAgent()
user_agent_header = {"User-Agent": ua.chrome}

In [5]:
def load_reports(offset=0):
    res = requests.get(FALLACIES_PAGE_URL,
                       headers=user_agent_header,
                       params={'offset': offset})

    res.raise_for_status()  # make sure request is successful
    root = etree.fromstring(res.text, HTML_PARSER)
    return root.xpath('//*[@class="fallacy-report"]')

In [6]:
def parse_report(report):
    title = report.xpath('div/h2/a/text()')[0].strip()
    url = ARGUMAN_ROOT_URL + report.xpath('div/h2/a/@href')[0]
    premise_content = report.xpath('div/div[1]/text()')[0].strip()

    try:
        premise_type = next(
            filter(lambda x: x in {'but', 'because', 'however'},
                   report.xpath('div/div[1]/@class')[0].split()))
    except StopIteration:
        premise_type = None

    fallacy_type = report.xpath('div/div[2]/h4/text()')[0]
    fallacy_reason = report.xpath('div/div[2]/text()')[1].strip()
    
    return {
        'title': title,
        'url': url,
        'premise_content': premise_content,
        'premise_type': premise_type,
        'fallacy_type': fallacy_type,
        'fallacy_reason': fallacy_reason
    }

In [7]:
reports = []
for offset in tqdm(range(0, 1000, 10)):
    for raw_report in load_reports(offset=offset):
        reports.append(parse_report(raw_report))

100%|██████████| 100/100 [00:25<00:00,  3.91it/s]


In [8]:
df = pd.DataFrame(reports)

In [9]:
df

Unnamed: 0,fallacy_reason,fallacy_type,premise_content,premise_type,title,url
0,"it does not matter what others do, only the ra...",Appeal To Authority,No other international movement/NGO/protest gr...,because,FFF International should leave Telegram,https://en.arguman.org/fff-international-shoul...
1,Relies on the Authority of Abraham Lincoln bei...,Appeal To Authority,Abraham believed slavery was morally wrong,because,Slaves should be freed,https://en.arguman.org/slaves-should-be-freed
2,Bananas has nothing to do with this.,Irrelevant Conclusion,Bananas are blue.,but,Contributor should always be default beneficiary,https://en.arguman.org/contributor-should-alwa...
3,This premises unnecessarily discusses other re...,Irrelevant Conclusion,"If Islam is not a religion of peace, then no o...",however,Islam is a religion of peace.,https://en.arguman.org/islam-is-a-religion-of-...
4,s;piderman,Begging The Question,The attack in Selma left many dead and it was ...,because,African Americans should be able to vote-We sh...,https://en.arguman.org/african-americans-shoul...
5,s;piderman,Begging The Question,African Americans have been oppressed by the w...,because,African Americans should be able to vote-We sh...,https://en.arguman.org/african-americans-shoul...
6,s;piderman,Begging The Question,America is still living in a racist time again...,but,African Americans should be able to vote-We sh...,https://en.arguman.org/african-americans-shoul...
7,There is a correlation between their success a...,Fallacy Of False Cause,"Very prominant figures in our history, such as...",but,Introverts are Undervalued,https://en.arguman.org/the-greater-our-knowled...
8,Why is the creator outside of this cause-and-e...,Fallacy Of Special Pleading,"The creator doesn't have to have been created,...",but,A creator must exist,https://en.arguman.org/a-creator-must-exist
9,"This argument assumes the premise ""Science has...",Begging The Question,Science has proved that God does not exist,however,The God of the bible loves people,https://en.arguman.org/the-god-of-the-bible-lo...


In [10]:
df.to_csv('../data/fallacies.csv')

In [32]:
# (0 for but, 1 for because, 2 for however)
# from https://github.com/arguman/arguman.org/blob/master/docs/api/arguments/create_premise.md
PREMISE_TYPE_TO_TEXT = {
    0: 'but',
    1: 'because',
    2: 'however'
}

In [48]:
def get_approved_premises(title):
    res = requests.get(ARGUMAN_API_URL + '/arguments',
                       headers=user_agent_header,
                       params={'search': title})
    res.raise_for_status()
    argument_json = res.json()

    if argument_json['count'] == 0:
        return []
    
    argument_title = argument_json['results'][0]['title']
    url = ARGUMAN_ROOT_URL + '/' + argument_json['results'][0]['slug']

    premises = []

    for premise in argument_json['results'][0]['premises']:
        if len(premise['supporters']) == 0:
            continue

        if not premise['parent']:  # only get parents of the root argument
            continue

        premise_content = premise['text']
        premise_type = PREMISE_TYPE_TO_TEXT[premise['premise_type']]
        n_supporters = len(premise['supporters'])

        premises.append({
            'title': argument_title,
            'url': url,
            'premise_content': premise_content,
            'premise_type': premise_type,
            'fallacy_type': 'None',
            'fallacy_reason': '',
            'n_supporters': n_supporters
        })

    return premises

In [49]:
approved_premises = []
for _, row in tqdm(df.iterrows()):
    approved_premises += get_approved_premises(row.title)

917it [06:39,  2.30it/s]


In [51]:
approved_df = pd.DataFrame(approved_premises)

In [53]:
approved_df.to_csv('../data/approved.csv')