# How to use this notebook?

First, install the dependencies in your Python environment.

```
pip install -r requirements.txt
```

Then run the `wikimedia-api.py` script to generate the `queried_datasets.csv` file.

Finally, run this notebook to clean the dataset.

# Get Data From API

In [3]:
import json
from urllib.parse import quote

import pandas as pd
import requests

In [13]:

# English-Wikipedia page-search URL of the Wikimedia API; the search text is
# appended directly after the trailing `q=`.
SEARCH_ENDPOINT = "https://api.wikimedia.org/core/v1/wikipedia/en/search/page?q="

# English-Wikipedia page URL prefix of the Wikimedia API; a page key is
# appended to it to address a single page.
PAGE_DETAIL_ENDPOINT = "https://api.wikimedia.org/core/v1/wikipedia/en/page/"

# Search terms; each one is sent to the search endpoint once.
QUERY_WORDS = [
    "Ontology",
    "Data Science",
    "Astrophysics",
    "Marine Biology",
    "Climate Change",
    "Psychology",
    "War",
    "Artificial Intelligence",
    "Machine Learning",
    "Scrum Master",
    "Contemporary Art",
    "Vegan Cooking",
    "Landscape Photography",
    "Switzerland",
    "European Union",
    "NATO",
    "Family Constellations",
    "Bioengineering",
    "Medicine",
    "Physics",
    "Mathematics",
    "Data Visualization",
    "Topic Modeling",
    "Antarctica",
    "Sequoia",
    "Blue Whale",
    "Matcha",
    "Solar Panels",
    "Thyroid",
    "Gender equality",
    "Education",
    "Developing country",
    "Spanish History",
    "Dog training",
    "Solar System",
    "Autoimmune disease",
]

def search_query_in_pages(search_query, timeout=10):
    """Search English Wikipedia for ``search_query`` and return the parsed JSON reply.

    Args:
        search_query: Free-text search term. It is percent-encoded before being
            appended to ``SEARCH_ENDPOINT`` so that characters such as ``&``,
            ``#`` or ``+`` cannot alter the query string.
        timeout: Seconds to wait for the server before giving up (default 10).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    ep = SEARCH_ENDPOINT + quote(search_query, safe="")
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    res = requests.get(ep, timeout=timeout)
    # Fail here, with the HTTP status, instead of handing an error payload to the caller.
    res.raise_for_status()
    return res.json()



def get_page_detail(content_key, timeout=10):
    """Fetch the detail record of one English Wikipedia page as parsed JSON.

    Args:
        content_key: Page key appended to ``PAGE_DETAIL_ENDPOINT``. It is
            percent-encoded first so that keys containing ``/``, ``?`` or ``#``
            still address a single page.
        timeout: Seconds to wait for the server before giving up (default 10).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    ep = PAGE_DETAIL_ENDPOINT + quote(content_key, safe="")
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    res = requests.get(ep, timeout=timeout)
    # Surface HTTP errors here rather than as a missing-key error further downstream.
    res.raise_for_status()
    return res.json()

def is_cc_sa(record):
    """Tell whether a record's license URL contains the ``/by-sa/`` segment.

    ``record`` must provide ``record["license"]["url"]``; a missing key
    propagates as ``KeyError``.
    """
    license_url = record["license"]["url"]
    if "/by-sa/" in license_url:
        return True
    return False


def get_source_of_page_detail(record):
    """Return the value stored under the ``'source'`` key of ``record``.

    A record without that key raises ``KeyError``.
    """
    page_source = record['source']
    return page_source


def get_page_source_in_html(key, timeout=10):
    """Fetch the HTML rendering of one English Wikipedia page.

    Args:
        key: Page key. It is percent-encoded before being placed in the URL so
            that keys containing ``/``, ``?`` or ``#`` still address one page.
        timeout: Seconds to wait for the server before giving up (default 10).

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
    """
    # Build on the shared endpoint constant instead of repeating the base URL.
    ep = f"{PAGE_DETAIL_ENDPOINT}{quote(key, safe='')}/html"
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    res = requests.get(ep, timeout=timeout)
    # Do not return an error page as if it were the article's HTML.
    res.raise_for_status()
    return res.text
    

In [14]:
# Collects one DataFrame of search results per entry of QUERY_WORDS.
QUERIED_DATASETS = []

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every result set.
QUERIED_DATASETS.clear()
for query in QUERY_WORDS:
    searched_data = search_query_in_pages(query)
    # "pages" is already a list of records, so build the frame directly instead
    # of serialising it back to a JSON string for pd.read_json (passing a
    # literal JSON string to read_json is deprecated in recent pandas).
    jsoned_data_df = pd.DataFrame(searched_data["pages"])
    QUERIED_DATASETS.append(jsoned_data_df)

In [None]:
# Stack the per-query frames into one table. ignore_index=True gives the result
# a fresh 0..n-1 index; otherwise every query's rows would keep their own
# 0-based labels and the combined index would contain duplicates.
queried_words_datasets = pd.concat(QUERIED_DATASETS, ignore_index=True)

# Data Cleaning

In [15]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json

In [16]:
# Load the search results written by the `wikimedia-api.py` script.
df = pd.read_csv('queried_datasets.csv')

In [17]:
# Remove the unnamed first CSV column (presumably a saved index; confirm against
# the script that wrote the file). Rebinding `df` with errors='ignore' keeps
# this cell re-runnable: an in-place drop raises KeyError on the second run
# because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,id,key,title,excerpt,matched_title,description,thumbnail
0,22261,Ontology,Ontology,"metaphysics, <span class=""searchmatch"">ontolog...",,Branch of philosophy that studies concepts suc...,"{'mimetype': 'image/png', 'size': None, 'width..."
1,49681,Ontology_(information_science),Ontology (information science),"In computer science and information science, a...",,Specification of a conceptualization,
2,2477036,Guerrilla_ontology,Guerrilla ontology,"Guerilla <span class=""searchmatch"">ontology</s...",,,
3,1454791,Gene_Ontology,Gene Ontology,"The Gene <span class=""searchmatch"">Ontology</s...",,Bioinformatics initiative,
4,3200382,Upper_ontology,Upper ontology,"information science, an upper <span class=""sea...",,Ontology applicable across domains of knowledge,


In [6]:
# Strip HTML markup (e.g. the <span class="searchmatch"> highlights) from each excerpt.
# - An explicit parser keeps the result independent of which optional parsers
#   happen to be installed and avoids bs4's "no parser was explicitly specified" warning.
# - na_action='ignore' leaves missing excerpts (NaN) untouched instead of passing
#   a float to BeautifulSoup, which raises.
df['excerpt'] = df['excerpt'].map(
    lambda excerpt: BeautifulSoup(excerpt, 'html.parser').get_text(),
    na_action='ignore',
)

In [None]:
# Save the frame with cleaned excerpts. The index is written as well (to_csv
# default), which is why the file is re-read with an 'Unnamed: 0' column that
# the "Getting Source" section drops again.
df.to_csv('queried_datasets_excerpt_cleaned.csv')

# Getting Source

In [18]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json

In [19]:
# Reload the excerpt-cleaned dataset and discard its 'Unnamed: 0' column in one chained step.
df = pd.read_csv('queried_datasets_excerpt_cleaned.csv').drop(columns='Unnamed: 0')

In [20]:
# Preview the first rows of the reloaded dataset.
df.head()

Unnamed: 0,id,key,title,excerpt,matched_title,description,thumbnail
0,22261,Ontology,Ontology,"metaphysics, ontology is the philosophical stu...",,Branch of philosophy that studies concepts suc...,"{'mimetype': 'image/png', 'size': None, 'width..."
1,2477036,Guerrilla_ontology,Guerrilla ontology,Guerilla ontology is a practice described by a...,,,
2,49681,Ontology_(information_science),Ontology (information science),"In computer science and information science, a...",,Specification of a conceptualization,
3,1454791,Gene_Ontology,Gene Ontology,The Gene Ontology (GO) is a major bioinformati...,,Bioinformatics initiative,
4,3200382,Upper_ontology,Upper ontology,"information science, an upper ontology (also k...",,Ontology applicable across domains of knowledge,


In [21]:
# Per-page accumulators, filled by the loops in the following cells.
SOURCE_PURE = []  # 'source' entry of each page-detail record
SOURCE_HTML = []  # HTML response text of each page
IS_CC_SA = []  # result of is_cc_sa() for each page-detail record

In [None]:
# Start from an empty list so that re-running this cell does not duplicate entries.
SOURCE_HTML.clear()
# Only the 'key' column is needed, so iterate over it directly, and reuse the
# helper instead of rebuilding the same URL and request inline.
for page_key in df['key']:
    SOURCE_HTML.append(get_page_source_in_html(page_key))

In [None]:
# Show how many pages were fetched and a short preview of the first few,
# rather than dumping every complete HTML document into the cell output.
len(SOURCE_HTML), [html[:200] for html in SOURCE_HTML[:3]]

In [12]:
# Reset both accumulators so that re-running this cell keeps them the same
# length as `df` instead of appending a second pass.
IS_CC_SA.clear()
SOURCE_PURE.clear()
# Only the 'key' column is needed, so iterate over it directly.
for page_key in df['key']:
    page_detail = get_page_detail(page_key)
    IS_CC_SA.append(is_cc_sa(page_detail))
    SOURCE_PURE.append(get_source_of_page_detail(page_detail))

NameError: name 'get_page_detail' is not defined

In [11]:
# Fetch the detail record of a single page ('Ontology') for manual inspection.
res = get_page_detail('Ontology')

In [12]:
# Display the 'license' entry of the fetched record (the field is_cc_sa() reads).
res['license']

{'url': 'https://creativecommons.org/licenses/by-sa/3.0/',
 'title': 'Creative Commons Attribution-Share Alike 3.0'}

In [22]:
# Request the HTML rendering of one page ('Ontology') as a spot check.
# Plain string: the original f-string had no placeholders.
ep = "https://api.wikimedia.org/core/v1/wikipedia/en/page/Ontology/html"
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
res = requests.get(ep, timeout=10)

In [28]:
# Display the response body as text.
# NOTE(review): this renders the whole HTML document in the cell output;
# consider a slice such as res.text[:500] to keep the notebook small.
res.text

'<!DOCTYPE html>\n<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/" about="https://en.wikipedia.org/wiki/Special:Redirect/revision/1117423379"><head prefix="mwr: https://en.wikipedia.org/wiki/Special:Redirect/"><meta charset="utf-8"/><meta property="mw:pageId" content="22261"/><meta property="mw:pageNamespace" content="0"/><link rel="dc:replaces" resource="mwr:revision/1117423334"/><meta property="mw:revisionSHA1" content="adf60af8b7f4c38eba349c23bb456c0e1c347a4e"/><meta property="dc:modified" content="2022-10-21T17:15:43.000Z"/><meta property="mw:htmlVersion" content="2.6.0"/><meta property="mw:html:version" content="2.6.0"/><link rel="dc:isVersionOf" href="//en.wikipedia.org/wiki/Ontology"/><base href="//en.wikipedia.org/wiki/"/><title>Ontology</title><meta property="mw:moduleStyles" content="ext.cite.style|ext.cite.styles"/><link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=ext.cite.style%7Cext.cite.styles%7Cmediawiki.skinning.content.parsoid%7Cmed