# How to use this notebook

First, install the dependencies in your Python environment.

```
pip install -r requirements.txt
```

Then run the `wikimedia-api.py` script to generate the `queried_datasets.csv` file (the "Get Data From API" section below also writes this file).

Finally, run this notebook to clean the dataset.

# Get Data From API

In [None]:
import requests
import pandas as pd
import json

In [None]:

# Search URL of the Wikimedia Core REST API for English Wikipedia. The search
# term is appended directly after the trailing "q=".
SEARCH_ENDPOINT = "https://api.wikimedia.org/core/v1/wikipedia/en/search/page?q="

# Base URL for fetching a single English Wikipedia page; the page key is
# appended directly to it.
PAGE_DETAIL_ENDPOINT = "https://api.wikimedia.org/core/v1/wikipedia/en/page/"

# Search terms used to build the dataset: each entry is sent to the search
# endpoint and the returned pages are collected.
QUERY_WORDS = [
    "Ontology",
    "Data Science",
    "Astrophysics",
    "Marine Biology",
    "Climate Change",
    "Psychology",
    "War",
    "Artificial Intelligence",
    "Machine Learning",
    "Scrum Master",
    "Contemporary Art",
    "Vegan Cooking",
    "Landscape Photography",
    "Switzerland",
    "European Union",
    "NATO",
    "Family Constellations",
    "Bioengineering",
    "Medicine",
    "Physics",
    "Mathematics",
    "Data Visualization",
    "Topic Modeling",
    "Antarctica",
    "Sequoia",
    "Blue Whale",
    "Matcha",
    "Solar Panels",
    "Thyroid",
    "Gender equality",
    "Education",
    "Developing country",
    "Spanish History",
    "Dog training",
    "Solar System",
    "Autoimmune disease",
]

def search_query_in_pages(search_query):
    """Search English Wikipedia pages for ``search_query``.

    Parameters
    ----------
    search_query : str
        Free-text search term, e.g. ``"Data Science"``.

    Returns
    -------
    The decoded JSON body of the search response. The calling code reads the
    list of hits from its ``"pages"`` key.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    requests.Timeout
        If the API does not respond within the timeout.
    """
    from urllib.parse import quote

    # SEARCH_ENDPOINT already ends with "?q=", so the term is appended to it.
    # Percent-encode it first: characters such as "&", "#" or "+" would
    # otherwise be interpreted as URL syntax and corrupt the query.
    ep = SEARCH_ENDPOINT + quote(search_query, safe="")
    # Without a timeout a stalled connection would hang the notebook forever.
    res = requests.get(ep, timeout=30)
    # Fail here with a clear HTTP error instead of returning an error payload
    # that only breaks later when the caller looks up "pages".
    res.raise_for_status()
    return res.json()


def is_cc_sa(content_key):
    """Tell whether a page's license URL contains ``"/by-sa/"``.

    Parameters
    ----------
    content_key : str
        Page key as found in the ``key`` field of a search hit,
        e.g. ``"Ontology_(information_science)"``.

    Returns
    -------
    bool
        ``True`` if the page record's ``license.url`` contains ``"/by-sa/"``;
        ``False`` otherwise, including when the record carries no license URL.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    requests.Timeout
        If the API does not respond within the timeout.
    """
    from urllib.parse import quote

    # Encode the key as a single path segment (safe="" also encodes "/"), so
    # keys containing slashes or other reserved characters hit the right page.
    ep = PAGE_DETAIL_ENDPOINT + quote(content_key, safe="")
    res = requests.get(ep, timeout=30)
    res.raise_for_status()
    record = res.json()
    # Treat a missing or empty license entry as "not CC BY-SA" rather than
    # raising KeyError.
    license_info = record.get("license") or {}
    cc_sa_text = license_info.get("url") or ""
    return "/by-sa/" in cc_sa_text


# One DataFrame of search hits per query word; reset on every run of this cell
# so re-running does not accumulate duplicates.
QUERIED_DATASETS = []

for query in QUERY_WORDS:
    searched_data = search_query_in_pages(query)
    # "pages" is already a list of record dicts, so build the frame from it
    # directly. Serialising it back to JSON text for pd.read_json was a
    # needless round trip, and passing a literal JSON string to read_json is
    # deprecated in recent pandas releases.
    jsoned_data_df = pd.DataFrame(searched_data["pages"])
    QUERIED_DATASETS.append(jsoned_data_df)

# ignore_index=True gives the combined frame one unique running index instead
# of repeating each per-query index (0..n-1) for every query word.
queried_words_datasets = pd.concat(QUERIED_DATASETS, ignore_index=True)

# The index is still written on purpose: it becomes the 'Unnamed: 0' column
# that the Data Cleaning section drops after reloading this file.
queried_words_datasets.to_csv('queried_datasets.csv')

# Data Cleaning

In [8]:
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Load the search results written by the "Get Data From API" step.
df = pd.read_csv('queried_datasets.csv')

In [6]:
# Remove the saved index column that read_csv loads as 'Unnamed: 0'.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError on the missing column.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,id,key,title,excerpt,matched_title,description,thumbnail
0,22261,Ontology,Ontology,"metaphysics, <span class=""searchmatch"">ontolog...",,Branch of philosophy that studies concepts suc...,"{'mimetype': 'image/png', 'size': None, 'width..."
1,2477036,Guerrilla_ontology,Guerrilla ontology,"Guerilla <span class=""searchmatch"">ontology</s...",,,
2,49681,Ontology_(information_science),Ontology (information science),"In computer science and information science, a...",,Specification of a conceptualization,
3,1454791,Gene_Ontology,Gene Ontology,"The Gene <span class=""searchmatch"">Ontology</s...",,Bioinformatics initiative,
4,3200382,Upper_ontology,Upper ontology,"information science, an upper <span class=""sea...",,Ontology applicable across domains of knowledge,


In [10]:
# Strip the HTML markup (e.g. <span class="searchmatch">) from every excerpt,
# keeping only the text.
# - The parser is named explicitly so the result does not depend on which
#   optional parsers are installed and bs4 emits no "no parser specified" warning.
# - na_action='ignore' leaves missing excerpts (NaN after the CSV round trip)
#   untouched instead of raising TypeError inside BeautifulSoup.
df['excerpt'] = df['excerpt'].map(
    lambda html: BeautifulSoup(html, 'html.parser').get_text(),
    na_action='ignore',
)

In [11]:
# Preview the first rows to check that the excerpt column no longer contains HTML tags.
df.head()

Unnamed: 0,id,key,title,excerpt,matched_title,description,thumbnail
0,22261,Ontology,Ontology,"metaphysics, ontology is the philosophical stu...",,Branch of philosophy that studies concepts suc...,"{'mimetype': 'image/png', 'size': None, 'width..."
1,2477036,Guerrilla_ontology,Guerrilla ontology,Guerilla ontology is a practice described by a...,,,
2,49681,Ontology_(information_science),Ontology (information science),"In computer science and information science, a...",,Specification of a conceptualization,
3,1454791,Gene_Ontology,Gene Ontology,The Gene Ontology (GO) is a major bioinformati...,,Bioinformatics initiative,
4,3200382,Upper_ontology,Upper ontology,"information science, an upper ontology (also k...",,Ontology applicable across domains of knowledge,


In [12]:
# Save the cleaned dataset. index=False keeps the row index out of the file;
# otherwise it would come back as another 'Unnamed: 0' column on reload, which
# is exactly the column this notebook removed earlier.
df.to_csv('queried_datasets_excerpt_cleaned.csv', index=False)