In [2]:
import pdb
import requests
from bs4 import BeautifulSoup

In [3]:
def find_see_also(soup):
    return soup.find(attrs={"id":"See_also"})


def find_links(soup):
    if soup.name == "ul":
        return soup.find_all("li")
    
    if soup.name == "div" and " ".join(soup["class"]) == "div-col columns column-width":
        return soup.find_all("li")
    
    return find_links(soup.find_next_sibling())


def process_links(links):
    hrefs = []
    for link in links:
        hrefs.append("https://en.wikipedia.org" + link.find('a').get("href"))
    return tuple(hrefs)
    
    
    
def get_article_title(soup):
    return soup.find(attrs={"id":"firstHeading"}).text
    
      
def get_related_links(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content)
    
    title = get_article_title(soup)
    
    see_also = find_see_also(soup)
    
    if see_also:
        links = find_links(see_also.find_parent())
    else: 
        return None
    
    return {
        "title": title,
        "url": url,
        "links": process_links(links)
    }

In [5]:
get_related_links("https://en.wikipedia.org/wiki/Brain")

{'title': 'Brain',
 'url': 'https://en.wikipedia.org/wiki/Brain',
 'links': ('https://en.wikipedia.org/wiki/Brain%E2%80%93computer_interface',
  'https://en.wikipedia.org/wiki/Central_nervous_system_disease',
  'https://en.wikipedia.org/wiki/List_of_neuroscience_databases',
  'https://en.wikipedia.org/wiki/Neurological_disorder',
  'https://en.wikipedia.org/wiki/Optogenetics',
  'https://en.wikipedia.org/wiki/Outline_of_neuroscience')}

In [59]:
import pandas as pd
from keys import *
class WikiScrapper:
    
    
    def __init__(self, max_trees=3, verbose=0):
        
        self.max_trees = max_trees
        self.verbose = verbose
        
        self.seen = {}
        self.titles = []

        self.tree_count = 0
        self.max_trees_reached = False

        self.queue = []

        self.data = []

        
    def traverse_from(self, related_dict):
        if type(related_dict) == str:
            related_dict = get_related_links(related_dict)
            self.data.append(related_dict)
        
        if self.verbose >= 1:
            print(related_dict.get("title"))
            
        if not related_dict:
            return
        
        self.seen[related_dict['title']] = True
        self.titles.append(related_dict['title'])
        
        
        if not self.max_trees_reached:
            for link in related_dict['links']:
                current_page = get_related_links(link)
                if current_page and not self.seen.get(current_page['title']):
                    if self.tree_count < self.max_trees:
                        self.data.append(current_page)
                        self.queue.append(current_page)
                        self.tree_count += 1
                    else:
                        self.max_trees_reached = True
        if self.verbose >= 2:
            print("Queue Length:\t\t", len(self.queue))
            print("Trees Traversed:\t", self.tree_count)
            print("max_trees_reached:\t", self.max_trees_reached)
            print("\n\n")
        
        if self.queue:
            self.traverse_from(self.queue.pop(0))
        
    def to_dataframe(self):
        return pd.DataFrame(self.data)
    
    def to_csv(self, file_name):
        self.to_dataframe().to_csv(file_name, index=False)
    
    def add_ids(self):
        for article in self.data:
            article['_id'] = article['title']
    
    def to_mlab(self):
        
        uri = f"mongodb://{mlab_api['username']}:{mlab_api['password']}@ds261277.mlab.com:61277/wiki_scrapper"
        client = pymongo.MongoClient(uri)

        db = client.get_default_database()

        data_inserter = db["known_related"]
        self.add_ids()
        for article in self.data:
            try:
                data_inserter.insert_one(article)
            except:
                continue
#                 data_inserter.update_one({"_id": data[0]["_id"]}, {"$set": {"message": data[0]['message']}})

        client.close()

In [60]:
scrapper = WikiScrapper(max_trees=4, verbose=1)

scrapper.traverse_from("https://en.wikipedia.org/wiki/Brain")

Brain
Brain–computer interface
Central nervous system disease
List of neuroscience databases
Neurological disorder


In [61]:
scrapper.to_mlab()

In [57]:
scrapper.to_dataframe()

Unnamed: 0,_id,links,title,url
0,Brain,(https://en.wikipedia.org/wiki/Brain%E2%80%93c...,Brain,https://en.wikipedia.org/wiki/Brain
1,Brain–computer interface,(https://en.wikipedia.org/wiki/Augmented_learn...,Brain–computer interface,https://en.wikipedia.org/wiki/Brain%E2%80%93co...
2,Central nervous system disease,(https://en.wikipedia.org/wiki/Neurodegenerati...,Central nervous system disease,https://en.wikipedia.org/wiki/Central_nervous_...
3,List of neuroscience databases,(https://en.wikipedia.org/wiki/Neuroinformatic...,List of neuroscience databases,https://en.wikipedia.org/wiki/List_of_neurosci...
4,Neurological disorder,(https://en.wikipedia.org/wiki/Central_nervous...,Neurological disorder,https://en.wikipedia.org/wiki/Neurological_dis...


In [36]:
import pymongo

data = [
    {
        "_id": "NEW ENTRY",
        "message": "small boom"    
    },
    {
        "_id": "And Some Other Thing",
        "message":"New Message"
    }
    
]

uri = "mongodb://qdizon:abc123@ds261277.mlab.com:61277/wiki_scrapper"
client = pymongo.MongoClient(uri)

db = client.get_default_database()

messages = db["known_related"]

try:
    messages.insert_one(data[0])
except:
    messages.update_one({"_id": data[0]["_id"]}, {"$set": {"message": data[0]['message']}})

client.close()


In [22]:
resp.text

'{ "message" : "Unexpected Content-Type \'application/x-www-form-urlencoded\', expecting \'application/json\'."}\n'

In [None]:
def trigger_traversal(entry_url, max_trees=3):
    
    current_tree_count = 0
    seen = {}
    titles = []
    
    max_trees_reached = False
    queue = []

    def traverse_related_links(related_dict, max_trees_reached, current_tree_count, max_trees):
            
        if not related_dict:
            seen[related_dict["url"]] = True

        for url in related_dict['related_links']:
            if not url in seen:
                if (not max_trees_reached) and url:
                    queue.append(url)
        
        #         print(len(queue))
        current_tree_count += 1
        print(current_tree_count)
        if current_tree_count == max_trees:
            max_trees_reached = True
        
        
        while True:
            next_related_dict = get_related_links(queue.pop(0))
            if next_related_dict:
                return traverse_related_links(next_related_dict, max_trees_reached, current_tree_count, max_trees)
            
        
        
        
        
    entry_dict = get_related_links(entry_url)
    
    return traverse_related_links(entry_dict, max_trees_reached, current_tree_count, max_trees)

trigger_traversal("https://en.wikipedia.org/wiki/Brain", max_trees=20)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
