In [70]:
import pdb
from copy import copy, deepcopy
from urllib.parse import urljoin

import pymongo
import requests
from bs4 import BeautifulSoup

In [71]:
def find_see_also(soup):
    """Locate the element whose ``id`` attribute is ``"See_also"``.

    Args:
        soup: A parsed document (or any object exposing a
            BeautifulSoup-style ``find`` method).

    Returns:
        Whatever ``soup.find`` returns for that attribute filter; for
        BeautifulSoup this is the first matching element, or ``None``
        when the page has no such element.
    """
    see_also_filter = {"id": "See_also"}
    match = soup.find(attrs=see_also_filter)
    return match


def find_links(soup):
    """Return the ``<li>`` items of the first list at or after ``soup``.

    Starting at ``soup`` and walking forward through its following
    siblings, the first element that is either a ``<ul>`` or a ``<div>``
    carrying the ``div-col`` class (this includes the historical
    ``"div-col columns column-width"`` class string) is treated as the
    link container.

    Args:
        soup: The element to start from (BeautifulSoup ``Tag``-like:
            needs ``name``, ``get``, ``find_all`` and
            ``find_next_sibling``).

    Returns:
        The result of ``find_all("li")`` on the container, or an empty
        list when the sibling chain ends without reaching a container.
    """
    node = soup
    # Iterate instead of recursing: running off the end of the sibling
    # chain used to dereference None and raise AttributeError.
    while node is not None:
        if node.name == "ul":
            return node.find_all("li")

        if node.name == "div":
            # .get() tolerates <div> elements with no class attribute,
            # where indexing with ["class"] would raise KeyError.
            classes = node.get("class") or []
            if "div-col" in classes:
                return node.find_all("li")

        node = node.find_next_sibling()

    return []


def process_links(links):
    """Convert list items into absolute Wikipedia URLs.

    Args:
        links: Iterable of ``<li>``-like elements; the first ``<a>``
            inside each one supplies the ``href``.

    Returns:
        A tuple of absolute URLs, in input order. Items without an
        ``<a>`` element, or whose anchor has no ``href``, are skipped.
    """
    base_url = "https://en.wikipedia.org"
    hrefs = []
    for link in links:
        anchor = link.find("a")
        if anchor is None:
            # Plain-text list item: nothing to follow.
            continue

        href = anchor.get("href")
        if not href:
            continue

        # urljoin leaves already-absolute hrefs intact and resolves
        # site-relative ones ("/wiki/...") against the base, where plain
        # concatenation would produce malformed URLs for the former.
        hrefs.append(urljoin(base_url, href))
    return tuple(hrefs)
    
    
    
def get_article_title(soup):
    """Read the text of the element whose ``id`` is ``"firstHeading"``.

    Args:
        soup: A parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The ``text`` attribute of the matched element.

    Raises:
        AttributeError: If ``find`` returns ``None`` (no such element).
    """
    heading_filter = {"id": "firstHeading"}
    heading = soup.find(attrs=heading_filter)
    return heading.text
    
      
def get_related_links(url, timeout=10):
    """Fetch an article and collect the links after its "See also" heading.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        A dict with keys ``"title"``, ``"url"`` and ``"links"`` (a tuple
        of absolute URLs), or ``None`` when the response is not OK or
        the page has no element with id ``See_also``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    resp = requests.get(url, timeout=timeout)
    if not resp.ok:
        # Error pages are not articles; report "nothing found" rather
        # than attempting to parse them.
        return None

    # Name the parser explicitly so results do not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(resp.content, "html.parser")

    see_also = find_see_also(soup)
    if see_also is None:
        return None

    # The list of links follows the heading's parent element.
    links = find_links(see_also.find_parent())

    return {
        "title": get_article_title(soup),
        "url": url,
        "links": process_links(links)
    }

In [72]:
# Try the scraper on a single article; the cell output shows the
# resulting title / url / links dict.
get_related_links("https://en.wikipedia.org/wiki/Brain")

{'title': 'Brain',
 'url': 'https://en.wikipedia.org/wiki/Brain',
 'links': ('https://en.wikipedia.org/wiki/Brain%E2%80%93computer_interface',
  'https://en.wikipedia.org/wiki/Central_nervous_system_disease',
  'https://en.wikipedia.org/wiki/List_of_neuroscience_databases',
  'https://en.wikipedia.org/wiki/Neurological_disorder',
  'https://en.wikipedia.org/wiki/Optogenetics',
  'https://en.wikipedia.org/wiki/Outline_of_neuroscience')}

In [80]:
import pandas as pd
from keys import *

class WikiScrapper:
    """Breadth-first crawler over the "See also" links of articles.

    Each crawled article is stored as the dict produced by
    ``get_related_links`` (keys ``title``, ``url``, ``links``). The
    results of the most recent crawl live in ``self.data`` and can be
    exported to a DataFrame, a CSV file, or a MongoDB collection.
    """

    def __init__(self):
        # Article dicts collected by the latest traverse_from() call.
        self.data = []

    def traverse_from(self, url, max_depth=3, max_nodes=100):
        """Crawl outward from ``url`` level by level.

        Args:
            url: Starting article.
            max_depth: Number of levels to visit, counting the starting
                article as level 1.
            max_nodes: Stop once this many articles are collected; a
                falsy value (``None`` or ``0``) disables the cap.

        Returns:
            ``self.data``: the list of collected article dicts, starting
            with the root. Empty when the root has no related links.
        """
        root = get_related_links(url)
        if not root:
            # No "See also" section on the starting page: nothing to crawl.
            self.data = []
            return self.data

        self.data = [root]
        if max_nodes and len(self.data) >= max_nodes:
            return self.data

        seen_titles = {root["title"]}
        # URLs already downloaded, so repeated links cost no extra request.
        fetched_urls = {url}
        frontier = list(root["links"])
        depth_count = 1

        while depth_count < max_depth and frontier:
            next_frontier = []

            for link in frontier:
                if link in fetched_urls:
                    continue

                try:
                    article = get_related_links(link)
                except Exception:
                    # Best effort: skip pages that fail to download or
                    # parse, but let KeyboardInterrupt through.
                    continue
                fetched_urls.add(link)

                # Different URLs can lead to the same article, so
                # de-duplicate on title as well.
                if not article or article["title"] in seen_titles:
                    continue

                seen_titles.add(article["title"])
                next_frontier.extend(article["links"])
                self.data.append(article)

                if max_nodes and len(self.data) >= max_nodes:
                    return self.data

            frontier = next_frontier
            depth_count += 1

        return self.data

    def to_dataframe(self):
        """Return the collected articles as a pandas DataFrame."""
        return pd.DataFrame(self.data)

    def to_csv(self, file_name):
        """Write the collected articles to ``file_name`` as CSV (no index)."""
        self.to_dataframe().to_csv(file_name, index=False)

    def add_ids(self):
        """Return a deep copy of the data with ``_id`` set to each title.

        ``self.data`` itself is left untouched.
        """
        data = deepcopy(self.data)

        for article in data:
            article['_id'] = article['title']
        return data

    def to_mlab(self):
        """Insert the collected articles into the ``known_related`` collection.

        Credentials are read from the module-level ``mlab_api`` mapping.
        Articles whose ``_id`` (title) is already stored are skipped.

        Raises:
            pymongo.errors.PyMongoError: For any database failure other
                than a duplicate key.
        """
        uri = f"mongodb://{mlab_api['username']}:{mlab_api['password']}@ds261277.mlab.com:61277/wiki_scrapper"
        client = pymongo.MongoClient(uri)

        try:
            db = client.get_default_database()
            data_inserter = db["known_related"]

            # Add ids to our data so we don't save duplicates of the same
            # topic; add_ids() works on a copy so self.data is not changed.
            links_data = self.add_ids()

            for article in links_data:
                try:
                    data_inserter.insert_one(article)
                except pymongo.errors.DuplicateKeyError:
                    # Already stored under this title.
                    continue
        finally:
            # Release the connection even when an insert fails.
            client.close()

In [86]:
# Build a crawler and walk three levels out from the Brain article.
scrapper = WikiScrapper()

# max_nodes=None turns off the node cap, so only max_depth bounds the
# crawl; the trailing semicolon keeps the returned list out of the output.
scrapper.traverse_from("https://en.wikipedia.org/wiki/Brain", max_depth=3, max_nodes=None);


In [87]:
# Store the crawled articles in the remote MongoDB collection.
scrapper.to_mlab()

In [84]:
# Also export the crawled articles to a local CSV file.
scrapper.to_csv("test.csv")