# Approach 
1. Take a search term
2. Search for all tags related to the term
3. Get the top posts of each tag (the "all-time" filter is used by default)
4. Create a dataframe

In [None]:
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Search terms (skills) to look up on Medium; indexed by SEARCH_INDEX in the
# exploratory cells and iterated in full by the final pipeline.
SEARCH_TERMS = [
    "soft skills",
    "collaboration",
    "interpersonal communication",
    "communication",
    "creativity",
    "relationship curation",
    "self awareness",
    "emotional intelligence",
    "leadership",
    "persistence",
    "planning",
    "stress management"
]
# Medium tag-search endpoint; a search term passed through urlify() is
# appended after "q=".
TAG_SEARCHER_URL = "https://medium.com/search/tags?q="

def urlify(string):
  """Encode a search term so it can be appended to a URL query string.

  Spaces become '+', exactly as before; characters that carry meaning
  inside a query string (such as '&', '#', '+', '/') and non-ASCII text are
  percent-encoded so they cannot corrupt the request URL.

  Args:
    string: The raw search term.

  Returns:
    The query-string-safe form of the term.
  """
  return quote_plus(string)

def scrape_page(url, parser='html.parser', timeout=30):
  """Download a page and return it parsed as a BeautifulSoup document.

  Args:
    url: Address of the page to fetch.
    parser: Name of the parser handed to BeautifulSoup.
    timeout: Seconds passed to requests.get as its timeout. Without a
      timeout a stalled connection would block the whole scrape
      indefinitely.

  Returns:
    A BeautifulSoup object built from the response body.

  Raises:
    AssertionError: If the response status code is not 200.
  """
  page = requests.get(url, timeout=timeout)
  if page.status_code != 200:
    # Raised explicitly instead of via `assert` so the check is not stripped
    # under `python -O`; the exception type is kept for existing callers.
    raise AssertionError(f"Request did not pass. Status : {page.status_code}")
  return BeautifulSoup(page.content, parser)

def print_max_n(lst, N=10):
  """Print a header line, then the leading items of `lst`, one per line.

  Args:
    lst: Indexable, sized collection to preview.
    N: Maximum number of items to print; -1 means print every item.
  """
  total = len(lst)
  limit = total if N == -1 else N
  shown = min(total, limit)
  print(f"Printing first {shown} of {total} : ")
  position = 0
  while position < shown:
    print(lst[position])
    position += 1

def GET_TOP_BLOGS_URL(url_tag, time="all-time"):
  """Build the URL of a tag's "top posts" page for a given time filter.

  Args:
    url_tag: The tag slug as it appears in Medium URLs.
    time: One of "all-time", "year", "month" or "week".

  Returns:
    The URL string "https://medium.com/tag/<url_tag>/top/<time>".

  Raises:
    AssertionError: If `time` is not one of the accepted filters.
  """
  if time not in ("all-time", "year", "month", "week"):
    # Raised explicitly instead of via `assert` so the validation is not
    # stripped under `python -O`; the exception type is kept for callers.
    raise AssertionError("Invalid time filter")
  return f"https://medium.com/tag/{url_tag}/top/{time}"

# Extract Tags Using Search Term

In [None]:
# Position in SEARCH_TERMS of the term explored by the single-term cells below.
SEARCH_INDEX = 1
# Preview the tag-search URL that will be requested for that term.
TAG_SEARCHER_URL+urlify(SEARCH_TERMS[SEARCH_INDEX])

'https://medium.com/search/tags?q=collaboration'

In [None]:
# Fetch and parse the tag-search results page for the selected search term.
doc = scrape_page(TAG_SEARCHER_URL+urlify(SEARCH_TERMS[SEARCH_INDEX]))

In [None]:
# Parallel lists: display name of each tag and the slug used in its URL.
TAG_NAMES = []
URLS_TAGS = []

# Raw strings keep the regex escapes (\?, \-) from being parsed as invalid
# string escapes, which newer Python versions warn about.
tag_link_pattern = re.compile(r"/tag/.*\?source=")
# Captures the slug that sits between the last "/" and the "?".
tag_slug_pattern = re.compile(r"/([A-Za-z0-9\-]+)\?")

for element in doc.find_all(href=tag_link_pattern):
  slug_match = tag_slug_pattern.search(element.attrs["href"])
  if slug_match:
    TAG_NAMES.append(element.text)
    URLS_TAGS.append(slug_match.group(1))
print_max_n(TAG_NAMES)
print_max_n(URLS_TAGS)

Printing first 10 of 14 : 
Stress Management
Stress Management Tips
Stress Management Therapy
Stress Management Course
Stress Management Clinic
Stress Management Skills
Stress Management Texas
Stress Management Surrey
Stress Management Nurses
Stressmanagementcoach
Printing first 10 of 14 : 
stress-management
stress-management-tips
stress-management-therapy
stress-management-course
stress-management-clinic
stress-management-skills
stress-management-texas
stress-management-surrey
stress-management-nurses
stressmanagementcoach


# Extract top posts of tag

In [None]:
# Position in URLS_TAGS / TAG_NAMES of the tag explored by the cells below.
TAG_INDEX = 0
# Fetch and parse that tag's top-posts page (default "all-time" filter).
doc2 = scrape_page(GET_TOP_BLOGS_URL(URLS_TAGS[TAG_INDEX]))

In [None]:
posts = []

# Captures the path between the leading "/" and the "?" of a post link.
# Raw string so the regex escapes are not parsed as invalid string escapes.
article_path_pattern = re.compile(r"/([A-Za-z0-9\-/]+)\?")


def _find_labelled(parent, label):
  """Return the first descendant of `parent` whose aria-label equals `label`, or None."""
  return parent.find(lambda tag: tag.get("aria-label") == label)


for article in doc2.find_all("article"):
  data = {}

  reading_time_element = _find_labelled(article, "Post Preview Reading Time")
  if reading_time_element:
    data["rtime"] = reading_time_element.text

  post_preview_element = _find_labelled(article, "Post Preview Title")
  if post_preview_element:
    if post_preview_element.has_attr("href"):
      path_match = article_path_pattern.search(post_preview_element.attrs["href"])
      if not path_match:
        # A link we cannot parse would previously crash on `res.span()`;
        # skip the article instead, as the full pipeline does.
        continue
      data["article_url"] = "https://medium.com/" + path_match.group(1)
    heading_element = post_preview_element.find("h2")
    if heading_element:
      data["heading"] = heading_element.text
    para_element = post_preview_element.find("p")
    if para_element:
      data["text_preview"] = para_element.text

  post_image_element = _find_labelled(article, "Post Preview Image")
  if post_image_element:
    image_element = post_image_element.find("img")
    # An <img> without a src attribute is treated as "no image" rather
    # than raising KeyError.
    if image_element and image_element.has_attr("src"):
      data["image_url"] = image_element.attrs["src"]

  # Record which search term and tag this post was found under.
  data["search_term"] = SEARCH_TERMS[SEARCH_INDEX]
  data["url_tag"] = URLS_TAGS[TAG_INDEX]
  data["tag_name"] = TAG_NAMES[TAG_INDEX]
  posts.append(data)

In [None]:
# Display the dictionaries scraped for the selected tag.
posts

[{'rtime': '14 min read',
  'article_url': 'https://medium.com/swlh/how-to-lead-when-you-have-no-authority-9f22206356d4',
  'heading': 'How To Lead When You Have No Authority',
  'text_preview': 'Four Pillars to Increase Your Influence Both at Work And in Life —  Everyone remembers the 2002 movie, My Big Fat Greek Wedding. It’s a hilarious story about the struggles of Toula (the daughter of a traditional Greek family) as she tries to fall in love and get married. There is one particular scene where Toula asks her father permission to go to…',
  'image_url': 'https://miro.medium.com/fit/c/224/224/1*4LGVOiW7jWqnTW5h6yJWDw.jpeg',
  'search_term': 'soft skills',
  'url_tag': 'soft-skills',
  'tag_name': 'Soft Skills'},
 {'rtime': '10 min read',
  'article_url': 'https://medium.com/newco/hard-and-soft-skills-in-tech-8be00216f67f',
  'heading': 'Hard and Soft Skills in Tech',
  'text_preview': 'It’s both more serious and less serious than we’ve admitted —  I’ve recently seen a lot of very an

# Putting It All Together

In [None]:
def _extract_tags(tag_search_doc):
  """Return (tag display names, tag URL slugs) found on a tag-search results page."""
  # Raw strings keep the regex escapes from being parsed as invalid string escapes.
  tag_link_pattern = re.compile(r"/tag/.*\?source=")
  tag_slug_pattern = re.compile(r"/([A-Za-z0-9\-]+)\?")
  tag_names, url_tags = [], []
  for element in tag_search_doc.find_all(href=tag_link_pattern):
    slug_match = tag_slug_pattern.search(element.attrs["href"])
    if slug_match:
      tag_names.append(element.text)
      url_tags.append(slug_match.group(1))
  return tag_names, url_tags


def _find_labelled(parent, label):
  """Return the first descendant of `parent` whose aria-label equals `label`, or None."""
  return parent.find(lambda tag: tag.get("aria-label") == label)


def _parse_article(article):
  """Return the preview fields of one <article> as a dict, or None if its link cannot be parsed."""
  data = {}

  reading_time_element = _find_labelled(article, "Post Preview Reading Time")
  if reading_time_element:
    data["rtime"] = reading_time_element.text

  post_preview_element = _find_labelled(article, "Post Preview Title")
  if post_preview_element:
    if post_preview_element.has_attr("href"):
      path_match = re.search(r"/([A-Za-z0-9\-/]+)\?", post_preview_element.attrs["href"])
      if not path_match:
        return None
      data["article_url"] = "https://medium.com/" + path_match.group(1)
    heading_element = post_preview_element.find("h2")
    if heading_element:
      data["heading"] = heading_element.text
    para_element = post_preview_element.find("p")
    if para_element:
      data["text_preview"] = para_element.text

  post_image_element = _find_labelled(article, "Post Preview Image")
  if post_image_element:
    image_element = post_image_element.find("img")
    # An <img> without a src attribute is treated as "no image" rather
    # than raising KeyError and aborting the whole run.
    if image_element and image_element.has_attr("src"):
      data["image_url"] = image_element.attrs["src"]

  return data


# One dict per scraped post, across every search term and every tag found for it.
posts = []
for SEARCH_INDEX, search_term in enumerate(SEARCH_TERMS):
  time.sleep(0.5) # Avoid rate limit
  doc = scrape_page(TAG_SEARCHER_URL + urlify(search_term))
  TAG_NAMES, URLS_TAGS = _extract_tags(doc)

  # Materialised as a list so tqdm knows the total and shows a full progress bar.
  tag_pairs = list(zip(URLS_TAGS, TAG_NAMES))
  progress = tqdm(tag_pairs, desc=f"Processing Term : {search_term}")
  for TAG_INDEX, (url_tag, tag_name) in enumerate(progress):
    time.sleep(1) # Avoid rate limit
    doc2 = scrape_page(GET_TOP_BLOGS_URL(url_tag))
    for article in doc2.find_all("article"):
      data = _parse_article(article)
      if data is None:
        continue
      # Record which search term and tag this post was found under.
      data["search_term"] = search_term
      data["url_tag"] = url_tag
      data["tag_name"] = tag_name
      posts.append(data)

Processing Term : soft skills: 100%|██████████| 29/29 [00:50<00:00,  1.73s/it]
Processing Term : collaboration: 100%|██████████| 30/30 [00:52<00:00,  1.75s/it]
Processing Term : interpersonal communication: 0it [00:00, ?it/s]
Processing Term : communication: 100%|██████████| 30/30 [00:58<00:00,  1.93s/it]
Processing Term : creativity: 100%|██████████| 30/30 [00:54<00:00,  1.81s/it]
Processing Term : relationship curation: 0it [00:00, ?it/s]
Processing Term : self awareness: 100%|██████████| 13/13 [00:23<00:00,  1.79s/it]
Processing Term : emotional intelligence: 100%|██████████| 5/5 [00:09<00:00,  1.87s/it]
Processing Term : leadership: 100%|██████████| 30/30 [01:00<00:00,  2.03s/it]
Processing Term : persistence: 100%|██████████| 27/27 [00:48<00:00,  1.81s/it]
Processing Term : planning: 100%|██████████| 29/29 [00:53<00:00,  1.84s/it]
Processing Term : stress management: 100%|██████████| 14/14 [00:24<00:00,  1.76s/it]


In [None]:
# Total number of posts collected by the pipeline.
len(posts)

1165

In [None]:
# One row per scraped post; the dictionary keys become the columns.
df = pd.DataFrame(posts)

In [None]:
# Map each scraped key to its presentation label by NAME. Assigning a list to
# `df.columns` relabels by position, which silently mislabels data whenever the
# first scraped post lacks a field (column order follows first-seen keys) and
# raises if a field never occurs at all.
COLUMN_LABELS = {
    "rtime": "Reading Time",
    "article_url": "URL",
    "heading": "Heading",
    "text_preview": "Content",
    "image_url": "Image URL",
    "search_term": "Skill",
    "url_tag": "URL Tag",
    "tag_name": "Tag",
}
# rename() is a no-op for already-renamed columns, so re-running the cell is
# safe; reindex() fixes the column order and adds any absent column as NaN.
df = df.rename(columns=COLUMN_LABELS).reindex(columns=list(COLUMN_LABELS.values()))

In [None]:
# Summarise column dtypes and non-null counts to spot missing fields.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1165 entries, 0 to 1164
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Reading Time  1165 non-null   object
 1   URL           1165 non-null   object
 2   Heading       1165 non-null   object
 3   Content       1163 non-null   object
 4   Image URL     1100 non-null   object
 5   Skill         1165 non-null   object
 6   URL Tag       1165 non-null   object
 7   Tag           1165 non-null   object
dtypes: object(8)
memory usage: 72.9+ KB


In [None]:
# Preview the first rows of the dataframe.
df.head()

Unnamed: 0,Reading Time,URL,Heading,Content,Image URL,Skill,URL Tag,Tag
0,14 min read,https://medium.com/swlh/how-to-lead-when-you-h...,How To Lead When You Have No Authority,Four Pillars to Increase Your Influence Both a...,https://miro.medium.com/fit/c/224/224/1*4LGVOi...,soft skills,soft-skills,Soft Skills
1,10 min read,https://medium.com/newco/hard-and-soft-skills-...,Hard and Soft Skills in Tech,It’s both more serious and less serious than w...,https://miro.medium.com/fit/c/224/224/1*KOzo8n...,soft skills,soft-skills,Soft Skills
2,8 min read,https://medium.com/hackernoon/the-one-essentia...,The one essential skill that will set you apar...,and how you can hone this skill in five easy w...,https://miro.medium.com/fit/c/224/224/1*dhwHUl...,soft skills,soft-skills,Soft Skills
3,7 min read,https://medium.com/hackernoon/10-soft-skills-e...,10 Soft Skills Every Developer Needs,Oxford Dictionary describes soft skills as: Pe...,https://miro.medium.com/fit/c/224/224/1*A-1Rzp...,soft skills,soft-skills,Soft Skills
4,10 min read,https://medium.com/bridging-the-gap-between-ju...,Bridging the Gap Between Junior and Senior Eng...,Bridging the Gap Between Junior and Senior Eng...,https://miro.medium.com/fit/c/224/224/1*PA_JLj...,soft skills,soft-skills,Soft Skills


In [None]:
# Number of collected posts per value of the "Skill" column.
df["Skill"].value_counts()

leadership                262
communication             218
creativity                155
planning                  133
persistence               111
collaboration              89
soft skills                69
self awareness             62
stress management          42
emotional intelligence     24
Name: Skill, dtype: int64

In [None]:
# Save the dataframe to a CSV file, leaving out the row index.
df.to_csv("Medium Posts v1.csv", index=False)