# Team Vaccinated: Milestone 1

## Part 1: Introduction

The goal of this practicum is to gather data from Facebook.

The following Python libraries are used:
 - Pandas: ...
 - Numpy: ...
 - BeautifulSoup: ...

## Part 2: Gather data

1. Open a page in Facebook.
2. Scroll down.
3. Ctrl+s to save page.

## Part 3: Run code

Run the code below to import the necessary libraries.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup, NavigableString

In [2]:
filename = 'Stop Mandatory Vaccination - Posts.html'
#filename = 'HealthCare.gov - Posts.html'
number_of_posts = 50

In [3]:
f, full_soup = None, None

try:
    f = open(filename, encoding='utf-8')
    full_soup = BeautifulSoup(f)
except Exception as e:
    print(e)

In [4]:
#Parse a given post's soup
def parse_post(post_soup):
    data = {}
    
    #Get date published
    data['timestamp'] = post_soup.find('span', class_='fsm fwn fcg').find('span', class_='timestampContent').text
    
    #Get text and links on the post
    data['links'] = []
    data['text'] = ''
    for p in post_soup.find_all('p'):
        if p.a:
            a_tags = p.a.extract()
            data['links'].append(a_tags.text.strip())
        data['text'] += p.text.strip()
    
    #Find if it has an image or not
    img = post_soup.find('img', {'class': 'scaledImageFitWidth img'})
    data['img_src'] = img.attrs['src'] if img and 'src' in img.attrs else None
    data['img-label'] = img.attrs['aria-label'] if img and 'aria-label' in img.attrs else None
    
    #Get the link info if it exists
    article = post_soup.find('div', {'class': 'mbs'})
    data['article_name'] = article.text if article and 'text' in article else None
    host = post_soup.find('div', {'class': '_6lz _6mb _1t62 ellipsis'})
    data['article_host'] = host.text if host and 'text' in host else None
    subtitle = post_soup.find('div', class_='_6m7 _3bt9')
    data['article_subtitle'] = subtitle.text if subtitle else None
    
    #Find other profiles if it has linked to them
    data['linked_profiles'] = [page.text for page in post_soup.find_all('span', class_='fwb')
                               if page.text != 'Stop Mandatory Vaccination']
    
    return data

In [5]:
#Parse all posts given a soup
def parse_posts(soup, limit=0):
    for i, post_child in enumerate(soup.find_all('div', class_='userContent')):
        if limit != 0 and i >= limit:
            break
        yield parse_post(post_child.parent)

In [6]:
posts = list(parse_posts(full_soup, limit=number_of_posts))
print('Number of posts: {}'.format(len(posts)))

Number of posts: 50


In [7]:
import pprint
pp = pprint.PrettyPrinter(indent=4, width=110)
for p in posts:
    pp.pprint(p['text'])

('Get educated. Seriously, read several books and learn and understand this topic inside and out so you can '
 'rebuke the pediatrician, the politician, your friends, your family, and everyone who may harass you for '
 'not vaccinating. And then, get into this fight with us and educate others. This is where you start, right '
 'here, with these excellent resources that medical professionals, scientists and activists have created for '
 'us so YOU can understand this vaccine topic more clearly. I suggest you purchase as many as you can '
 'afford: there is no such thing as being “overeducated” on this topic that affects us all.')
('Your host of The Thyroid Reset Summit, Dr. Justin Marchegiani, works with patients all over the world who '
 'are challenged by underlying thyroid issues. He created this health event to teach you to examine your '
 'health from all angles -- not just the convenient ones -- and find answers to continue your healing '
 "journey.He'll share that wisdom with you

In [9]:
word_count = 0
all_caps_count = 0

for p in posts:
    words = p['text'].split()
    word_count += len(words)
    for word in words:
        if word.isupper():
            all_caps_count += 1

if word_count == 0:
    percentage = 0
else:
    percentage = (all_caps_count / word_count) * 100

print('All caps percentage: {}'.format(percentage))

All caps percentage: 4.938271604938271
