In [30]:
!pip install PyPDF2
!pip install --upgrade pip
!pip install newspaper3k
!pip install htmldate
!pip install xlsxwriter

[0m

In [12]:
import pandas as pd
import PyPDF2
import os
import requests
from bs4 import BeautifulSoup
import tldextract
import time
import random
import newspaper
from newspaper import Article, fulltext, Config, ArticleException
import re
from htmldate import find_date

In [13]:
# Open PDF file
PDFFile = open('weekly_raw_data/merged_pdf/Weekly Covearge Report FY24 October 05 to October 19.pdf','rb')
PDF = PyPDF2.PdfReader(PDFFile)
pages = len(PDF.pages)
key = '/Annots'
uri = '/URI'
ank = '/A'

In [14]:
urls_list = []

In [15]:
for page in range(pages):
    print("Current Page: {}".format(page))
    pageSliced = PDF.pages[page]
    pageObject = pageSliced.get_object()
    if key in pageObject.keys():
        ann = pageObject[key]
        for a in ann:
            try:
                u = a.get_object()
                if uri in u[ank].keys():
                    urls_list.append(u[ank][uri])
            except KeyError:
                pass

Current Page: 0
Current Page: 1
Current Page: 2
Current Page: 3
Current Page: 4
Current Page: 5


In [16]:
urls = pd.DataFrame(urls_list, columns=['url'])
urls.reset_index(drop=True, inplace=True)
print(urls.shape)
urls.head()

(112, 1)


Unnamed: 0,url
0,https://microsoft.sharepoint.com/teams/LatAmPR...
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...
2,https://microsoft.sharepoint.com/teams/LatAmPR...
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...
4,https://www.bluradio.com/tecnologia/microsoft-...


### Define Required Functions

In [18]:
## 2. Links
def extract_link(url):
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        link = soup.find('link', rel='canonical')['href']
        print(link)
        return link
    except Exception as e:
        print(e)
        print(f'Error in {url} not found')
        return 'Review'

In [19]:
def extract_link(url):
    # Try to extract the link with BeautifulSoup
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "html.parser")
        link = soup.find("link", rel="canonical")["href"]
        print(f"Link extraído con éxito: {link}")
        return link
    except Exception as e:
        print(f"Error al extraer el enlace con BeautifulSoup: {e}")
        # If there was an error, try to extract the link with Newspaper3k
        try:
            article = Article(url)
            article.download()
            article.parse()
            link = article.canonical_link
            print(f"Link extraído con éxito: {link}")
            return link
        except Exception as e:
            print(f"Error al extraer el enlace con Newspaper3k: {e}")
            print(f"Error al extraer el enlace en {url}")
            return "Revisar"

In [20]:
def correct_link(url):
    if re.search('onedrive', url):
        return url
    else:
        link = extract_link(url)
        return link

In [21]:
def check_url(url):
    try:
        r = requests.get(url)
        if r.status_code == 200:
            return r.status_code, url, 'Success'
        else:
            return r.status_code, url, 'Error'
    except Exception as e:
        return 'Error', url, e

In [22]:
# 3. Domains [Outlets]
def get_domain(url):
    try:
        ext = tldextract.extract(url)
        domain = ext.domain
        suffix = ext.suffix
        subdomain = ext.subdomain
        if subdomain == 'www':
            subdomain = ''
        else:
            subdomain = subdomain
        return subdomain + domain + '.' + suffix
    except Exception as e:
        print(e)
        return f"error {e}"

In [23]:
# 4. Titles
def extract_title(url):
    try:
        config = Config()
        config.memoize_articles = False
        article = Article(url, config=config)
        article.download()
        article.parse()
        title = article.title
        print(title)
        return title
    except Exception as e:
        print(e)
        print(f'Error in {url} not found')
        return None

In [24]:
def extract_title_2(url):
    try:
        # Intentamos extraer el título con BeautifulSoup
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        title = soup.find('title').text
        print(title)
        return title
    except Exception as e:
        print(f'Error in {url} not found')
        # Si hay un error al extraer el título con BeautifulSoup, intentamos con newspaper3k
        try:
            article = Article(url)
            article.download()
            article.parse()
            title = article.title
            print(title)
            return title
        except Exception as e:
            print(e)
            print(f'Error in {url} not found')
            return "Review Manually"

In [25]:
## 6. Content
'''
Content extraction
'''
#Diffbot API
user =      'katia.bedolla@porternovelli.mx'
API_TOKEN = '7a7668b8111a6e4d5750c12a8c93b56d'

class DiffbotClient(object):

    base_url = 'http://api.diffbot.com/'

    def request(self, url, token, api, fields=None, version=3, **kwargs):
        """
        Returns a python object containing the requested resource from the diffbot api
        """
        params = {"url": url, "token": token}
        if fields:
            params['fields'] = fields
        params.update(kwargs)
        response = requests.get(self.compose_url(api, version), params=params)
        response.raise_for_status()
        return response.json()

    def compose_url(self, api, version_number):
        """
        Returns the uri for an endpoint as a string
        """
        version = self.format_version_string(version_number)
        return '{}{}/{}'.format(self.base_url, version, api)
    @staticmethod
    def format_version_string(version_number):
        """
        Returns a string representation of the API version
        """
        return 'v{}'.format(version_number)

In [27]:
def get_content_diffbot(url):
    diffbot = DiffbotClient()
    token = API_TOKEN
    api = "analyze"
    try:
        response = diffbot.request(url, token, api)
        if 'objects' in response:
            if len(response['objects'])>0:
                if 'text' in response['objects'][0]:
                    return response['objects'][0]['text']
                else:
                    return "No Content"
            else:
                return "Empty URL, Nothing found"
        else:
            return "Empty URL, Nothing found"
    except:
        return "Something went wrong with url"

user_agent_list = [
   #Chrome
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',

    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    #Firefox
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

##### ----- Function: Define the function using newspaper3k ----- #####
def get_content_news(url): 
    user_agent = random.choice(user_agent_list)
    config = Config()
    config.browser_user_agent = user_agent
    time.sleep(.05)  
    a  =  Article(url, config=config)
    try:
        a.download()
        a.parse()
        paragraphs = a.text
        paragraphs = re.sub(r'<a href=.+?(?=)>|<br/>|\\xa0|\n|</a>|\xa0|<strong>|</strong>|<br/>•|<i(.*?)</i>|<img(.*?)>','',str(paragraphs))
        if type(paragraphs)==str and  len(paragraphs) >0:
            print("URL Content from {} is correct".format(url))
            return paragraphs
        elif paragraphs == '' or  type(paragraphs) == newspaper.article.ArticleException:
            ext_diff= get_content_diffbot(url)
            print("URL Content from {} is correct from diffbot".format(url))
            return ext_diff
    except Exception as exce:
        print("URL Content from {} is OtherError".format(url))
        try:
            ext_diff= get_content_diffbot(url)
            print("URL Content from {} is correct from diffbot".format(url))
            return ext_diff
        except:
            print("URL Content from {} is OtherError".format(url))

In [28]:
# 7. Dates
def extract_date(url):
    try:
        date = find_date(url)
        print(date)
        return date
    except Exception as e:
        print(e)
        pass

In [29]:
urls['link'] = urls['url'].apply(extract_link)
print(urls.shape)
urls.head()

Error al extraer el enlace con BeautifulSoup: 'NoneType' object is not subscriptable
Error al extraer el enlace con Newspaper3k: Article `download()` failed with 403 Client Error:  for url: https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678ae3d2-7027-4ce6-b611-34be08de616c&FolderCTID=0x012000C0EB2CC593C3954DA9A068711C01B7AF&id=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20-%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th%2F2023.10.%2019_IA%2C%20Governan%C3%A7a%20e%20o%20futuro%20das%20companhias%20_%20ESG%20_%20Valor%20Econ%C3%B4mico.pdf&viewid=bdc81d78-4deb-497c-8336-338994e758bb&parent=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20-%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th on URL https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678

Unnamed: 0,url,link
0,https://microsoft.sharepoint.com/teams/LatAmPR...,Revisar
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...
2,https://microsoft.sharepoint.com/teams/LatAmPR...,Revisar
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...


In [33]:
urls.to_excel('weekly_processed/Weekly Covearge Report FY24 October 05 to October 19.xlsx', index=False, engine='xlsxwriter')

## Read Excel File - Processing

In [62]:
urls = pd.read_excel('weekly_processed/Weekly Covearge Report FY24 October 05 to October 19.xlsx')
print(urls.shape)
urls.head()

(112, 2)


Unnamed: 0,url,link
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...


In [63]:
urls['outlet'] = urls['link'].apply(get_domain)
print(urls.shape)
urls.head()

(112, 3)


Unnamed: 0,url,link,outlet
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com


In [64]:
urls['title'] = urls['link'].apply(extract_title)
print(urls.shape)
urls.head()

Article `download()` failed with 403 Client Error: Forbidden for url: https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678ae3d2-7027-4ce6-b611-34be08de616c&FolderCTID=0x012000C0EB2CC593C3954DA9A068711C01B7AF&id=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20-%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th%2F2023.10.%2019_IA%2C%20Governan%C3%A7a%20e%20o%20futuro%20das%20companhias%20_%20ESG%20_%20Valor%20Econ%C3%B4mico.pdf&viewid=bdc81d78-4deb-497c-8336-338994e758bb&parent=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20-%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th on URL https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678ae3d2%2D7027%2D4ce6%2Db611%2D34be08de616c&FolderCTID=0x012000C0EB2CC593C3954DA9A068711C01B7AF&id=%2Fteams%2FLatAmPR2%2FS

Unnamed: 0,url,link,outlet,title
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...


In [65]:
urls['title_2'] = urls['link'].apply(extract_title_2)
print(urls.shape)
urls.head()

Error in https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678ae3d2%2D7027%2D4ce6%2Db611%2D34be08de616c&FolderCTID=0x012000C0EB2CC593C3954DA9A068711C01B7AF&id=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20%2D%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th%2F2023%2E10%2E%2019%5FIA%2C%20Governan%C3%A7a%20e%20o%20futuro%20das%20companhias%20%5F%20ESG%20%5F%20Valor%20Econ%C3%B4mico%2Epdf&viewid=bdc81d78%2D4deb%2D497c%2D8336%2D338994e758bb&parent=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20%2D%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th not found
Article `download()` failed with 403 Client Error: Forbidden for url: https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678ae3d2-7027-4ce6-b611-34be08de616c&FolderCTID=0x012000C0EB2CC593C3954DA9A0687

Unnamed: 0,url,link,outlet,title,title_2
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...,Microsoft disponibiliza DALL-E 3 totalmente DE...
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...,Como a IA pode solucionar problemas e gerar op...
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...,Microsoft lanza nuevas herramientas de IA para...


In [66]:
# 5. Source
urls['source'] = 'Weekly'
print(urls.shape)
urls.head()

(112, 6)


Unnamed: 0,url,link,outlet,title,title_2,source
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...,Microsoft disponibiliza DALL-E 3 totalmente DE...,Weekly
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...,Como a IA pode solucionar problemas e gerar op...,Weekly
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...,Microsoft lanza nuevas herramientas de IA para...,Weekly


In [67]:
'''
Extract Content: CIMS Coverage
Newspaper3k implementarion
'''
start = time.time()
try:
    for index,row in urls.iterrows():
        if (row['source']=='Weekly'):
            urls.at[index, 'content'] =  get_content_news(row['link'])
            print(index)
except Exception as e:
    print(e)
    pass
end = time.time()
print(end)

print(urls.shape)
urls.head()

URL Content from https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678ae3d2%2D7027%2D4ce6%2Db611%2D34be08de616c&FolderCTID=0x012000C0EB2CC593C3954DA9A068711C01B7AF&id=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20%2D%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th%2F2023%2E10%2E%2019%5FIA%2C%20Governan%C3%A7a%20e%20o%20futuro%20das%20companhias%20%5F%20ESG%20%5F%20Valor%20Econ%C3%B4mico%2Epdf&viewid=bdc81d78%2D4deb%2D497c%2D8336%2D338994e758bb&parent=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20%2D%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th is correct from diffbot
0
URL Content from https://escolaeducacao.com.br/microsoft-disponibiliza-dall-e-3-totalmente-de-graca-no-bing-chat-veja/ is correct
1
URL Content from https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?c

Unnamed: 0,url,link,outlet,title,title_2,source,content
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...,Microsoft disponibiliza DALL-E 3 totalmente DE...,Weekly,O DALL-E 3 agora pode ser utilizado através do...
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...,Como a IA pode solucionar problemas e gerar op...,Weekly,As possibilidades geradas pela utilização dos ...
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...,Microsoft lanza nuevas herramientas de IA para...,Weekly,El gigante tecnológico Microsoft anunció este ...


In [68]:
urls['date'] = urls['link'].apply(extract_date)
print(urls.shape)
urls.head()

("URL couldn't be processed: %s", 'https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678ae3d2%2D7027%2D4ce6%2Db611%2D34be08de616c&FolderCTID=0x012000C0EB2CC593C3954DA9A068711C01B7AF&id=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20%2D%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th%2F2023%2E10%2E%2019%5FIA%2C%20Governan%C3%A7a%20e%20o%20futuro%20das%20companhias%20%5F%20ESG%20%5F%20Valor%20Econ%C3%B4mico%2Epdf&viewid=bdc81d78%2D4deb%2D497c%2D8336%2D338994e758bb&parent=%2Fteams%2FLatAmPR2%2FShared%20Documents%2FGeneral%2FFY24%2FWeekly%20Newsletter%20%2D%20Americas%20Comms%20News%20%26%20Views%2FSubs%2FBrazil%2FOctober%2019th')
2023-10-18
("URL couldn't be processed: %s", 'https://microsoft.sharepoint.com/teams/LatAmPR2/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=EgXWx8&cid=678ae3d2%2D7027%2D4ce6%2Db611%2D34be08de616c&FolderCTID=0x012000C0EB2CC593C3954DA9A06

Unnamed: 0,url,link,outlet,title,title_2,source,content,date
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...,Microsoft disponibiliza DALL-E 3 totalmente DE...,Weekly,O DALL-E 3 agora pode ser utilizado através do...,2023-10-18
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...,Como a IA pode solucionar problemas e gerar op...,Weekly,As possibilidades geradas pela utilização dos ...,2023-10-18
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...,Microsoft lanza nuevas herramientas de IA para...,Weekly,El gigante tecnológico Microsoft anunció este ...,2023-10-10


In [69]:
media_list_fy24= pd.read_excel("FY24_media_list_demo.xlsx", sheet_name= "FY24 Media List Americas")
media_list_fy24= media_list_fy24[['Outlet', 'Media Type', 'Country', 'Region', 'Women', 'Men', '18-24', '25-34', '35-44', '45+']]
print(media_list_fy24.shape)
media_list_fy24.head()

(146, 10)


Unnamed: 0,Outlet,Media Type,Country,Region,Women,Men,18-24,25-34,35-44,45+
0,adrenaline.com.br,Consumer,Brazil,Americas,0.187832,0.812168,0.353864,0.354373,0.149176,0.142587
1,advisor.ca,Business,Canada,Americas,,,,,,
2,ambito.com,Business,Argentina,Americas,0.372917,0.627083,0.154652,0.287253,0.207571,0.350524
3,andina.pe,Business,Peru,Americas,0.478957,0.521043,0.302806,0.302415,0.172419,0.22236
4,automationmag.com,Tech,Canada,Americas,,,,,,


In [70]:
urls.rename(columns={'outlet':'Outlet'}, inplace=True)
print(urls.shape)
urls.head()

(112, 8)


Unnamed: 0,url,link,Outlet,title,title_2,source,content,date
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...,Microsoft disponibiliza DALL-E 3 totalmente DE...,Weekly,O DALL-E 3 agora pode ser utilizado através do...,2023-10-18
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...,Como a IA pode solucionar problemas e gerar op...,Weekly,As possibilidades geradas pela utilização dos ...,2023-10-18
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...,Microsoft lanza nuevas herramientas de IA para...,Weekly,El gigante tecnológico Microsoft anunció este ...,2023-10-10


In [72]:
weekly_df = urls.merge(media_list_fy24, on='Outlet', how='left')
print(weekly_df.shape)
weekly_df.head()

(112, 17)


Unnamed: 0,url,link,Outlet,title,title_2,source,content,date,Media Type,Country,Region,Women,Men,18-24,25-34,35-44,45+
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,,,,,,,,,,
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...,Microsoft disponibiliza DALL-E 3 totalmente DE...,Weekly,O DALL-E 3 agora pode ser utilizado através do...,2023-10-18,,,,,,,,,
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,,,,,,,,,,
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...,Como a IA pode solucionar problemas e gerar op...,Weekly,As possibilidades geradas pela utilização dos ...,2023-10-18,,,,,,,,,
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...,Microsoft lanza nuevas herramientas de IA para...,Weekly,El gigante tecnológico Microsoft anunció este ...,2023-10-10,General_News,Colombia,Americas,0.442366,0.557634,0.23147,0.373815,0.175242,0.219473


In [73]:
# Fill NaN with 'NA'
weekly_df = weekly_df.fillna('NA')
print(weekly_df.shape)
weekly_df.head()

(112, 17)


Unnamed: 0,url,link,Outlet,title,title_2,source,content,date,Media Type,Country,Region,Women,Men,18-24,25-34,35-44,45+
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,,,,,,,,,,
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...,Microsoft disponibiliza DALL-E 3 totalmente DE...,Weekly,O DALL-E 3 agora pode ser utilizado através do...,2023-10-18,,,,,,,,,
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,,,,,,,,,,
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...,Como a IA pode solucionar problemas e gerar op...,Weekly,As possibilidades geradas pela utilização dos ...,2023-10-18,,,,,,,,,
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...,Microsoft lanza nuevas herramientas de IA para...,Weekly,El gigante tecnológico Microsoft anunció este ...,2023-10-10,General_News,Colombia,Americas,0.442366,0.557634,0.23147,0.373815,0.175242,0.219473


### FY24 ###


This function takes a string argument 'country' and returns a string representing the region the country belongs to.

The regions are defined as follows:

- 'Brazil' for Brazil
- 'Argentina' for Argentina
- 'Mexico' for Mexico
- 'Chile' for Chile
- 'Andean Region' for Colombia, Bolivia, Ecuador, Paraguay, Peru and Uruguay
- 'Central' for Central America countries, Costa Rica, Guatemala, Honduras, Panama, El Salvador, Dominican Republic, Republica Dominicana, Jamaica, Puerto Rico, Trinidad y Tobago and Trinidad and Tobago
- 'Other' for any other country

In [74]:
def get_region(country):
    if country == 'Brazil':
        return 'Brazil'
    elif country == 'Argentina':
        #return 'Hispanic South America'
        return 'Spanish South America'
    elif country == 'Mexico':
        return 'Mexico'
    elif country == 'Chile':
        #return 'Hispanic South America'
        return 'Spanish South America'    
    elif country in ['Colombia', 'Bolivia', 'Ecuador', 'Paraguay', 'Peru', 'Uruguay', 'Perú']:
        #return 'Hispanic South America'
        return 'Spanish South America'
    elif country in ['Central', 'Costa Rica', 'Guatemala', 'Honduras', 'Panama', 'El Salvador', 'Dominican Republic', 'Republica Dominicana', 'Jamaica', 'Puerto Rico', 'Trinidad y Tobago', 'Trinidad and Tobago']:
        #return 'Central'
        return 'Central Caribbean'
    else:
        return 'Other'

In [76]:
weekly_df['Subsidiary']= weekly_df['Country'].apply(get_region)
print(weekly_df.shape)
weekly_df.head()

(112, 18)


Unnamed: 0,url,link,Outlet,title,title_2,source,content,date,Media Type,Country,Region,Women,Men,18-24,25-34,35-44,45+,Subsidiary
0,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,,,,,,,,,,,Other
1,https://protect-eu.mimecast.com/s/4YhyCLYo9HQw...,https://escolaeducacao.com.br/microsoft-dispon...,escolaeducacao.com.br,Microsoft disponibiliza DALL-E 3 totalmente DE...,Microsoft disponibiliza DALL-E 3 totalmente DE...,Weekly,O DALL-E 3 agora pode ser utilizado através do...,2023-10-18,,,,,,,,,,Other
2,https://microsoft.sharepoint.com/teams/LatAmPR...,https://microsoft.sharepoint.com/teams/LatAmPR...,microsoftsharepoint.com,,Review Manually,Weekly,Something went wrong with url,,,,,,,,,,,Other
3,https://protect-eu.mimecast.com/s/XBVACNEqkTjP...,https://saudedigitalnews.com.br/18/10/2023/com...,saudedigitalnews.com.br,Como a IA pode solucionar problemas e gerar op...,Como a IA pode solucionar problemas e gerar op...,Weekly,As possibilidades geradas pela utilização dos ...,2023-10-18,,,,,,,,,,Other
4,https://www.bluradio.com/tecnologia/microsoft-...,https://www.bluradio.com/tecnologia/microsoft-...,bluradio.com,Microsoft lanza nuevas herramientas de IA para...,Microsoft lanza nuevas herramientas de IA para...,Weekly,El gigante tecnológico Microsoft anunció este ...,2023-10-10,General_News,Colombia,Americas,0.442366,0.557634,0.23147,0.373815,0.175242,0.219473,Spanish South America


In [77]:
weekly_df.to_excel('weekly_processed/Report Latam Comms Report October 05th - October 19th_SCRIPT_VERSION.xlsx', index=False, engine='xlsxwriter')