# **El Tiempo by Day**

In [8]:
#!pip install requests tqdm beautifulsoup4 retry

In [9]:
import os
import requests
import pandas as pd
from datetime import datetime, timedelta
import re
import math
from tqdm import tqdm
from bs4 import BeautifulSoup
from retry import retry

# Absolute path of the directory the notebook is executed from
base_directory = os.path.abspath(os.getcwd())

# Output folder for the scraped data: <parent of base_directory>/data/el_tiempo
data_directory = os.path.abspath(
    os.path.join(base_directory, '..', 'data', 'el_tiempo')
)

In [10]:
# Parameters
start_date = datetime(2020, 1, 12) # First day of the search range (year, month, day)
end_date = datetime(2023, 8, 31) # Date windows are generated while their start is before this date
days = 1 # Length of each date window in days; must be greater than 0 (a zero-length window can never advance through the range)
search_query = 'Policia' # Search term inserted into the El Tiempo search URL

In [11]:
def generate_date_range(start_date, end_date, days_range=1):
    """Split the span from ``start_date`` to ``end_date`` into consecutive windows.

    Parameters
    ----------
    start_date : datetime.datetime
        Start of the first window.
    end_date : datetime.datetime
        Windows are produced while their start is strictly before this date.
    days_range : int or float, default 1
        Length of each window in days. Must be positive.

    Returns
    -------
    list of tuple of (str, str)
        ``(window_start, window_end)`` pairs formatted as ``YYYY-MM-DD``.
        The end of one window is the start of the next. The last window may
        end after ``end_date`` when the span is not a multiple of
        ``days_range``. Empty when ``start_date`` is not before ``end_date``.

    Raises
    ------
    ValueError
        If ``days_range`` does not produce a positive step; such a step would
        never advance and the loop would not terminate.
    """
    step = timedelta(days=days_range)
    # Compare the timedelta itself so that tiny positive values that round to
    # a zero-length step are rejected as well.
    if step <= timedelta(0):
        raise ValueError(f'days_range must be greater than 0, got {days_range!r}')

    date_range = []
    current_date = start_date
    while current_date < end_date:
        next_date = current_date + step
        date_range.append((current_date.strftime('%Y-%m-%d'), next_date.strftime('%Y-%m-%d')))
        current_date = next_date

    return date_range

# Build the list of (start, end) date-string windows that will be queried, then display it
dates = generate_date_range(start_date, end_date, days_range=days)
dates

[('2020-01-12', '2020-01-13'),
 ('2020-01-13', '2020-01-14'),
 ('2020-01-14', '2020-01-15'),
 ('2020-01-15', '2020-01-16'),
 ('2020-01-16', '2020-01-17'),
 ('2020-01-17', '2020-01-18'),
 ('2020-01-18', '2020-01-19'),
 ('2020-01-19', '2020-01-20'),
 ('2020-01-20', '2020-01-21'),
 ('2020-01-21', '2020-01-22'),
 ('2020-01-22', '2020-01-23'),
 ('2020-01-23', '2020-01-24'),
 ('2020-01-24', '2020-01-25'),
 ('2020-01-25', '2020-01-26'),
 ('2020-01-26', '2020-01-27'),
 ('2020-01-27', '2020-01-28'),
 ('2020-01-28', '2020-01-29'),
 ('2020-01-29', '2020-01-30'),
 ('2020-01-30', '2020-01-31'),
 ('2020-01-31', '2020-02-01'),
 ('2020-02-01', '2020-02-02'),
 ('2020-02-02', '2020-02-03'),
 ('2020-02-03', '2020-02-04'),
 ('2020-02-04', '2020-02-05'),
 ('2020-02-05', '2020-02-06'),
 ('2020-02-06', '2020-02-07'),
 ('2020-02-07', '2020-02-08'),
 ('2020-02-08', '2020-02-09'),
 ('2020-02-09', '2020-02-10'),
 ('2020-02-10', '2020-02-11'),
 ('2020-02-11', '2020-02-12'),
 ('2020-02-12', '2020-02-13'),
 ('2020-

In [12]:
# Strip surrounding whitespace from both strings of every pair in the list
def clean_spaces(tuples_list):
    """Return a new list of 2-tuples with each string stripped.

    Every item of ``tuples_list`` must unpack into exactly two strings;
    leading and trailing whitespace is removed from both of them. The input
    list is not modified.
    """
    stripped_pairs = []
    for first, second in tuples_list:
        stripped_pairs.append((first.strip(), second.strip()))
    return stripped_pairs


def get_news_title(soup):
    """Extract (title, epigraph) text pairs from a parsed search-results page.

    ``soup`` is searched for elements whose class is ``'title page-link'`` and
    ``'epigraph page-link'``. When both searches find something, the texts are
    paired by position and returned as a list of 2-tuples; otherwise an empty
    list is returned.

    NOTE(review): pairing is purely positional and stops at the shorter of the
    two lists, so a result without an epigraph would shift the following
    pairs -- confirm against the page markup.
    """
    headline_nodes = soup.find_all(class_='title page-link')
    summary_nodes = soup.find_all(class_='epigraph page-link')

    # Nothing to pair unless both kinds of element are present
    if not (headline_nodes and summary_nodes):
        return []

    headline_texts = [node.text for node in headline_nodes]
    summary_texts = [node.text for node in summary_nodes]
    return list(zip(headline_texts, summary_texts))
    
# Retry forever on any exception (as before), but wait between attempts:
# 1 s after the first failure, doubling each time, capped at 60 s, so a
# struggling server is not hit in a tight loop.
@retry(exceptions=Exception, tries=-1, delay=1, backoff=2, max_delay=60)
def get_url(url, timeout=30):
    """Fetch ``url`` with ``requests.get``, retrying until a response arrives.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, default 30
        Passed to ``requests.get``. Without a timeout a stalled connection
        blocks indefinitely and the retry logic never gets a chance to run;
        with it, a stall raises and the call is retried.

    Returns
    -------
    requests.Response
        The response of the first attempt that does not raise. The HTTP status
        is not checked here; callers inspect ``status_code`` themselves.
    """
    return requests.get(url, timeout=timeout)

In [15]:
def get_results_by_day(dates):
    """Scrape the El Tiempo search page for every date window in ``dates``.

    For each window the search for the module-level ``search_query`` in the
    ``bogota`` category is requested. The number of results is read from the
    element with class ``search-results-title`` and the (title, epigraph)
    pairs are collected from every result page through ``get_news_title``.

    Parameters
    ----------
    dates : iterable of sequence
        Each item provides the window start at index 0 and the window end at
        index 1, as strings that are inserted verbatim into the URL.

    Returns
    -------
    pandas.DataFrame
        Columns ``start_date``, ``end_date``, ``results`` and ``news``, with
        one row per window whose first page answered with HTTP 200.
        ``results`` is 0 when the results heading or its number is missing.
        ``news`` is the list of whitespace-stripped (title, epigraph) pairs.

    Notes
    -----
    A window whose first query fails is reported with ``print`` and left out
    of the frame. A failed follow-up page is reported as well; the row is kept
    with the news gathered from the other pages. Every window yields either a
    complete row or no row at all, so the columns can never end up with
    different lengths.
    """
    results_per_page = 10  # page size the pagination arithmetic relies on
    rows = []

    for window in tqdm(dates):
        date_from, date_until = window[0], window[1]

        # Query string shared by the first page and every follow-up page
        filters = (
            f'?q={search_query}&category=bogota'
            f'&publishedAt%5Bfrom%5D={date_from}&publishedAt%5Buntil%5D={date_until}'
        )
        url = f'https://www.eltiempo.com/buscar{filters}'

        # Start scraping
        response = get_url(url)
        if response.status_code != 200:
            # Skip the window entirely instead of recording half a row
            print("Failed query at", url)
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Number of results: first standalone integer in the results heading.
        # Stays 0 when the heading or the number cannot be found.
        number_result = 0
        target_element = soup.find(class_='search-results-title')
        if target_element:
            match = re.search(r'\b\d+\b', target_element.text)
            if match:
                number_result = int(match.group())

        window_news = []
        if number_result > 0:
            # Page 1 is the response that was already parsed above
            window_news.extend(get_news_title(soup))

            total_pages = math.ceil(number_result / results_per_page)
            for page in range(2, total_pages + 1):
                sub_url = f'https://www.eltiempo.com/buscar/{page}{filters}'

                sub_response = get_url(sub_url)
                if sub_response.status_code == 200:
                    sub_soup = BeautifulSoup(sub_response.content, 'html.parser')
                    window_news.extend(get_news_title(sub_soup))
                else:
                    print("Failed sub-query at", sub_url)

        rows.append({
            'start_date': date_from,
            'end_date': date_until,
            'results': number_result,
            'news': window_news,
        })

    # Explicit columns keep the schema stable even when no window succeeded
    df_results = pd.DataFrame(rows, columns=['start_date', 'end_date', 'results', 'news'])

    # Clean columns
    df_results['news'] = df_results['news'].apply(clean_spaces)

    return df_results

# Run the scraper over every date window and display the resulting frame
df_results = get_results_by_day(dates)
df_results

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [01:06<00:00,  3.33s/it]


Unnamed: 0,start_date,end_date,results,news
0,2020-01-12,2020-01-13,5,[(Se reporta manifestaciones en la avenida Car...
1,2020-01-13,2020-01-14,8,[(El insólito pasado del taxista que insultó a...
2,2020-01-14,2020-01-15,9,[(Hacinamiento superior al 50 % aqueja a las U...
3,2020-01-15,2020-01-16,13,[(No para pesadilla de taquillera de TransMile...
4,2020-01-16,2020-01-17,16,[(Ladrones usaron gas pimienta para reducir a ...
5,2020-01-17,2020-01-18,13,[(28 heridos por accidente de bus sin frenos q...
6,2020-01-18,2020-01-19,6,[(¿Quiénes son y qué papel tendrán las madres ...
7,2020-01-19,2020-01-20,7,"[(Rechazan muerte de Zeus, el perro que murió ..."
8,2020-01-20,2020-01-21,12,[(Empresa mexicana sale al ruedo en la plaza d...
9,2020-01-21,2020-01-22,13,[(Las lecciones que dejó el protocolo en las m...


In [25]:
# Inspect the list of (title, epigraph) pairs stored in the 'news' column at index label 0
df_results['news'][0]

[('Se reporta manifestaciones en la avenida Caracas',
  'TransMilenio informó que se presentaron dificultades en la operación debido a la manifestación.'),
 ('Policía responde por caso de agente que esposó a taquillera',
  'Empleada de Recaudo trató de evitar el ingreso gratuito de agentes de civil, como dice el protocolo.'),
 ('Impresionante robo con fusiles, chalecos antibalas y un taxi destruido',
  'Cuatro delincuentes asaltaron un restaurante de comida china y fueron perseguidos por la policía.'),
 ('Policías esposan a taquillera de TM por negar ingreso gratis a agentes',
  'Estaban de civil. La norma indica que el libre ingreso se permite si están uniformados.'),
 ('Buscan a Thor, perro perdido en el centro de Bogotá',
  'Se trata de un labrador amarillo de aproximadamente un año.')]

In [16]:
# Summary statistics of the frame's numeric data (the per-window result counts)
df_results.describe()

Unnamed: 0,results
count,20.0
mean,9.1
std,4.140938
min,2.0
25%,5.75
50%,9.0
75%,13.0
max,16.0


In [26]:
# Days without results: rows whose result count is 0
df_results[df_results['results'] == 0]

Unnamed: 0,start_date,end_date,results,news


In [24]:
# Save the scraped results as CSV, named after the query and the date range.
# The date strings are formatted outside the f-string: reusing the enclosing
# quote character inside a replacement field is a SyntaxError before Python 3.12.
range_start = start_date.strftime('%Y-%m-%d')
range_end = end_date.strftime('%Y-%m-%d')
output_path = os.path.join(data_directory, f'{search_query}_{range_start}_{range_end}.csv')

# to_csv does not create missing folders, so make sure the target exists first
os.makedirs(data_directory, exist_ok=True)

# 'utf-8-sig' writes a byte-order mark at the start of the file
df_results.to_csv(output_path, encoding='utf-8-sig', index=False)