In [43]:
import os, time
import requests
from tqdm import tqdm
import pandas as pd
from datetime import datetime

In [2]:
def retrieve_html(verbose=False):
    """Download the monthly climate pages for station ws-421810 and save them.

    One page is requested per month for every year in 2013-2019. Each response
    body is written, UTF-8 encoded, to ``database/html_data/<year>/<MM>.html``
    (relative to the current working directory); year folders are created as
    needed.

    Args:
        verbose: When true, each URL is written through ``tqdm.write`` right
            before it is requested, so the progress bars are not broken up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status. This
            stops an error page from being stored as if it were data.
    """
    for year in tqdm(range(2013, 2020)):
        # One folder per year; exist_ok avoids the check-then-create race.
        out_dir = f'database/html_data/{year}'
        os.makedirs(out_dir, exist_ok=True)

        for month in tqdm(range(1, 13), leave=False):
            # The site expects a zero-padded month; keep the loop variable an int.
            month_tag = f'{month:02d}'
            url = f'https://en.tutiempo.net/climate/{month_tag}-{year}/ws-421810.html'
            if verbose:
                tqdm.write(url)

            # A timeout keeps one stalled request from hanging the whole run.
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            with open(f'{out_dir}/{month_tag}.html', 'wb') as f:
                f.write(response.text.encode('utf-8'))

In [64]:
def url_to_df(url):
    """Fetch one monthly climate page and return its daily table.

    The month and year are read from the URL itself, so the function depends
    only on its argument and not on variables left over in the notebook.

    Args:
        url: Page address whose second-to-last path segment is
            ``<month>-<year>``, for example
            ``https://en.tutiempo.net/climate/01-2013/ws-421810.html``.

    Returns:
        pandas.DataFrame: The rows of the page's third HTML table (at most the
        first 31) whose ``Day`` value is numeric, indexed by ``Day`` converted
        to a POSIX timestamp. The timestamp is taken from a naive
        ``datetime(year, month, day)``, i.e. midnight in the local timezone.

    Raises:
        ValueError: If no ``<month>-<year>`` segment can be read from ``url``.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    try:
        month_text, year_text = url.rstrip('/').split('/')[-2].split('-')
        month, year = int(month_text), int(year_text)
    except (IndexError, ValueError):
        raise ValueError(f'cannot read <month>-<year> from url: {url!r}') from None

    # A timeout keeps one stalled request from hanging the whole run.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    text_utf = response.text.encode('utf-8')

    # read_html returns every table on the page; the one at index 2 is the
    # table wanted here. A month has at most 31 days, so keep at most 31 rows.
    table = pd.read_html(text_utf)[2][:31]

    # Shorter months can leave non-day rows inside the first 31 rows; coerce
    # them to NaN and drop them instead of crashing on int().
    day_numbers = pd.to_numeric(table['Day'], errors='coerce')
    daily = table[day_numbers.notna()].copy()  # copy: no chained assignment on a slice
    daily['Day'] = [
        datetime(year, month, int(day)).timestamp()
        for day in day_numbers.dropna()
    ]

    return daily.set_index('Day')



def web_data_to_dataframe():
    """Scrape every monthly page for 2013-2019 into a single DataFrame.

    Builds the page URL for each month, passes it to ``url_to_df`` and
    concatenates the resulting frames in chronological order.

    Returns:
        pandas.DataFrame: All monthly frames stacked row-wise, in the order
        January 2013 through December 2019.
    """
    monthly_frames = []

    for year in tqdm(range(2013, 2020)):
        for month in tqdm(range(1, 13), leave=False):
            # The site expects a zero-padded month; keep the loop variable an int.
            url = f'https://en.tutiempo.net/climate/{month:02d}-{year}/ws-421810.html'
            monthly_frames.append(url_to_df(url))

    # Concatenate once at the end: growing a frame inside the loop copies all
    # previously collected rows on every iteration.
    return pd.concat(monthly_frames)

In [None]:
if __name__ == '__main__':
    # Scrape all monthly pages, export them as one CSV and report the runtime.
    start = time.time()

    # Create the output folder up front: to_csv does not create missing
    # directories, and failing after the whole scrape would waste the run.
    os.makedirs('database', exist_ok=True)

    # Retrieving the Data
    df = web_data_to_dataframe()

    # Exporting the CSV
    df.to_csv('database/master_data.csv')

    print('Time Taken :', time.time()-start, 'sec.')