In [1]:
# Improvements:
# - source file -> avoid duplicated processing due to multiple tags

In [2]:
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import ConnectionError
import configparser
import pathlib
from collections import defaultdict
import pandas as pd
import re
import unicodedata
import nltk
from nltk import SnowballStemmer
from langdetect import detect
import html2text
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/sofia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
stop_words = set(stopwords.words("spanish"))
stop_words

{'a',
 'al',
 'algo',
 'algunas',
 'algunos',
 'ante',
 'antes',
 'como',
 'con',
 'contra',
 'cual',
 'cuando',
 'de',
 'del',
 'desde',
 'donde',
 'durante',
 'e',
 'el',
 'ella',
 'ellas',
 'ellos',
 'en',
 'entre',
 'era',
 'erais',
 'eran',
 'eras',
 'eres',
 'es',
 'esa',
 'esas',
 'ese',
 'eso',
 'esos',
 'esta',
 'estaba',
 'estabais',
 'estaban',
 'estabas',
 'estad',
 'estada',
 'estadas',
 'estado',
 'estados',
 'estamos',
 'estando',
 'estar',
 'estaremos',
 'estará',
 'estarán',
 'estarás',
 'estaré',
 'estaréis',
 'estaría',
 'estaríais',
 'estaríamos',
 'estarían',
 'estarías',
 'estas',
 'este',
 'estemos',
 'esto',
 'estos',
 'estoy',
 'estuve',
 'estuviera',
 'estuvierais',
 'estuvieran',
 'estuvieras',
 'estuvieron',
 'estuviese',
 'estuvieseis',
 'estuviesen',
 'estuvieses',
 'estuvimos',
 'estuviste',
 'estuvisteis',
 'estuviéramos',
 'estuviésemos',
 'estuvo',
 'está',
 'estábamos',
 'estáis',
 'están',
 'estás',
 'esté',
 'estéis',
 'estén',
 'estés',
 'fue',
 'f

In [4]:
class WebSitesConfigParser(): # naive class    
    def __init__(self, file_path):
        self.config_file = file_path
        self.sections = defaultdict(list)
        
    def __getitem__(self, index):
        return self.sections[index]
    
    def read(self):
        with open(self.config_file) as cf:
            lines = cf.readlines()
        for idx, line in enumerate(lines):
            if line.startswith("["):
                section_name = line.strip('\n[]')
                continue
            self.sections[section_name].append(line.strip('\n'))
    
    def get_content(self):
        return self.sections
        
    def get_sections(self):
        return list(self.sections.keys())

In [5]:
def get_web_content(url, label):
    '''
    save website content to a .txt file
    '''
    print("Gathering content...")
    try:
        adapter = HTTPAdapter(max_retries=3)
        session = requests.Session()
        session.mount(url, adapter)
        url_res = session.get(url).text
    except requests.TooManyRedirects as tmr:
        raise(tmr)
    except ConnectionError as ce:
        raise(ce)
    else:
        html = url_res
        text = html2text.html2text(html)
        file = open("../data/raw/site_{c}.txt".format(c=label), 'wb+')
        striped_text = text.strip('/n')
        file.write(striped_text.encode('utf-8'))
        file.close()
    return text

# Stop words removal purposes: 
# - Allows analyze only relevant terms
# - Data dimension reduction
def format_web_content(content):
    '''
    - translate accent letters to 'non-accent' letters
    - remove non-content words/expressions (html stuff...)
    - filter out spanish stop-words
    - filter out (some of the) non-spanish words
    
    return: String
    '''
    print("Formatting content...")
    a,b = 'áéíóúü','aeiouu'
    trans = str.maketrans(a,b)
    stop_words = set(stopwords.words("spanish"))
    content = content.translate(trans)    
    tokens = re.findall(r"[a-zA-Z]{3,}", content)
    filtered_sentence = [w.lower() for w in tokens if w.lower() not in stop_words]    
    spanish = [word for word in filtered_sentence if detect(word) == 'es']
    return ' '.join(spanish)

def build_data_set(dict_):
    if not isinstance(dict_, dict):
        type_ = type(dict_)
        raise TypeError("Dictionary expected, {} found".format(type_))
    df = pd.DataFrame()
    c = 0
    for key in dict_.keys():
        print("Iterating over [{}] websites".format(key))
        for url in dict_[key]:
            try:
                if (url == ''):
                    continue
                print("\t\t ---> {}".format(url))
                content = format_web_content(get_web_content(url, c))
                entry = {'manually_assigned_tag': key, 'url': url, 'content': content}
                df = df.append(pd.DataFrame([entry], columns=entry.keys()))                
                c+=1
            except requests.TooManyRedirects:
                print('Too many redirects exception raised. Ignoring website...')
                continue
            except ConnectionError:
                print('Max retries exceeded. Ignoring website...')
                continue
    return df

In [6]:
! pwd

/home/sofia/Desktop/MyRepositories/topic_modeling/src


In [7]:
! ls 

dataset.ipynb  preprocess.ipynb  topic_modeling.ipynb  Untitled.ipynb
__init__.py    Stemmer.py	 untitled1.txt	       untitled.txt


In [8]:
parser = WebSitesConfigParser("../data/source/websites.txt")
parser.read()

In [9]:
parser.get_content()

defaultdict(list,
            {'supermarket': ['https://www.walmart.com.ar',
              'https://www.makro.com.ar/ofertas',
              'https://www.alvearsupermercados.com.ar/ofertas/',
              'https://www.cotodigital3.com.ar/sitios/cdigi/browse?_dyncharset=utf-8&Dy=1&Nty=1&Ntk=All%7Cproduct.sDisp_200&Ntt=OFESEMANAL020320%7C1004',
              ''],
             'telephone': ['https://www.personal.com.ar/',
              'https://www.movistar.com.ar/',
              'https://www.tuenti.com.ar',
              'https://www.claro.com.ar',
              ''],
             'cellphone': ['https://www.personal.com.ar/',
              'https://www.movistar.com.ar/',
              'https://www.tuenti.com.ar',
              'https://www.claro.com.ar',
              ''],
             'news': ['https://www.clarin.com/',
              'https://www.diariouno.com.ar/',
              'https://www.infobae.com',
              'https://www.lanacion.com.ar',
              'https://www.pagina12

In [11]:
myDataSet = build_data_set(parser.sections)

Iterating over [supermarket] websites
		 ---> https://www.walmart.com.ar
Gathering content...
Formatting content...
		 ---> https://www.makro.com.ar/ofertas
Gathering content...
Formatting content...
		 ---> https://www.alvearsupermercados.com.ar/ofertas/
Gathering content...
Formatting content...
		 ---> https://www.cotodigital3.com.ar/sitios/cdigi/browse?_dyncharset=utf-8&Dy=1&Nty=1&Ntk=All%7Cproduct.sDisp_200&Ntt=OFESEMANAL020320%7C1004
Gathering content...
Formatting content...
Iterating over [telephone] websites
		 ---> https://www.personal.com.ar/
Gathering content...
Formatting content...
		 ---> https://www.movistar.com.ar/
Gathering content...
Formatting content...
		 ---> https://www.tuenti.com.ar
Gathering content...
Formatting content...
		 ---> https://www.claro.com.ar
Gathering content...
Formatting content...
Iterating over [cellphone] websites
		 ---> https://www.personal.com.ar/
Gathering content...
Formatting content...
		 ---> https://www.movistar.com.ar/
Gathering c

In [12]:
myDataSet.head(20)

Unnamed: 0,manually_assigned_tag,url,content
0,supermarket,https://www.walmart.com.ar,nueva nueva correctamente puede cerrar lacteos...
0,supermarket,https://www.makro.com.ar/ofertas,logo entra cuenta quiero fornecedor sustentabi...
0,supermarket,https://www.alvearsupermercados.com.ar/ofertas/,alvearsupermercados logo blanco alvearsupermer...
0,supermarket,https://www.cotodigital3.com.ar/sitios/cdigi/b...,experiencia descuentos descuentos comparacione...
0,telephone,https://www.personal.com.ar/,micuenta destinos cuenta micuenta cuenta cuota...
0,telephone,https://www.movistar.com.ar/,cancelar productos servicios productos servici...
0,telephone,https://www.tuenti.com.ar,contenido logo esperas quiero quiero quiero qu...
0,telephone,https://www.claro.com.ar,personas personas claro clarovideo claro claro...
0,cellphone,https://www.personal.com.ar/,micuenta destinos cuenta micuenta cuenta finan...
0,cellphone,https://www.movistar.com.ar/,cancelar productos servicios productos servici...


In [13]:
myDataSet.to_csv('../data/interim/dataset_v2.csv')