In [1]:
import os
import requests
import json
import re
import time
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from urllib.parse import urlparse, parse_qs

base_dir = "vuelax"
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

In [None]:
oportunidades_url = "http://www.vuelax.com/category/oportunidades/page/%d/"

In [None]:
content = []

for page in range(1, 10000):
    url = oportunidades_url % page
    op_page = requests.get(url)
    if page % 10 == 0:
        print("Requesting", url)
    if op_page.status_code != 200:
        break
    op_soup = BeautifulSoup(op_page.text, "lxml")
    main_ul = op_soup.find("ul", {"class":"penci-grid"})
    articles = main_ul.findAll("article", {"class":"item"})
    for article in articles:
        grid_title = article.find("h2", {"class":"grid-title"})
        a = grid_title.find("a")
        grid_post_box_meta = article.find("div", {"class":"grid-post-box-meta"})
        content.append([a.text, a.get('href'), grid_post_box_meta.text.strip()])

data = pd.DataFrame(content, columns= ["label", "url", "date"])
print(data.head())
print(data.info())

data.to_csv(join(base_dir, "original.csv"))

In [2]:
data = pd.read_csv(join(base_dir, "original.csv"), index_col=0)

In [3]:
location_regex = re.compile('([\w0-9,\s\.]+) [a|A] ([\w0-9,\s\.]+)\s*[-|–|"desde"|"DESDE"]\s*\$([0-9\.,]+)')


clean_values = []
non_clean_values = []

for index, row in data.iterrows():
    label = row['label']
    find = location_regex.search(label)
    if find:
        de = find.group(1)
        a = find.group(2)
        por = find.group(3)
        clean_values.append([de, a, por, row["url"], row["date"]])
    else:
        non_clean_values.append(row.values)

clean = pd.DataFrame(clean_values, columns= ["origin", "destination", "price", "url", "date"])
still_dirty_df = pd.DataFrame(non_clean_values, columns= ["label", "url", "date"])



print("== Clean ==")
print(clean.head())
print(clean.info())
clean.to_csv(join(base_dir, "clean.csv"))
print()
print("== Dirty ==")
print(still_dirty_df.head())
print(still_dirty_df.info())
still_dirty_df.to_csv(join(base_dir, "still_dirty.csv"))

== Clean ==
     origin  destination   price  \
0      CDMX       Tokyo   10,972   
1      CDMX        Lima    5,059   
2       CUN     Bélgica    9,731   
3    Canadá    Islandia    4,425   
4  Islandia  Inglaterra    1,156   

                                                 url            date  
0  http://www.vuelax.com/2018/01/14/cdmx-a-tokyo-...  enero 14, 2018  
1  http://www.vuelax.com/2018/01/13/cdmx-a-lima-5...  enero 13, 2018  
2  http://www.vuelax.com/2018/01/13/cun-a-belgica...  enero 13, 2018  
3  http://www.vuelax.com/2018/01/12/canada-a-isla...  enero 12, 2018  
4  http://www.vuelax.com/2018/01/12/islandia-a-in...  enero 12, 2018  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255 entries, 0 to 1254
Data columns (total 5 columns):
origin         1255 non-null object
destination    1255 non-null object
price          1255 non-null object
url            1255 non-null object
date           1255 non-null object
dtypes: object(5)
memory usage: 49.1+ KB
None

== Dirty ==


In [4]:
still_dirty_df = pd.read_csv(join(base_dir, "still_dirty.csv"), index_col = 0)
print(still_dirty_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137 entries, 0 to 136
Data columns (total 3 columns):
label    137 non-null object
url      137 non-null object
date     137 non-null object
dtypes: object(3)
memory usage: 4.3+ KB
None


In [5]:
location_regex_note = re.compile('([\w0-9,\s\.]+) [a|A] ([\w0-9,\s\.]+)\s*\(([\w\s]+)\)\s*[-|–|"desde"|"DESDE"]\s*\$([0-9\.,]+)')


clean_values = []
non_clean_values = []

for index, row in still_dirty_df.iterrows():
    label = row['label']
    find = location_regex_note.search(label)
    if find:
        de = find.group(1)
        a = find.group(2)
        note = find.group(3)
        por = find.group(4)
        clean_values.append([de, a, por, note, row["url"], row["date"]])
    else:
        non_clean_values.append(row.values)


clean2 = pd.DataFrame(clean_values, columns= ["origin", "destination", "price", "note", "url", "date"])
print(clean2.info())

still_dirty_df = pd.DataFrame(non_clean_values, columns= ["label", "url", "date"])
print(still_dirty_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 6 columns):
origin         11 non-null object
destination    11 non-null object
price          11 non-null object
note           11 non-null object
url            11 non-null object
date           11 non-null object
dtypes: object(6)
memory usage: 608.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126 entries, 0 to 125
Data columns (total 3 columns):
label    126 non-null object
url      126 non-null object
date     126 non-null object
dtypes: object(3)
memory usage: 3.0+ KB
None


In [6]:
clean = pd.read_csv(join(base_dir, "clean.csv"), index_col = 0)
clean = pd.concat([clean, clean2])

print("== Clean ==")
print(clean.head())
print(clean.info())
clean.to_csv(join(base_dir, "clean.csv"))
print()
print("== Dirty ==")
print(still_dirty_df.head())
print(still_dirty_df.info())
still_dirty_df.to_csv(join(base_dir, "still_dirty.csv"))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1266 entries, 0 to 10
Data columns (total 6 columns):
date           1266 non-null object
destination    1266 non-null object
note           11 non-null object
origin         1266 non-null object
price          1266 non-null object
url            1266 non-null object
dtypes: object(6)
memory usage: 69.2+ KB
== Clean ==
             date  destination note    origin   price  \
0  enero 14, 2018       Tokyo   NaN      CDMX  10,972   
1  enero 13, 2018        Lima   NaN      CDMX   5,059   
2  enero 13, 2018     Bélgica   NaN       CUN   9,731   
3  enero 12, 2018    Islandia   NaN    Canadá   4,425   
4  enero 12, 2018  Inglaterra   NaN  Islandia   1,156   

                                                 url  
0  http://www.vuelax.com/2018/01/14/cdmx-a-tokyo-...  
1  http://www.vuelax.com/2018/01/13/cdmx-a-lima-5...  
2  http://www.vuelax.com/2018/01/13/cun-a-belgica...  
3  http://www.vuelax.com/2018/01/12/canada-a-isla...  
4  http://w

In [12]:
still_dirty_df = pd.read_csv(join(base_dir, "still_dirty.csv"), index_col = 0)
print(still_dirty_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126 entries, 0 to 125
Data columns (total 3 columns):
label    126 non-null object
url      126 non-null object
date     126 non-null object
dtypes: object(3)
memory usage: 3.9+ KB
None


In [13]:
location_regex_note = re.compile('¡([\w0-9,\s\.]+) [a|A] ([\w0-9,\s\.]+)!\s[-|–|"desde"|"DESDE"]\s*\$([0-9\.,]+)')


clean_values = []
non_clean_values = []

for index, row in still_dirty_df.iterrows():
    label = row['label']
    find = location_regex_note.search(label)
    if find:
        de = find.group(1)
        a = find.group(2)
        por = find.group(3)
        clean_values.append([de, a, por, row["url"], row["date"]])
    else:
        non_clean_values.append(row.values)


clean2 = pd.DataFrame(clean_values, columns= ["origin", "destination", "price", "url", "date"])
print(clean2.info())

still_dirty_df = pd.DataFrame(non_clean_values, columns= ["label", "url", "date"])
print(still_dirty_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
origin         42 non-null object
destination    42 non-null object
price          42 non-null object
url            42 non-null object
date           42 non-null object
dtypes: object(5)
memory usage: 1.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 3 columns):
label    84 non-null object
url      84 non-null object
date     84 non-null object
dtypes: object(3)
memory usage: 2.0+ KB
None


In [14]:
clean = pd.read_csv(join(base_dir, "clean.csv"), index_col = 0)
clean = pd.concat([clean, clean2])

print("== Clean ==")
print(clean.head())
print(clean.info())
clean.to_csv(join(base_dir, "clean.csv"))
print()
print("== Dirty ==")
print(still_dirty_df.head())
print(still_dirty_df.info())
still_dirty_df.to_csv(join(base_dir, "still_dirty.csv"))

== Clean ==
             date  destination note    origin   price  \
0  enero 14, 2018       Tokyo   NaN      CDMX  10,972   
1  enero 13, 2018        Lima   NaN      CDMX   5,059   
2  enero 13, 2018     Bélgica   NaN       CUN   9,731   
3  enero 12, 2018    Islandia   NaN    Canadá   4,425   
4  enero 12, 2018  Inglaterra   NaN  Islandia   1,156   

                                                 url  
0  http://www.vuelax.com/2018/01/14/cdmx-a-tokyo-...  
1  http://www.vuelax.com/2018/01/13/cdmx-a-lima-5...  
2  http://www.vuelax.com/2018/01/13/cun-a-belgica...  
3  http://www.vuelax.com/2018/01/12/canada-a-isla...  
4  http://www.vuelax.com/2018/01/12/islandia-a-in...  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1308 entries, 0 to 41
Data columns (total 6 columns):
date           1308 non-null object
destination    1308 non-null object
note           11 non-null object
origin         1308 non-null object
price          1308 non-null object
url            1308 non-null obje

In [20]:
clean = pd.read_csv(join(base_dir, "clean.csv"), index_col = 0)

strip_blanks = lambda x: x.strip()
strip_dot = lambda x: x.strip('.')

print(clean[['origin','destination','price']].tail(30))
clean.origin = clean.origin.apply(strip_blanks)
clean.destination = clean.destination.apply(strip_blanks)
clean.price = clean.price.apply(strip_dot)
print(clean[['origin','destination','price']].tail(30))


clean.to_csv(join(base_dir, "clean.csv"))

                         origin                       destination    price
12              CDMX, GDL y MTY         Whitehorse, Yukón, Canadá   12,611
13                         CDMX  El Calafate, Patagonia Argentina   10,829
14       CDMX y 23 ciudades más                 San Francisco, CA    3,795
15       CDMX y 23 ciudades más                   Toronto, Canadá   8,486.
16                         CDMX                 Santa Clara, Cuba    4,666
17       CDMX y 23 ciudades más                  Montreal, Canadá   8,367.
18                         CDMX                            Madrid  11,866.
19                         CDMX                         Barcelona  11,921.
20         CDMX, MTY, GDL y CUN                  El Cairo, Egipto  10,038.
21                         CDMX             San Juan, Puerto Rico    4,292
22                         CDMX                    Beirut, Líbano  13,219.
23                   CDMX y GDL         San Francisco, California    3,970
24  CDMX, GDL y 22 ciudad

In [21]:
clean = pd.read_csv(join(base_dir, "clean.csv"), index_col = 0)