# Bajando datos de diferentes fuentes. Ejemplos

## Bajar datos de Bicimad

In [1]:

import requests
import zipfile
# import io
import pandas as pd

# Alternative archive on the same portal (the 201810 usage file):
# https://opendata.emtmadrid.es/getattachment/037ee8a5-b824-43b1-ac7b-76225c783993/201810_Usage_Bicimad.aspx

# Local name for the downloaded archive and the remote URL of the 201906 usage file.
filename = 'bicis.zip'
url = 'https://opendata.emtmadrid.es/getattachment/7517a650-ccdf-4ab1-b1b0-a1d13694472e/201906_Usage_Bicimad.aspx'

# Download the archive. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops here on an HTTP error so an
# error page is never saved to disk as if it were the zip file.
r = requests.get(url, allow_redirects=True, timeout=60)
r.raise_for_status()





In [2]:
# Save the downloaded bytes to disk in binary mode; the context manager closes
# the file once the write is done.
with open(filename, 'wb') as archive_out:
    archive_out.write(r.content)

# Reopen the saved archive for reading and unpack every member into the
# current working directory.
with zipfile.ZipFile(filename, 'r') as archive:
    archive.extractall(".")
    

In [3]:
# Load the extracted file as line-delimited JSON (one record per line),
# decoding it as latin-1.
datos_bicis = pd.read_json(
    '201906_Usage_Bicimad.json',
    lines=True,
    encoding='latin-1',
)


In [4]:
# Preview the first rows of the loaded usage data.
datos_bicis.head()

Unnamed: 0,_id,user_day_code,idplug_base,user_type,idunplug_base,travel_time,idunplug_station,ageRange,idplug_station,unplug_hourTime,zip_code,track
0,{'$oid': '5cf83b752f3843a016be4e2f'},e4d55deb9ac172a8d8f5f0a32599815bd51b7c8760d67e...,21,1,8,219,90,0,66,{'$date': '2019-06-01T00:00:00.000+0200'},,
1,{'$oid': '5cf83b762f3843a016be4e48'},8a0c4123e924a50a958f51985eb71aea750fb072438035...,19,1,19,359,71,4,136,{'$date': '2019-06-01T00:00:00.000+0200'},28039.0,
2,{'$oid': '5cf83b762f3843a016be4e4f'},a6a9c1f74a68496000542210abc4fc2eba79e2756ad535...,17,1,7,375,39,4,38,{'$date': '2019-06-01T00:00:00.000+0200'},28013.0,
3,{'$oid': '5cf83b762f3843a016be4e53'},5706c0bd494acc02279d532821c9666b0e506d4f81c838...,4,1,21,264,66,5,90,{'$date': '2019-06-01T00:00:00.000+0200'},28009.0,
4,{'$oid': '5cf83b762f3843a016be4e54'},eb1b6d32bd4add5d5ff91af72a38786d61075c090383a5...,3,1,13,367,152,4,166,{'$date': '2019-06-01T00:00:00.000+0200'},28006.0,


## Importando datos desde la API del INE

In [5]:
import requests
import pandas as pd
import datetime


# URL template of the INE JSON service: {codigo} is the series code and
# {num_datos} is passed as the `nult` query parameter.
url_plantilla = 'http://servicios.ine.es/wstempus/js/ES/DATOS_SERIE/{codigo}?nult={num_datos}'

# Series to query and how many of its most recent data points to request.
codigo = "IPC118150"  # CPI variation data for Andalucía
num_datos = 12  # last 12 data points

url = url_plantilla.format(codigo=codigo, num_datos=num_datos)

# Download the series. The timeout avoids hanging on a stalled connection and
# raise_for_status() reports an HTTP error directly instead of letting
# .json() fail later on an error page.
respuesta = requests.get(url, timeout=30)
respuesta.raise_for_status()

datos = respuesta.json()

print(datos)

# The series name is used later as the column label of the DataFrame.
nombre = datos['Nombre']

# Each entry of datos['Data'] is one observation of the series.
observaciones = datos['Data']

# 'Fecha' is an epoch timestamp in milliseconds. It is parsed as UTC and then
# converted to Madrid local time before taking the calendar date, e.g.
# 1451602800000 is 2015-12-31 23:00 UTC, which is 2016-01-01 in Madrid.
fecha_serie_utc = pd.to_datetime(
    [observacion['Fecha'] for observacion in observaciones],
    unit='ms',
    utc=True,
)
fecha_serie_madrid = fecha_serie_utc.tz_convert('Europe/Madrid')

# Drop the timezone and keep plain datetime.date objects for the index.
fecha_serie = [marca.tz_localize(None).date() for marca in fecha_serie_madrid]

# Values of the series, taken from the 'Valor' field of each observation.
# NOTE(review): the name `ocupados_serie` does not describe CPI data; it is
# kept because the DataFrame cell below refers to it.
ocupados_serie = [observacion['Valor'] for observacion in observaciones]

print(type(ocupados_serie))

{'COD': 'IPC118150', 'Nombre': 'Andalucía, Total, Base 2011, General, Variación mensual', 'FK_Unidad': 135, 'FK_Escala': 1, 'Data': [{'Fecha': 1451602800000, 'FK_TipoDato': 1, 'FK_Periodo': 1, 'Anyo': 2016, 'Valor': -2.0, 'Secreto': False}, {'Fecha': 1454281200000, 'FK_TipoDato': 1, 'FK_Periodo': 2, 'Anyo': 2016, 'Valor': -0.4, 'Secreto': False}, {'Fecha': 1456786800000, 'FK_TipoDato': 1, 'FK_Periodo': 3, 'Anyo': 2016, 'Valor': 0.8, 'Secreto': False}, {'Fecha': 1459461600000, 'FK_TipoDato': 1, 'FK_Periodo': 4, 'Anyo': 2016, 'Valor': 0.8, 'Secreto': False}, {'Fecha': 1462053600000, 'FK_TipoDato': 1, 'FK_Periodo': 5, 'Anyo': 2016, 'Valor': 0.6, 'Secreto': False}, {'Fecha': 1464732000000, 'FK_TipoDato': 1, 'FK_Periodo': 6, 'Anyo': 2016, 'Valor': 0.4, 'Secreto': False}, {'Fecha': 1467324000000, 'FK_TipoDato': 1, 'FK_Periodo': 7, 'Anyo': 2016, 'Valor': -0.9, 'Secreto': False}, {'Fecha': 1470002400000, 'FK_TipoDato': 1, 'FK_Periodo': 8, 'Anyo': 2016, 'Valor': 0.1, 'Secreto': False}, {'Fecha'

In [6]:
# Build a pandas DataFrame from the series values, indexed by date, with a
# single column labelled with the series name.

import pandas as pd

tabla = pd.DataFrame(
    data=ocupados_serie,
    index=fecha_serie,
    columns=[nombre],
)

In [7]:
# Display the resulting table.
tabla

Unnamed: 0,"Andalucía, Total, Base 2011, General, Variación mensual"
2016-01-01,-2.0
2016-02-01,-0.4
2016-03-01,0.8
2016-04-01,0.8
2016-05-01,0.6
2016-06-01,0.4
2016-07-01,-0.9
2016-08-01,0.1
2016-09-01,0.2
2016-10-01,1.2


## Web scraping de IMDB

In [8]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Download IMDB's Top 250 chart and parse the HTML.
url = 'http://www.imdb.com/chart/top'
response = requests.get(url, timeout=30)
response.raise_for_status()  # do not try to scrape an HTTP error page
soup = BeautifulSoup(response.text, 'lxml')

# Parallel lists, one entry per chart row, selected with CSS selectors.
movies = soup.select('td.titleColumn')
title_anchors = soup.select('td.titleColumn a')
links = [a.attrs.get('href') for a in title_anchors]
crew = [a.attrs.get('title') for a in title_anchors]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
# NOTE(review): in the recorded output the "vote" column is empty for every
# row, so this selector/attribute pair apparently yields None. Confirm against
# the page markup which element actually carries the vote count.
votes = [b.attrs.get('data-value') for b in soup.select('td.ratingColumn strong')]


def _split_title_cell(cell_text):
    """Split the text of a title cell into (place, title, year) strings.

    The cell text is expected to look like "1. Cadena perpetua (1994)" once
    whitespace is collapsed. The place is everything before the first dot and
    the year is the content of the last parenthesised group, so dots or
    parentheses inside the title itself are preserved. This also avoids
    deriving the rank width from the 0-based loop index, which produced a
    truncated place and a malformed title on rows 10 and 100.
    """
    text = ' '.join(cell_text.split())
    place, _, remainder = text.partition('.')
    title, _, year = remainder.rpartition('(')
    return place.strip(), title.strip(), year.rstrip(')').strip()


imdb = []

# Store each row as a dictionary, then collect the dictionaries in a list.
for index, cell in enumerate(movies):
    place, movie_title, year = _split_title_cell(cell.get_text())
    imdb.append({"movie_title": movie_title,
                 "year": year,
                 "place": place,
                 "star_cast": crew[index],
                 "rating": ratings[index],
                 "vote": votes[index],
                 "link": links[index]})
df = pd.DataFrame(imdb)
df


#for item in imdb:
#    print(item['place'], '-', item['movie_title'], '('+item['year']+') -', 'Starring:', item['star_cast'])

Unnamed: 0,movie_title,year,place,star_cast,rating,vote,link
0,Cadena perpetua,1994,1,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",9.222728413795364,,/title/tt0111161/
1,El padrino,1972,2,"Francis Ford Coppola (dir.), Marlon Brando, Al...",9.14886412325818,,/title/tt0068646/
2,El padrino: Parte II,1974,3,"Francis Ford Coppola (dir.), Al Pacino, Robert...",8.981223713635329,,/title/tt0071562/
3,El caballero oscuro,2008,4,"Christopher Nolan (dir.), Christian Bale, Heat...",8.973391623827071,,/title/tt0468569/
4,12 hombres sin piedad,1957,5,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",8.930434523518265,,/title/tt0050083/
...,...,...,...,...,...,...,...
245,Terminator,1984,246,"James Cameron (dir.), Arnold Schwarzenegger, L...",8.009034585128507,,/title/tt0088247/
246,Mandarinas,2013,247,"Zaza Urushadze (dir.), Lembit Ulfsak, Elmo Nüg...",8.008325813651696,,/title/tt2991224/
247,Aladdín,1992,248,"Ron Clements (dir.), Scott Weinger, Robin Will...",8.00742958808094,,/title/tt0103639/
248,"Swades: We, the People",2004,249,"Ashutosh Gowariker (dir.), Shah Rukh Khan, Gay...",8.00719129464659,,/title/tt0367110/


## Accediendo a datos de Twitter

In [9]:
#pip install tweepy

import tweepy  
import time
import csv

import os

# These tokens and keys must be generated from Twitter: https://apps.twitter.com
# https://developer.twitter.com/
# A developer account for applications that use Twitter is required.
# The tokens and keys expire after some time (Twitter warns about it when
# they are created).
#
# Credentials are read from environment variables so real secrets never have
# to be typed into (and saved with) the notebook. The "xxx" placeholders are
# kept as fallbacks, so the cell still runs when the variables are not set.
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "xxx")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "xxx")
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "xxx")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "xxx")

# Build the OAuth handler and the API client. wait_on_rate_limit=True is
# passed so the client waits when the rate limit is hit.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

In [10]:
# Append (timestamp, text) for every tweet matching the search to cop25.csv.
# - The `with` block closes the file even if the search raises midway, so
#   buffered rows are flushed and the handle is not leaked.
# - newline='' is what the csv module expects for files it writes to.
# - The file is opened as UTF-8 and the text is written as a str; encoding it
#   to bytes first would store the literal b'...' representation in the CSV.
with open('cop25.csv', 'a', newline='', encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)

    for tweet in tweepy.Cursor(api.search, q="#cop25", count=5000,
                               lang="en",
                               since="2019-10-12").items():
        print(tweet.created_at, tweet.text)
        csvWriter.writerow([tweet.created_at, tweet.text])

2020-10-01 01:26:56 RT @ATD4thWorld: #30daysforEcoSocialJustice #Day14

Last December’s #COP25 was one of the many United Nations events at which ATD Fourth Wo…
2020-10-01 00:35:17 RT @Alex_Verbeek: 🌎 

NATO Review just published my article:

Planetary Security, the security implications of climate change. 

RTs apprec…
2020-09-30 23:55:51 RT @Alex_Verbeek: 🌎 

NATO Review just published my article:

Planetary Security, the security implications of climate change. 

RTs apprec…
2020-09-30 23:09:47 RT @Onalytica: @GeraldKutney Hey, we've conducted some new research into #COP25 analysing the key topics &amp; influencers shaping the #Climate…
2020-09-30 23:02:34 @qikipedia shame the rest of Poland isn't more considerate about pollution:
#AirQuality #COP25 #COP26 https://t.co/YdOaS0UZeY
2020-09-30 18:41:55 RT @Alex_Verbeek: 🌎 

NATO Review just published my article:

Planetary Security, the security implications of climate change. 

RTs apprec…
2020-09-30 18:41:05 RT @Alex_Verbeek: 🌎 

NAT

In [11]:
# Append (timestamp, text) for every tweet matching the search to
# final_balonmano.csv.
# - The `with` block closes the file even if the search raises midway, so
#   buffered rows are flushed and the handle is not leaked.
# - newline='' is what the csv module expects for files it writes to.
# - The file is opened as UTF-8 and the text is written as a str; encoding it
#   to bytes first would store the literal b'...' representation in the CSV.
with open('final_balonmano.csv', 'a', newline='', encoding='utf-8') as csvFile:
    csvWriter = csv.writer(csvFile)

    for tweet in tweepy.Cursor(api.search, q="#GuerrerasTVE", count=5000,
                               lang="es",
                               since="2019-10-12").items():
        print(tweet.created_at, tweet.text)
        csvWriter.writerow([tweet.created_at, tweet.text])

## Bajando datos de la calidad del aire

También existe un dataset en Kaggle, pero aquí vamos a ver un ejemplo leyendo los datos directamente desde la URL del portal de datos abiertos.

In [12]:

import pandas as pd
import requests
import io
import csv

# Remote text file with the air-quality data, read straight from its URL.
url2 = "http://www.mambiente.munimadrid.es/opendata/horario.txt"

# header=None is passed so the first line is treated as data; pandas then
# labels the columns with integers.
datos = pd.read_csv(filepath_or_buffer=url2, header=None)




In [13]:
# Display the loaded air-quality table.
datos

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,28,79,4,1,38,2,2020,10,1,8.0,...,0,N,0,N,0,N,0,N,0,N
1,28,79,4,6,48,2,2020,10,1,0.1,...,0,N,0,N,0,N,0,N,0,N
2,28,79,4,7,8,2,2020,10,1,1.0,...,0,N,0,N,0,N,0,N,0,N
3,28,79,4,8,8,2,2020,10,1,18.0,...,0,N,0,N,0,N,0,N,0,N
4,28,79,4,12,8,2,2020,10,1,20.0,...,0,N,0,N,0,N,0,N,0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,28,79,60,7,8,2,2020,10,1,1.0,...,0,N,0,N,0,N,0,N,0,N
150,28,79,60,8,8,2,2020,10,1,28.0,...,0,N,0,N,0,N,0,N,0,N
151,28,79,60,10,47,2,2020,10,1,11.0,...,0,N,0,N,0,N,0,N,0,N
152,28,79,60,12,8,2,2020,10,1,30.0,...,0,N,0,N,0,N,0,N,0,N
