# Debut de la formation

## APIs Scraping
-Api = Outils et methodes qui autorisent à des applications d'interragir avec elles

-Data dynamique utilisable par les devs

-Mise à disposition par les entreprises

## Requete API

In [28]:
import requests

In [29]:
###Requete GET

# Requete pour obtenir la derniere position de la station ISS depuis l'API OpenNotify

response = requests.get("http://api.open.notify.org/iss-now.json") #iss-now.json -> donne le latitude et longetude de la sation

### Code status

In [30]:
response

<Response [200]>

#### 200 - Tout est ok et le serveur retourne le bon resultat

In [31]:
status_code = response.status_code

In [32]:
print(status_code)

200


#### 301 - Le serveur redirige vers un autre parametre

#### 400 - Mauvaise requete

In [33]:
#exemple
response = requests.get("http://api.open-notify.org/iss-pass.json")
status_response = response.status_code
print(status_response)

400


#### 401 - Mauvaise identification

#### 403 - Pas d'autorisation d'acces

#### 404 - le serveur ne trouve pas le ressource

In [34]:
#exemple
response = requests.get("http://api.open-notify.org/iss-pass")
status_response = response.status_code
print(status_response)

404


### Paramètre de Requete

In [35]:
# Latitude et longitude Paris
param = {"lat":48.87, "lon":2.33}
#http://api.open-notify.org/iss-pass.json?Lat=48.87&Lon=2.33 == Pareil
response = requests.get("http://api.open-notify.org/iss-pass.json", params = param)

In [36]:
content = response.content #on recupere le contenue
print(content)

b'{\n  "message": "success", \n  "request": {\n    "altitude": 100, \n    "datetime": 1552733953, \n    "latitude": 48.87, \n    "longitude": 2.33, \n    "passes": 5\n  }, \n  "response": [\n    {\n      "duration": 438, \n      "risetime": 1552768116\n    }, \n    {\n      "duration": 626, \n      "risetime": 1552773768\n    }, \n    {\n      "duration": 644, \n      "risetime": 1552779548\n    }, \n    {\n      "duration": 641, \n      "risetime": 1552785358\n    }, \n    {\n      "duration": 641, \n      "risetime": 1552791157\n    }\n  ]\n}\n'


In [37]:
# Latitude et longitude San Fransisco
param = {"lat":37.78, "lon":-122.41}
response = requests.get("http://api.open-notify.org/iss-pass.json", params = param)

In [38]:
content = response.content
print(content)

b'{\n  "message": "success", \n  "request": {\n    "altitude": 100, \n    "datetime": 1552733957, \n    "latitude": 37.78, \n    "longitude": -122.41, \n    "passes": 5\n  }, \n  "response": [\n    {\n      "duration": 591, \n      "risetime": 1552735654\n    }, \n    {\n      "duration": 637, \n      "risetime": 1552741438\n    }, \n    {\n      "duration": 358, \n      "risetime": 1552747330\n    }, \n    {\n      "duration": 521, \n      "risetime": 1552795684\n    }, \n    {\n      "duration": 642, \n      "risetime": 1552801393\n    }\n  ]\n}\n'


### Format JSON

Librairie json :

- dumps -- prend un objet et retourne un str
- loads -- prend un str Json et retourne objet (list, dic ...)

In [41]:
#exemple
sports = ["Tennis", "foot", "triathlon"]
print(sports)

['Tennis', 'foot', 'triathlon']


In [42]:
print(type(sports))

<class 'list'>


In [72]:
#importer lib json
import json

In [46]:
# Methode de conversion list to str
sports_str = json.dumps(sports)
print(sports_str)
print(type(sports_str))

["Tennis", "foot", "triathlon"]
<class 'str'>


In [47]:
#Methode str to lst
sport2 = json.loads(sports_str)
print(sport2)
print(type(sport2))

['Tennis', 'foot', 'triathlon']
<class 'list'>


#### Training

In [48]:
#dict nombre de licencier par sport en Fr
sports_number = {
    "Football": 1962241,
    "Tennis": 1039337,
    "Equitation": 663194,
    "Basketball": 641367
}

In [55]:
nb_lic_str = json.dumps(sports_number)
print(type(nb_lic_str))

<class 'str'>


In [56]:
nb_lic_dic = json.loads(nb_lic_str)
print(type(nb_lic_dic))

<class 'dict'>


### Obtenir un JSON depuis une requete

In [59]:
# Requete Paris
parameters = {'lat': 48.87, 'lon': 2.33}
response = requests.get("http://api.open-notify.org/iss-pass.json", params=parameters)

In [61]:
# Obtenir un objet python
json_data = response.json()
print(type(json_data))
print(json_data)

<class 'dict'>
{'message': 'success', 'request': {'altitude': 100, 'datetime': 1552733953, 'latitude': 48.87, 'longitude': 2.33, 'passes': 5}, 'response': [{'duration': 438, 'risetime': 1552768116}, {'duration': 626, 'risetime': 1552773768}, {'duration': 644, 'risetime': 1552779548}, {'duration': 641, 'risetime': 1552785358}, {'duration': 641, 'risetime': 1552791157}]}


In [64]:
first_pass_duration = json_data['response'][0]['duration']
print(first_pass_duration)

438


### Type de contenu

In [65]:
# .headers
print(response.headers)

{'Server': 'nginx/1.10.3', 'Date': 'Sat, 16 Mar 2019 11:21:28 GMT', 'Content-Type': 'application/json', 'Content-Length': '518', 'Connection': 'keep-alive', 'Via': '1.1 vegur'}


In [66]:
content_type = response.headers['Content-type']
print(content_type)

application/json


### Trouver le nb de personne dans l'espace

In [67]:
# Appel de l'api - stock dans response
response = requests.get('http://api.open-notify.org/astros.json')

In [76]:
print(type(response))

<class 'requests.models.Response'>


In [77]:
# recuperer les data json
nb_space_json = response.json()

In [78]:
print(type(nb_space_json))

<class 'dict'>


In [79]:
print(nb_space_json)

{'message': 'success', 'number': 6, 'people': [{'craft': 'ISS', 'name': 'Oleg Kononenko'}, {'craft': 'ISS', 'name': 'David Saint-Jacques'}, {'craft': 'ISS', 'name': 'Anne McClain'}, {'craft': 'ISS', 'name': 'Alexey Ovchinin'}, {'craft': 'ISS', 'name': 'Nick Hague'}, {'craft': 'ISS', 'name': 'Christina Koch'}]}


In [83]:
# recuperer la bonne data dans le dic json
nb_in_space = nb_space_json['number']

In [84]:
print(nb_in_space)

6


## Authentification sur API

In [85]:
import requests

In [86]:
headers = {"Authorization":"token 2a92933d751142502718d3023353c828ac778b97"}

In [94]:
response = requests.get("http://api.github.com/users/Orer0", headers=headers)

In [95]:
print(response.json())

{'login': 'Orer0', 'id': 26159661, 'node_id': 'MDQ6VXNlcjI2MTU5NjYx', 'avatar_url': 'https://avatars1.githubusercontent.com/u/26159661?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/Orer0', 'html_url': 'https://github.com/Orer0', 'followers_url': 'https://api.github.com/users/Orer0/followers', 'following_url': 'https://api.github.com/users/Orer0/following{/other_user}', 'gists_url': 'https://api.github.com/users/Orer0/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/Orer0/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/Orer0/subscriptions', 'organizations_url': 'https://api.github.com/users/Orer0/orgs', 'repos_url': 'https://api.github.com/users/Orer0/repos', 'events_url': 'https://api.github.com/users/Orer0/events{/privacy}', 'received_events_url': 'https://api.github.com/users/Orer0/received_events', 'type': 'User', 'site_admin': False, 'name': 'Aurelien Roblin', 'company': 'Student @ 42 Paris', 'blog': '', 'location': 'Paris',

### Autre points d'accès

In [98]:
response = requests.get("http://api.github.com/users/huandu", headers=headers)
huandu = response.json()
print(huandu)

{'login': 'huandu', 'id': 239739, 'node_id': 'MDQ6VXNlcjIzOTczOQ==', 'avatar_url': 'https://avatars1.githubusercontent.com/u/239739?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/huandu', 'html_url': 'https://github.com/huandu', 'followers_url': 'https://api.github.com/users/huandu/followers', 'following_url': 'https://api.github.com/users/huandu/following{/other_user}', 'gists_url': 'https://api.github.com/users/huandu/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/huandu/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/huandu/subscriptions', 'organizations_url': 'https://api.github.com/users/huandu/orgs', 'repos_url': 'https://api.github.com/users/huandu/repos', 'events_url': 'https://api.github.com/users/huandu/events{/privacy}', 'received_events_url': 'https://api.github.com/users/huandu/received_events', 'type': 'User', 'site_admin': False, 'name': 'Huan Du', 'company': 'R Lab @DiDi', 'blog': '', 'location': 'Beijing, China

In [99]:
response = requests.get("https://api.github.com/orgs/facebook", headers=headers)
facebook = response.json()

In [100]:
print(facebook)

{'login': 'facebook', 'id': 69631, 'node_id': 'MDEyOk9yZ2FuaXphdGlvbjY5NjMx', 'url': 'https://api.github.com/orgs/facebook', 'repos_url': 'https://api.github.com/orgs/facebook/repos', 'events_url': 'https://api.github.com/orgs/facebook/events', 'hooks_url': 'https://api.github.com/orgs/facebook/hooks', 'issues_url': 'https://api.github.com/orgs/facebook/issues', 'members_url': 'https://api.github.com/orgs/facebook/members{/member}', 'public_members_url': 'https://api.github.com/orgs/facebook/public_members{/member}', 'avatar_url': 'https://avatars3.githubusercontent.com/u/69631?v=4', 'description': 'We are working to build community through open source technology. NB: members must have two-factor auth.', 'name': 'Facebook', 'company': None, 'blog': 'https://opensource.fb.com', 'location': 'Menlo Park, California', 'email': '', 'is_verified': True, 'has_organization_projects': True, 'has_repository_projects': True, 'public_repos': 160, 'public_gists': 12, 'followers': 0, 'following': 0,

In [102]:
response = requests.get("http://api.github.com/repos/octocat/Hello-World", headers=headers)
hello_world = response.json()
print(hello_world)

{'id': 1296269, 'node_id': 'MDEwOlJlcG9zaXRvcnkxMjk2MjY5', 'name': 'Hello-World', 'full_name': 'octocat/Hello-World', 'private': False, 'owner': {'login': 'octocat', 'id': 583231, 'node_id': 'MDQ6VXNlcjU4MzIzMQ==', 'avatar_url': 'https://avatars3.githubusercontent.com/u/583231?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/octocat', 'html_url': 'https://github.com/octocat', 'followers_url': 'https://api.github.com/users/octocat/followers', 'following_url': 'https://api.github.com/users/octocat/following{/other_user}', 'gists_url': 'https://api.github.com/users/octocat/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/octocat/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/octocat/subscriptions', 'organizations_url': 'https://api.github.com/users/octocat/orgs', 'repos_url': 'https://api.github.com/users/octocat/repos', 'events_url': 'https://api.github.com/users/octocat/events{/privacy}', 'received_events_url': 'https://api.github.

## Pagination

In [103]:
params = {"per_page":50, "page":1}
response = requests.get("https://api.github.com/users/rakeshsukla53/starred", headers=headers, params=params)

In [104]:
page1_repos = response.json()

In [105]:
print(page1_repos)

[{'id': 126193066, 'node_id': 'MDEwOlJlcG9zaXRvcnkxMjYxOTMwNjY=', 'name': 'drf_tutorial', 'full_name': 'kasulani/drf_tutorial', 'private': False, 'owner': {'login': 'kasulani', 'id': 6205925, 'node_id': 'MDQ6VXNlcjYyMDU5MjU=', 'avatar_url': 'https://avatars1.githubusercontent.com/u/6205925?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/kasulani', 'html_url': 'https://github.com/kasulani', 'followers_url': 'https://api.github.com/users/kasulani/followers', 'following_url': 'https://api.github.com/users/kasulani/following{/other_user}', 'gists_url': 'https://api.github.com/users/kasulani/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/kasulani/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/kasulani/subscriptions', 'organizations_url': 'https://api.github.com/users/kasulani/orgs', 'repos_url': 'https://api.github.com/users/kasulani/repos', 'events_url': 'https://api.github.com/users/kasulani/events{/privacy}', 'received_events_url

In [116]:
params = {"per_page":5, "page":2}
respons = requests.get("https://api.github.com/users/rakeshsukla53/starred")
print(respons.json())

[{'id': 126193066, 'node_id': 'MDEwOlJlcG9zaXRvcnkxMjYxOTMwNjY=', 'name': 'drf_tutorial', 'full_name': 'kasulani/drf_tutorial', 'private': False, 'owner': {'login': 'kasulani', 'id': 6205925, 'node_id': 'MDQ6VXNlcjYyMDU5MjU=', 'avatar_url': 'https://avatars1.githubusercontent.com/u/6205925?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/kasulani', 'html_url': 'https://github.com/kasulani', 'followers_url': 'https://api.github.com/users/kasulani/followers', 'following_url': 'https://api.github.com/users/kasulani/following{/other_user}', 'gists_url': 'https://api.github.com/users/kasulani/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/kasulani/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/kasulani/subscriptions', 'organizations_url': 'https://api.github.com/users/kasulani/orgs', 'repos_url': 'https://api.github.com/users/kasulani/repos', 'events_url': 'https://api.github.com/users/kasulani/events{/privacy}', 'received_events_url

## Requete POST

In [130]:
payload= {"name":"api-scrapping"}
response = requests.post("https://api.github.com/user/repos", json=payload, headers=headers)
status = response.status_code
print(status)

201


## Requete PATCH/PUT

In [131]:
payload = {"name":"api", "description":"test api"}
response = requests.patch("https://api.github.com/repos/Orer0/api-scrapping", json=payload, headers=headers)
status = response.status_code
print(status)

200


## Requete DELETE

In [132]:
response = requests.delete("http://api.github.com/repos/Orer0/api", headers=headers)
print(response.status_code)

204


## Cas pratique API de Reddit

In [133]:
import requests
import requests.auth

In [137]:
#On utilise la methode HTTPBasci de la lib requests.auth pour s'identifier
client_auth = requests.auth.HTTPBasicAuth('e3BkC2YXXT_xzw', 'ef-6VNJ4KYX4B9Y4Y7leCNTbQ8g')
#on ajoute mdp et id du compte reddit
post_data = {"grant_type":"password", "username":"Orer0","password":"dyfy97GEA"}
headers = {'User-agent':'Formation API'}
response = requests.post("https://www.reddit.com/api/v1/access_token", auth=client_auth, data=post_data, headers=headers)
response.json()

{'access_token': '268126869757-VHuAXP2B50XHTa-nZUipM62stKo',
 'token_type': 'bearer',
 'expires_in': 3600,
 'scope': '*'}

In [138]:
headers = {"authorization":"bearer 268126869757-VHuAXP2B50XHTa-nZUipM62stKo", "User-agent":"Formation API"}
params = {"t":"day"}# seuelement dernier jour
#On applique un requete get
response = requests.get("http://oauth.reddit.com/r/python/top", headers=headers, params=params)


In [139]:
print(response.status_code)

200


In [141]:
python_top=response.json()

In [143]:
print(python_top)

{'kind': 'Listing', 'data': {'modhash': None, 'dist': 25, 'children': [{'kind': 't3', 'data': {'approved_at_utc': None, 'subreddit': 'Python', 'selftext': '', 'author_fullname': 't2_4c5by', 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': 'Coding in python sometimes feels like writing a really weird tongue twister...', 'link_flair_richtext': [], 'subreddit_name_prefixed': 'r/Python', 'hidden': False, 'pwls': 6, 'link_flair_css_class': None, 'downs': 0, 'parent_whitelist_status': 'all_ads', 'hide_score': False, 'name': 't3_b1h8za', 'quarantine': False, 'link_flair_text_color': 'dark', 'author_flair_background_color': None, 'subreddit_type': 'public', 'ups': 843, 'domain': 'i.redd.it', 'media_embed': {}, 'author_flair_template_id': None, 'is_original_content': False, 'user_reports': [], 'secure_media': None, 'is_reddit_media_domain': True, 'is_meta': False, 'category': None, 'secure_media_embed': {}, 'link_flair_text': None, 'can_mod_post': False, 'score'

In [148]:
top_article = python_top['data']['children']
print(top_article)

[{'kind': 't3', 'data': {'approved_at_utc': None, 'subreddit': 'Python', 'selftext': '', 'author_fullname': 't2_4c5by', 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': 'Coding in python sometimes feels like writing a really weird tongue twister...', 'link_flair_richtext': [], 'subreddit_name_prefixed': 'r/Python', 'hidden': False, 'pwls': 6, 'link_flair_css_class': None, 'downs': 0, 'parent_whitelist_status': 'all_ads', 'hide_score': False, 'name': 't3_b1h8za', 'quarantine': False, 'link_flair_text_color': 'dark', 'author_flair_background_color': None, 'subreddit_type': 'public', 'ups': 843, 'domain': 'i.redd.it', 'media_embed': {}, 'author_flair_template_id': None, 'is_original_content': False, 'user_reports': [], 'secure_media': None, 'is_reddit_media_domain': True, 'is_meta': False, 'category': None, 'secure_media_embed': {}, 'link_flair_text': None, 'can_mod_post': False, 'score': 843, 'approved_by': None, 'thumbnail': '', 'edited': False, 'author_

In [152]:
most_upvoted = ""
most_upvotes = 0
for row in top_article:
    ar = row['data']
    if ar["ups"] >= most_upvotes:
        most_upvoted = ar["id"]
        most_upvotes = ar["ups"]

In [153]:
print(most_upvoted, most_upvotes)

b1h8za 843


In [158]:
#/r/{subreddit}/comments/{article}
respon = requests.get("http://oauth.reddit.com/r/python/comments/b1h8za", headers=headers)


In [159]:
print(respon.json())

[{'kind': 'Listing', 'data': {'modhash': None, 'dist': 1, 'children': [{'kind': 't3', 'data': {'approved_at_utc': None, 'subreddit': 'Python', 'selftext': '', 'user_reports': [], 'saved': False, 'mod_reason_title': None, 'gilded': 0, 'clicked': False, 'title': 'Coding in python sometimes feels like writing a really weird tongue twister...', 'link_flair_richtext': [], 'subreddit_name_prefixed': 'r/Python', 'hidden': False, 'pwls': 6, 'link_flair_css_class': None, 'downs': 0, 'parent_whitelist_status': 'all_ads', 'hide_score': False, 'name': 't3_b1h8za', 'quarantine': False, 'link_flair_text_color': 'dark', 'upvote_ratio': 0.93, 'author_flair_background_color': None, 'subreddit_type': 'public', 'ups': 856, 'domain': 'i.redd.it', 'media_embed': {}, 'author_flair_template_id': None, 'is_original_content': False, 'author_fullname': 't2_4c5by', 'secure_media': None, 'is_reddit_media_domain': True, 'is_meta': False, 'category': None, 'secure_media_embed': {}, 'link_flair_text': None, 'can_mod

In [160]:
comments = respon.json()

In [163]:
lst = comments[1]['data']['children']

In [165]:
most_upvoted = ""
most_up_votes = 0
for comment in lst:
    co = comment['data']
    if co["ups"] >= most_up_votes:
        most_upvoted = co["id"]
        most_up_votes = co["ups"]
print(most_upvoted, most_up_votes)

eilmyi4 85


## Web Scraping

In [1]:
 import requests

In [2]:
response = requests.get("https://raw.githubusercontent.com/codelikerod/web-scraping/master/exemple1.html")
content = response.content
print(content)

b'<html>\r\n  <head>\r\n      <title> Un exemple de page HTML </title>\r\n  </head>\r\n\r\n  <body>\r\n      <p>Un simple paragraphe</p>\r\n  </body>\r\n</html>'


In [5]:
from bs4 import BeautifulSoup as bs

In [8]:
parser = bs(content, 'html.parser')

#obtenir le tag body du doc html
body = parser.body

#obtenir le tag p de body
p = body.p
print(p)

#afficher seulement le texte
print(p.text)

<p>Un simple paragraphe</p>
Un simple paragraphe
