# MODS203 Project
## Ryan Borhani, Mathilde Froger, Apolline Isaia, Solal Urien
### Load libraries

In [1]:
import pandas as pd
import urllib.request
import re
import requests
import random
import unidecode
import time

from urllib.error import HTTPError
from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen

### List of towns

In [2]:
# Read the list of towns (one name per line).
# The context manager guarantees the file is closed even if reading fails.
with open("Communes.txt", "r") as my_file:
    data = my_file.read()
town_list = data.split("\n")

### Normalizing the names

In [3]:
# Turn every town name into its URL form, in place:
# lower-case, pass through unidecode, then apply the character substitutions below.
for idx, raw_name in enumerate(town_list):
    slug = unidecode.unidecode(raw_name.lower())
    for old, new in (("'", '-'), (' ', '-'), ('ç', 'c')):
        slug = slug.replace(old, new)
    town_list[idx] = slug
len(town_list)

190

### Adding the *arrondissements* of Paris, Marseille and Lyon

In [4]:
# Insert one entry per arrondissement ('<city>-<postal code>') into town_list.
# NOTE(review): the index arithmetic presumably relies on town_list starting with
# the entries for Paris, Marseille and Lyon, in that order — confirm against Communes.txt.
# NOTE: this cell is not idempotent; running it twice inserts every entry twice.
for k in range (1,21): # Paris has 20 arrondissements (k = 1..20)
    town_list.insert(1,'paris-'+str(75000+k)) # always inserted at index 1, so the Paris entries end up in descending order
    if k<17: # Marseille has 16 arrondissements
        town_list.insert(2+k,'marseille-'+str(13000+k)) # the k Paris entries inserted so far shift the target index to 2+k
    if k<10: # Lyon has 9 arrondissements
        town_list.insert(3+2*k,'lyon-'+str(69000+k)) # shifted by the k Paris and k Marseille entries inserted so far

In [5]:
# Sanity check: 190 towns + 20 + 16 + 9 arrondissement entries = 235
len(town_list)

235

### List of specialities

In [135]:
# Read the list of medical specialities (one per line).
# The context manager guarantees the file is closed even if reading fails.
with open("Speciality.txt", "r") as my_file2:
    data2 = my_file2.read()
spe_list = data2.split("\n")
spe_list

['addictologue',
 'cancerologue',
 'chirurgien',
 'dermatologue',
 'dieteticien',
 'gynecologue',
 'infectiologue',
 'neurologue',
 'ophtalmologue',
 'pediatre',
 'pneumologue',
 'psychiatre',
 'radiologue',
 'rhumatologue']

In [137]:
# Keep only the four specialities that are scraped in this notebook.
# Selecting by name (instead of by position 0, 1, 4 and 8) keeps the cell idempotent:
# re-running it on the already-reduced list no longer raises IndexError.
SELECTED_SPECIALITIES = ('addictologue', 'cancerologue', 'dieteticien', 'ophtalmologue')
spe_list = [speciality for speciality in spe_list if speciality in SELECTED_SPECIALITIES]

Because of time constraints and of the limits imposed on scraping (in particular the need for a VPN and the small amount of data collected per request), we select four specialities to scrape.
We will nevertheless obtain some of the other specialities from the list above, because the results returned by *Doctolib* for a given specialist are imprecise.

In [138]:
# Display the reduced list of specialities that will be scraped
spe_list

['addictologue', 'cancerologue', 'dieteticien', 'ophtalmologue']

The speciality strings are already normalized.
### Make the request and extract the HTML code
For every town, we request the HTML code of the corresponding results page. To do so, we use the previous lists and build each URL following the pattern used by *Doctolib*: `https://www.doctolib.fr/<speciality>/<town>`.
The collection of the data was particularly complex given the reaction of the *Doctolib* website.

**Warning 1: repeated requests trigger 403 or 429 HTTP errors; whenever you get one, you have to switch to another VPN address or proxy before running the cell again.**

**Warning 2: the code may take a particularly long time to run, especially through a VPN.**


In [238]:
# Download one Doctolib results page per town for the currently selected speciality
# and keep the parsed HTML of every page that exists.
hdr = {'User-Agent': 'Mozilla/6.0'}
REQUEST_TIMEOUT = 30  # seconds; without a timeout urlopen can block forever on a stalled connection
list_HTML = []
curr_spe = 0  # index in spe_list of the speciality scraped by this run
spe = spe_list[curr_spe]

for town in town_list:
    if not town:  # skip blank entries (e.g. produced by a trailing newline in the towns file)
        continue
    url = 'https://www.doctolib.fr/'+spe+'/'+town
    req = Request(url,headers=hdr)
    try:
        # The context manager closes the HTTP response as soon as the page is parsed.
        with urlopen(req, timeout=REQUEST_TIMEOUT) as page:
            soup = bs(page,'html.parser')
    except HTTPError as err:
        # Only 404 is tolerated: these are the towns that do not have doctors.
        # Any other HTTP error (e.g. 403 or 429) is re-raised and stops the loop.
        if err.code != 404:
            raise
        print(town)
    else:
        list_HTML.append(soup)

In [239]:
# Number of pages that were fetched and parsed for the current speciality
len(list_HTML)

3

This list has to be heavily processed before we can add the information to a Dataframe.

## Data Processing
We previously learned (1st part of the project) how the HTML code was structured.
We therefore divide the list in different parts, each containing the information of all the doctors in a given department.

In [240]:
# For every fetched page, keep the second element whose type attribute is
# "application/ld+json", converted to a plain string.
all_doc = [
    str(page_soup.find_all(type="application/ld+json")[1])
    for page_soup in list_HTML
]
len(all_doc)

3

We subsequently divide the list, each part containing all the information of a single doctor.

In [241]:
def _top_level_objects(text):
    """Yield the text found between each outermost '{' ... '}' pair of ``text``.

    Nested braces are tracked with a depth counter, so each yielded fragment
    still contains its inner '{...}' groups; the outer braces are not included.
    """
    opening = text.find('{')  # position of the current outermost opening brace
    depth = 1                 # number of braces still waiting to be closed
    for pos in range(opening + 1, len(text)):
        char = text[pos]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:  # this brace closes the current outermost one
                yield text[opening + 1:pos]
                opening = text.find('{', pos)


list_inf = [
    fragment
    for page_json in all_doc
    for fragment in _top_level_objects(page_json)
]
len(list_inf)

60

We now have numerous strings, each describing a doctor.
We are going to process these strings one by one to obtain exactly the dataframe we want. We also add more information to the DataFrame with the *Payment_accepted* column.

In [242]:
def _extract_field(doc, key):
    """Return the raw string value stored under ``key`` in one entry of ``list_inf``.

    The first occurrence of ``key":"<value>"`` is used, and the value runs up to
    its closing (unescaped) double quote. This replaces fixed character offsets
    and a scan for the next comma, which cut any value containing a comma.

    Parameters
    ----------
    doc : str
        One fragment of ``list_inf`` (the text between the outer braces of an entry).
    key : str
        Key name, or the end of a key name (e.g. ``'type'`` also matches ``"@type"``).

    Returns
    -------
    str
        The value without its surrounding quotes, or ``''`` when the key is absent.
    """
    pattern = re.escape(key) + r'"\s*:\s*"((?:[^"\\]|\\.)*)"'
    match = re.search(pattern, doc, flags=re.DOTALL)
    return match.group(1) if match else ''


def _parse_postal_code(raw_code):
    """Return the postal code as an int, or the raw string if it is not numeric."""
    try:
        # The postal code is an integer: strip spaces before converting.
        return int(raw_code.replace(' ', ''))
    except ValueError:
        # If the conversion is not possible, keep the string as it is.
        return raw_code


# One list per column, built from the fragment describing each doctor or hospital.
list_name = [_extract_field(doc, 'name') for doc in list_inf]              # doctor or hospital name
list_type = [_extract_field(doc, 'type') for doc in list_inf]              # type of the entry
list_spe = [_extract_field(doc, 'medicalSpecialty') for doc in list_inf]   # speciality
list_cit = [_extract_field(doc, 'addressLocality') for doc in list_inf]    # city
list_adr = [_extract_field(doc, 'streetAddress') for doc in list_inf]      # street and number
list_post_cod = [_parse_postal_code(_extract_field(doc, 'postalCode')) for doc in list_inf]  # postal code
list_pay = [_extract_field(doc, 'paymentAccepted') for doc in list_inf]    # accepted means of payment

data = {'Name':list_name,'Type':list_type, 'Speciality':list_spe,'Address':list_adr,'City':list_cit,'Postal_Code':list_post_cod, 'Payment_accepted':list_pay}
df = pd.DataFrame(data)
display(df)

Unnamed: 0,Name,Type,Speciality,Address,City,Postal_Code,Payment_accepted
0,Noemie HENNEQUIN-BOI,Physician,Diététicienne,2 Rue Général de Castelnau,Thionville,57100,"Cash, Credit card"
1,Laurane Aboualchamat,Physician,Diététicienne,5 Rue des Landes,Thionville,57100,"Cash, Check, Credit card"
2,Camille Chiaro,Physician,Diététicienne,1a Rue des Romains,Yutz,57970,"Cash, Check"
3,Adrien Pecourt,Physician,Diététicien,Rue Jean-Pierre BELTOISE,Terville,57180,"Cash, Check, Credit card"
4,Pharmacie Cleyet Merle,Hospital,Pharmacie,61 Route de Kuntzig,Yutz,57970,
5,Selin ZORLU,Physician,Diététicienne,1 Rue des Hirondelles,Basse-Ham,57970,"Cash, Check"
6,Claire Boulard,Physician,Diététicienne,11 Place Saint-Benoît,Guénange,57310,"Cash, Check"
7,Laetitia Dupoirier,Physician,Diététicienne,1 Rue de l'Église,Metzervisse,57940,"Cash, Check"
8,Valérie Sion,Physician,Diététicienne,2a Route d'Evange,Breistroff-la-Grande,57570,"Cash, Check, Credit card"
9,Méryl Audia,Physician,Diététicienne,46 Rue de Nancy,Amnéville,57360,"Cash, Check"


### Export the Data

In [243]:
# Append the rows of this run to the CSV file, without index column and without header row.
# Running this cell again appends the same rows a second time.
output_csv = "Dataset-Speciality.csv"
df.to_csv(output_csv, mode='a', index=False, header=False)