# MODS203 Project
## Ryan Borhani, Mathilde Froger, Apolline Isaia, Solal Urien
### Load libraries

In [1]:
import pandas as pd
import urllib.request
import re
import requests
import random
import unidecode

from bs4 import BeautifulSoup as bs
from urllib.request import Request, urlopen

### List of towns

In [2]:
my_file = open("Communes.txt", "r")
data = my_file.read()
town_list = data.split("\n")
my_file.close()

### Normalizing the names

In [3]:
for k in range(len(town_list)) :
    town_list[k] = unidecode.unidecode(town_list[k].lower())
    town_list[k] = town_list[k].replace('\'','-')
    town_list[k] = town_list[k].replace(' ','-')
    town_list[k] = town_list[k].replace('ç','c')
len(town_list)

190

### Adding the *arrondissements* of Paris, Marseille and Lyon

In [4]:
for k in range (1,21): #Paris has 20 arrondissements
    town_list.insert(1,'paris-'+str(75000+k))
    if k<17: #Marseille has 16 arrondissements
        town_list.insert(2+k,'marseille-'+str(13000+k))
    if k<10: #Lyon has 9 arrondissements
        town_list.insert(3+2*k,'lyon-'+str(69000+k))

In [5]:
town_list

['paris',
 'paris-75020',
 'paris-75019',
 'paris-75018',
 'paris-75017',
 'paris-75016',
 'paris-75015',
 'paris-75014',
 'paris-75013',
 'paris-75012',
 'paris-75011',
 'paris-75010',
 'paris-75009',
 'paris-75008',
 'paris-75007',
 'paris-75006',
 'paris-75005',
 'paris-75004',
 'paris-75003',
 'paris-75002',
 'paris-75001',
 'marseille',
 'marseille-13016',
 'marseille-13015',
 'marseille-13014',
 'marseille-13013',
 'marseille-13012',
 'marseille-13011',
 'marseille-13010',
 'marseille-13009',
 'marseille-13008',
 'marseille-13007',
 'marseille-13006',
 'marseille-13005',
 'marseille-13004',
 'marseille-13003',
 'marseille-13002',
 'marseille-13001',
 'lyon',
 'lyon-69009',
 'lyon-69008',
 'lyon-69007',
 'lyon-69006',
 'lyon-69005',
 'lyon-69004',
 'lyon-69003',
 'lyon-69002',
 'lyon-69001',
 'toulouse',
 'nice',
 'nantes',
 'montpellier',
 'strasbourg',
 'bordeaux',
 'lille',
 'rennes',
 'reims',
 'saint-etienne',
 'le-havre',
 'toulon',
 'grenoble',
 'dijon',
 'angers',
 'nimes'

### Make the request and extract the HTML code
For all departments, we request the HTML code. For this doing, we use the previous list and convert the strings into the model that is used for the *Doctolib* URL adress.
The collection of the data was particularly complex given the reaction of the *Doctolib* website.

**Warning 1: the repeated requests might provoke a 403 HTTP error, using a VPN might therefore be useful**

**Warning 2: The code might be particularly long to compute** 

In [6]:
hdr = {'User-Agent': 'Mozilla/6.0'}
list_HTML = []

for town in town_list:
    url = 'https://www.doctolib.fr/medecin-generaliste/'+town
    req = Request(url,headers=hdr)
    page = urlopen(req)
    soup = bs(page,'html.parser')
    list_HTML.append(soup)

In [7]:
len(list_HTML)

235

This list has to be heavily processed before we can add the information to a Dataframe.

## Data Processing
We previously learned (1st part of the project) how the HTML code was structured.
We therefore divide the list in different parts, each containing the information of all the doctors in a given department.

In [8]:
all_doc = []
for dep_doc in list_HTML:
    all_doc = all_doc + [str(dep_doc.findAll(type= "application/ld+json")[1])]
len(all_doc)

235

In [9]:
all_doc[0]

'<script type="application/ld+json">[{"@context":"http://schema.org/","@type":"Physician","name":"Katia SEBBAN","medicalSpecialty":"Médecin généraliste","legalName":"","url":"/medecin-generaliste/paris/katia-sebban","address":{"@type":"PostalAddress","name":"","streetAddress":"178 Bis, Rue Pelleport","postalCode":"75020","addressLocality":"Paris"},"paymentAccepted":"Cash, Check, Credit card"},{"@context":"http://schema.org/","@type":"Hospital","name":"Centre médical Réaumur - CPAM Paris","medicalSpecialty":"Centre médical et dentaire","legalName":null,"url":"/centre-medical-et-dentaire/paris/centre-medical-reaumur-cpam-paris","address":{"@type":"PostalAddress","name":"Centre médical Réaumur - CPAM Paris","streetAddress":"106 Rue Réaumur","postalCode":"75002","addressLocality":"Paris"},"paymentAccepted":"Cash, Check, Credit card"},{"@context":"http://schema.org/","@type":"Physician","name":"Sven THILL","medicalSpecialty":"Médecin généraliste","legalName":null,"url":"/medecin-generaliste

We subsequently divide the list, each part containing all the information of a single doctor.

In [10]:
list_inf = []

for depart_doc in all_doc:
    cur_nb = depart_doc.find('{') #our indice
    cur_parent = 1 #nb of parenthesis to close

    for k in range (cur_nb+1, len(depart_doc)):
        if (depart_doc[k]=='{'):
            cur_parent +=1
        elif (depart_doc[k] == '}'):
            cur_parent -=1 
            if (cur_parent == 0): #we found the parenthesis that closes the first parenthesis
                list_inf.append(depart_doc[cur_nb+1:k])
                cur_nb = depart_doc.find('{',k)
len(list_inf)

4751

We now have numerous strings, each describing a doctor.
We are going to process these strings one by one to obtain exactly the dataframe we want. We also add more information to the DataFrame with the *Payment_accepted* column.

In [15]:
list_name, list_type, list_spe,list_adr,list_post_cod, list_cit, list_pay = [],[],[],[],[],[],[]

for doc in list_inf: #boucle pour les noms des médecins ou des hopitals
    i=0
    cur_nb = doc.find('name')
    while (doc[cur_nb+6+i] != ','):
        i+=1
    list_name.append(doc[cur_nb+7:cur_nb+5+i])

for doc in list_inf: #boucle pour les type des médecins/hopitals
    i=0
    cur_nb = doc.find('type')
    while (doc[cur_nb+6+i] != ','):
        i+=1
    list_type.append(doc[cur_nb+7:cur_nb+5+i])

for doc in list_inf: #boucle pour les spécialités
    i=0
    cur_nb = doc.find('medicalSpecialty')
    while (doc[cur_nb+18+i] != ','):
        i+=1
    list_spe.append(doc[cur_nb+19:cur_nb+17+i])

for doc in list_inf: #boucle pour les villes
    i=0
    cur_nb = doc.find('addressLocality')
    while (doc[cur_nb+16+i] != ','):
        i+=1
    list_cit.append(doc[cur_nb+18:cur_nb+14+i])

for doc in list_inf: #boucle pour les adresses (rue et numéro)
    i=0
    cur_nb = doc.find('streetAddress')
    while (doc[cur_nb+6+i] != ','):
        i+=1
    list_adr.append(doc[cur_nb+16:cur_nb+5+i])

for doc in list_inf:#boucle pour les codes postaux
    i=0
    cur_nb = doc.find('postalCode')
    while (doc[cur_nb+11+i] != ','):
        i+=1
    try:
        list_post_cod.append(int(doc[cur_nb+13:cur_nb+10+i].replace(' ',''))) # le code postal est un entier, on applique donc un rapide formatage avant de transformer le string en int
    except:
        list_post_cod.append(doc[cur_nb+13:cur_nb+10+i]) #Si la transformation en integer n'est pas possible, on laisse le format string

for doc in list_inf: #boucle pour les moyens de paiement
    i=0
    cur_nb = doc.find('paymentAccepted')
    while (cur_nb+i != len(doc)):
        i+=1
    list_pay.append(doc[cur_nb+18:len(doc)-1])

data = {'Name':list_name,'Type':list_type, 'Speciality':list_spe,'Address':list_adr,'City':list_cit,'Postal_Code':list_post_cod, 'Payment_accepted':list_pay}
df = pd.DataFrame(data)
display(df)

Unnamed: 0,Name,Type,Speciality,Address,City,Postal_Code,Payment_accepted
0,Katia SEBBAN,Physician,Médecin généraliste,178 Bi,Paris,75020,"Cash, Check, Credit card"
1,Centre médical Réaumur - CPAM Paris,Hospital,Centre médical et dentaire,106 Rue Réaumur,Paris,75002,"Cash, Check, Credit card"
2,Sven THILL,Physician,Médecin généraliste,77 Rue Pelleport,Paris,75020,"Cash, Check, Credit card"
3,Simon OHAYON,Physician,Médecin généraliste,33 Rue du Ranelagh,Paris,75016,"Cash, Check, Credit card"
4,Amélie Aïm-Eusébi,Physician,Médecin généraliste,186 Boulevard Ney,Paris,75018,"Cash, Check, Credit card"
...,...,...,...,...,...,...,...
4746,Centre municipal et universitaire de santé Arc...,Hospital,Centre de santé,3 Rue du 8 Mai 1945,Arcueil,94110,
4747,Bodosahondra ANDRIAMANJATO,Physician,Médecin généraliste,15 Avenue Georges Clemenceau,Sceaux,92330,"Cash, Check, Credit card"
4748,Lena LANGLOIS,Physician,Médecin généraliste,9 Rue Jacques Margottin,Bourg-la-Reine,92340,"Cash, Check, Credit card"
4749,Delphine LAW-TO-LAGASSE,Physician,Médecin généraliste,6 Avenue de Verdun,Montrouge,92120,"Cash, Check, Credit card"


### Export the Data

In [13]:
df.to_csv(r"Dataframe-Generalist.csv", index=False)