### Třetí úkol 
bude pro prodejce doplnit hodnotu ve sloupci web vyextrahovanou ze sloupce email, pokud ji nemají uvedenou v příslušném formátu www.mujweb.fr. Pro rozhodnutí o doplnění dané adresy prosím použijte četnostní kritérium: pokud je počet výskytů dané domény větší než nějaké zvolené množství, adresu nedoplňujte. Např. pro orange.fr tak nemá smysl doplňovat adresu webu, neboť se jedná o lokálního zřizovatele schránek podobně jako gmail.com nebo seznam.cz.

In [262]:
import pandas as pd
from collections import Counter
from typing import Any, Dict

Read the CSV file and store it as a pandas DataFrame

In [212]:
# Load 'dealers.csv' (path relative to the working directory) into a DataFrame.
df = pd.read_csv('dealers.csv')

Cast `email` column to a string type

In [213]:
# Force every e-mail value to str. Missing values turn into the literal
# string 'nan', which is why later cells compare against 'nan'.
df['email'] = df['email'].astype('str') 

Create a sub-DataFrame containing only the `web` and `email` columns

In [215]:
# Subset of the data holding only the existing website and the e-mail address.
df_email = df[['web', 'email']]
# df_email.head(20)  # uncomment to preview the first 20 rows

Unnamed: 0,web,email
0,,
1,,filippo@vadilonga.it
2,,cycleschedaleuxcaudan@gmail.com
3,www.cyclexperts-brest.com,contact@cyclexperts-brest.com
4,,cycles.chabbert@orange.fr
5,www.intersport.fr,cycle-epagny@reseau-intersport.fr
6,,lucadezzo@libero.it
7,,plestancycle@orange.fr
8,,contact@becycles.fr
9,,


Convert all the emails to a list type for further iteration

In [283]:
# Materialise the e-mail column as a plain Python list for iteration.
list_of_emails = df['email'].tolist()
# Bare expression: shows the list as the cell output.
list_of_emails

['nan',
 'filippo@vadilonga.it',
 'cycleschedaleuxcaudan@gmail.com',
 'contact@cyclexperts-brest.com',
 'cycles.chabbert@orange.fr',
 'cycle-epagny@reseau-intersport.fr',
 'lucadezzo@libero.it',
 'plestancycle@orange.fr',
 'contact@becycles.fr',
 'nan',
 'cyclesdegueurce@orange.fr',
 'nan',
 'nantes@velo-horizon.fr',
 'info@parisbmx.com',
 'nan',
 'sport2000@gozzi-sport2000.com',
 'anthony@cyclesmagasin.fr',
 'nan',
 'bikeinfinity@libero.it',
 'bordeaux-ouest@cyclable.com',
 'benjamin.perusin@wanadoo.fr',
 'nan',
 'frulli.nicola@gmail.com',
 'bikemania@libero.it',
 'nan',
 'nan',
 'intersportbrignoles@live.fr',
 'nan',
 'nan',
 'ciclomillenniosrl@gmail.com',
 'nan',
 'info@bikerstree.com',
 'c2jvelo@orange.fr',
 'noemotobikesrl@gmail.com',
 'nan',
 'nan',
 'nan',
 'nan',
 'contact@hossegorbike.com',
 'nan',
 'nan',
 'info@bagnolibike.com',
 '69lyon-nord@culturevelo.com',
 'nan',
 'contact@velosdelavalette.fr',
 'info@dolomitibikeshop.com',
 'nan',
 'ebikeworld85@gmail.com',
 'nan',
 'n

In [252]:
def extract_domain(email: str) -> str:
    """
    Return the normalised domain part of an e-mail address.

    The domain is everything after the last '@', with surrounding
    whitespace removed and letters lower-cased, so that spellings such as
    'Gmail.com' and 'gmail.com ' are counted as the same domain.

    Args:
        email (str): The e-mail address to inspect. Expected to already be
            a string; missing values are represented by 'nan'.

    Returns:
        str: The normalised domain. An empty string or the 'nan'
        placeholder is returned unchanged. A value without any '@' is
        returned whole (stripped and lower-cased), as there is nothing
        to split off.

    """
    # Pass the "no e-mail" markers through untouched so callers can still
    # recognise them.
    if not email or email == 'nan':
        return email

    # rpartition splits on the LAST '@'; when there is none, the third
    # element is the whole input.
    domain = email.rpartition('@')[2]
    return domain.strip().lower()


List of extracted e-mail domains

In [277]:
# Translate each address into its domain, keeping the original order.
extracted_domains = list(map(extract_domain, list_of_emails))
# extracted_domains

Frequency of e-mail domains

In [254]:
# Tally how many times each extracted domain occurs.
counter = Counter()
counter.update(extracted_domains)
# (domain, count) pairs ordered from the most to the least frequent.
email_freq = counter.most_common()

In [256]:
# Turn the (domain, count) pairs from most_common() into a
# domain -> count mapping for direct lookups.
email_freq = dict(email_freq)

In [260]:
# Remove the 'nan' placeholder (rows without an e-mail) so it is not
# treated as a domain. The None default keeps this cell from raising
# KeyError when the key is absent, e.g. when the cell is run a second time.
email_freq.pop('nan', None)

1034

In [279]:
def should_fill_website(
    domain: str,
    email_freq: Dict[str, int],
    threshold: int,
) -> bool:
    """
    Decide whether a domain is rare enough to be used as a website.

    Args:
        domain (str): The e-mail domain to look up.
        email_freq (Dict[str, int]): Mapping from domain to the number
            of times it occurs among the e-mail addresses.
        threshold (int): Exclusive upper bound on the occurrence count.

    Returns:
        bool: True when the domain occurs fewer than ``threshold`` times
        (a domain missing from the mapping counts as 0 occurrences),
        False otherwise.

    """
    # Unknown domains are treated as never seen.
    occurrences = email_freq.get(domain, 0)
    return occurrences < threshold


In [280]:
def fill_website(row: pd.Series, threshold: int = 100) -> pd.Series:
    """
    Fill the 'web' column of a DataFrame row from its 'email' column.

    The website is derived only when the row has no usable 'web' value
    (NaN/None or a blank string) and the 'email' value is a string
    containing an '@'. The domain is taken with ``extract_domain`` and is
    used only if ``should_fill_website`` accepts it against the
    module-level ``email_freq`` mapping.

    Args:
        row (pd.Series): A row from the DataFrame
            containing the 'email' and 'web' columns.
        threshold (int): Frequency limit forwarded to
            ``should_fill_website``. Defaults to 100. It can be
            overridden through ``df.apply(fill_website, axis=1,
            threshold=...)``.

    Returns:
        pd.Series: The original row when nothing has to be filled,
        otherwise a copy of it with 'web' set to ``'www.<domain>'``.

    """
    email = row['email']
    web = row['web']

    # Treat NaN/None and whitespace-only strings alike as "no website".
    web_missing = pd.isna(web) or (isinstance(web, str) and not web.strip())
    # Require a real address: this rejects NaN floats, '', the 'nan'
    # placeholder and any text without an '@'.
    email_usable = isinstance(email, str) and '@' in email

    if not (web_missing and email_usable):
        return row

    domain = extract_domain(email)
    # An address ending in '@' yields an empty domain; never write 'www.'.
    if not domain:
        return row

    if not should_fill_website(domain, email_freq, threshold=threshold):
        return row

    # Work on a copy: DataFrame.apply does not support functions that
    # mutate the object they are given.
    filled = row.copy()
    filled['web'] = f'www.{domain}'
    return filled


In [282]:
# Run fill_website on every row (axis=1) and rebind df to the result.
df = df.apply(fill_website, axis=1)
# Bare expression: shows the first 20 rows as the cell output.
df.head(20)

Unnamed: 0,id,brand,eshop,lat,lng,country,address,web,telephone,email
0,6105d615-25c7-49b8-b564-2b5801f5c156,gt,Vaunage Passion Velos Sarl,43.813962,4.348726,France,"165 Av. Jean Prouvé, Nîmes, 30000, FR",,466361627,
1,6441b601-37d9-4862-9a1e-3d47e8fcf12f,scott,VADILONGA SRL,39.251569,9.138519,Italy,"VIALE ELMAS 172, CAGLIARI, 09122",www.vadilonga.it,+39070240537,filippo@vadilonga.it
2,9534c4c5-a68b-4d80-a0fc-6a98cd7f06b0,kalkhoff,SARL Cycles Charreteur,47.785181,-3.340796,France,"Rue Jean Baptiste Mertenot 330, Caudan, 56850",,+33297331790,cycleschedaleuxcaudan@gmail.com
3,850475e2-fe0a-493d-8d7c-95e6c608a17e,scott,CYCLEXPERTS BREST,48.42837,-4.458959,France,"RUE ROSEMONDE GERARD 5, GOUESNOU, 29850",www.cyclexperts-brest.com,+330298425857,contact@cyclexperts-brest.com
4,989897bd-430a-412e-9f6e-0120b783c10c,superior,CYCLES CHABBERT,43.4612,3.42283,France,"1 AVENUE FRNCOIS HUE, PEZENAS, 34120",,04.67.98.12.90,cycles.chabbert@orange.fr
5,aa4df157-9987-4536-a119-b0349cf331f4,scott,INTERSPORT EPAGNY,45.934126,6.086538,France,"RUE DU PARMELAN 2, EPAGNY, 74330",www.intersport.fr,+330450222822,cycle-epagny@reseau-intersport.fr
6,af9dd896-a3c7-4000-b6a9-8de154b9d696,haibike,La Bici Volante,45.590835,11.522423,Italy,"Via Gardellina 28, 36030 Rettorgole di Caldogn...",www.libero.it,+390444986265,lucadezzo@libero.it
7,b1429343-1d39-42bc-b29b-2438f49136b1,gt,PLESTAN CYCLES,48.465099,-2.510837,France,"28 RUE CHANTOINE DU TEMPLE, LAMBALLE, 22400, FR",,+330296341690,plestancycle@orange.fr
8,d954d861-06df-4685-8498-2c7d33c2e3d3,haibike,BE CYCLES,45.76671,3.12081,France,"267 RUE DE L ORADOU, 63000 CLERMONT-FERRAND",www.becycles.fr,+33473262624,contact@becycles.fr
9,0983af7a-62a3-4435-aa6c-72a24091b9f9,cannondale,LordGun,45.524746,10.196167,Italy,"Via Fura, 7, Brescia, 25125",,0303757092,
