# Loading local data

In [120]:
from os import listdir
import pandas as pd

In [121]:
# Load every file found in ../chart into a nested dict: dfs_chart[year][month].
# The year is the integer before the first "_" of the file name; the month is
# the text between that "_" and the following ".".
files = listdir("../chart")

dfs_chart = {}
for filename in files:
    name_parts = filename.split("_")
    year = int(name_parts[0])
    month = name_parts[1].split(".")[0]

    # Create the per-year dict on first sight of that year.
    charts_of_year = dfs_chart.setdefault(year, {})
    charts_of_year[month] = pd.read_csv("../chart/" + filename)

In [122]:
# Preview one loaded chart: the frame stored under year 2006, month "Janvier".
dfs_chart[2006]["Janvier"]

Unnamed: 0,Rank,Artist,Music,Naissance,Pays d'origine,Origine,Nationalité,Pays
0,1,Juanes,La Camisa Negra,"9 août 1972 (50 ans)Medellin, Colombie",,,,
1,2,Madonna,Hung Up,"16 août 1958 (64 ans)Bay City, Michigan (États...",,,Américaine,
2,3,Johnny Hallyday,Mon Plus Beau Noël,15 juin 1943 Paris 9e (France),,,Française,
3,4,Tina Arena,Aimer Jusqu'à L'impossible,"1er novembre 1967 (55 ans)Melbourne, Australie",,,,
4,5,Star Academy,Santiano,,,,,France
...,...,...,...,...,...,...,...,...
95,96,Jenifer,Serre-moi,15 novembre 1982 (40 ans)Nice,,,,
96,97,Lucie Bernardoni,"Petit Rat, Petit Loup","4 février 1987 (35 ans)Nice, Alpes-Maritimes, ...",,,,
97,98,Ilona Mitrecey,C'est Les Vacances,1er septembre 1993 (29 ans)Fontenay-aux-Roses ...,,,,
98,99,Kanye West,Gold Digger,"24 octobre 1986 (36 ans)Toronto (Ontario, Canada)",,,,


# Suppression des dernières colonnes

In [123]:
# Remove the artist-metadata columns from every chart frame (in place).
metadata_columns = ["Naissance", "Pays d'origine", "Origine", "Pays", "Nationalité"]
for charts_by_month in dfs_chart.values():
    for chart in charts_by_month.values():
        # errors="ignore": frames that do not carry these columns (cell run a
        # second time, or charts that never had them) are left untouched
        # instead of raising KeyError.
        chart.drop(columns=metadata_columns, inplace=True, errors="ignore")

# Scraping chart

In [33]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
import numpy as np
from tqdm import tqdm

In [33]:
# Years to scrape: 2004 through 2022 inclusive.
years = np.arange(2004, 2023)

# Twelve week numbers spread evenly between week 1 and week 52 (fractions are
# truncated); each one is paired by position with a month name below.
weeks = np.linspace(1, 52, 12).astype(int).tolist()

# Month labels (unaccented French) used as keys for the sampled weeks.
mois = [
    "Janvier", "Fevrier", "Mars", "Avril", "Mai", "Juin",
    "Juillet", "Aout", "Septembre", "Octobre", "Novembre", "Decembre",
]

In [34]:
# Download one chart page per (year, sampled week) from acharts.co and store
# the parsed table as dfs_chart[year][month label]. Pages that cannot be
# fetched or that contain no chart table are skipped.
dfs_chart = {y: {} for y in years}
for y in tqdm(years):
    for k, w in enumerate(weeks):
        url = f"https://acharts.co/france_singles_top_100/{y}/{w}"
        rq = get(url)
        if not rq.ok:
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and to avoid bs4's warning).
        soup = BeautifulSoup(rq.text, "html.parser")

        chart = soup.find("table", {"id": "ChartTable"})
        if chart is None:
            # Page was served but has no chart table: nothing to record.
            continue

        data = []
        # The first <tr> is skipped -- presumably the header row (TODO confirm).
        for tr in chart.find_all("tr")[1:]:
            music_tag = tr.find("span", {"itemprop": "name"})
            rank_tag = tr.find("span", {"itemprop": "position"})
            artist_tag = tr.find("span", {"itemprop": "byArtist"})
            if music_tag is None or rank_tag is None or artist_tag is None:
                # Row without the expected fields: skip it rather than crash.
                continue

            # [2:-1] trims two leading and one trailing character around the
            # artist text -- presumably markup padding; verify against the page.
            artist_name = artist_tag.text[2:-1]
            data.append([rank_tag.text, artist_name, music_tag.text])

        dfs_chart[y][mois[k]] = pd.DataFrame(data, columns=["Rank", "Artist", "Music"])

100%|██████████| 19/19 [02:54<00:00,  9.20s/it]


In [124]:
# (chart spelling, replacement spelling) pairs applied to the "Artist" column
# before the Wikipedia lookup.
names = [
    ("Djadja", "Djadja et Dinaz"),
    ("Lorie", "Lorie Pester"),
    ("-M-", "Matthieu Chedid"),
    ("Priscilla", "Priscilla Betti"),
    ("I Am", "IAM"),
    ("Sofiane", "Sofiane Zermani"),
    # Closing parenthesis was missing in this replacement.
    ("Justice", "Justice (groupe)"),
    ("Soma Riba", "Collectif Métissé"),
    ("Fresh", "Fresh la Peufra"),
    ("Rosalia", "Rosalía"),
    ("Italo Brothers", "ItaloBrothers"),
    ("Far*east Movement", "Far East Movement"),
    ("Odyssey", "Odyssey (groupe)"),
    ("1789", "1789 : Les Amants de la Bastille"),
    ("Clemence", "Clémence Saint-Preux"),
]

# Replace only in the "Artist" column: a frame-wide replace would also rewrite
# any other cell (e.g. a song title) that happens to equal one of the keys.
# A single dict-based replace also avoids one full pass per alias.
artist_aliases = dict(names)
for charts_by_month in dfs_chart.values():
    for chart in charts_by_month.values():
        chart["Artist"] = chart["Artist"].replace(artist_aliases)

# Scraping birth

In [125]:
import wikipedia
from nltk.metrics.distance import edit_distance

In [126]:
# Sentinel (a single space) meaning "no value": returned when the Wikipedia
# search finds no page, and used as the default for every scraped field.
MISSING = " "

In [127]:
def find_title_in_wikipedia(title):
    """Choose the French Wikipedia page title that best matches ``title``.

    Up to 10 search results are requested. The first result whose lowercased
    title contains one of the known qualifiers (such as "(chanteur)" or
    "(groupe)") is returned straight away. If no result carries a qualifier,
    the result with the smallest edit distance to ``title`` is returned.
    ``MISSING`` is returned when the search yields no result at all.
    """
    qualifiers = ["(chanteur)", "(chanteuse)", "(groupe)", "(rappeur)", "(rappeuse)", "(musicien)", "(chanteur français)", "(france)", "(producteur)", "(artiste)"]

    wikipedia.set_lang("fr")
    candidates = wikipedia.search(title, results=10)
    if len(candidates) == 0:
        return MISSING

    # A qualified title wins outright, in search-result order.
    for candidate in candidates:
        lowered = candidate.lower()
        if any(qualifier in lowered for qualifier in qualifiers):
            return candidate

    # Otherwise fall back to the closest title by edit distance.
    distances = [edit_distance(title, candidate) for candidate in candidates]
    return candidates[np.argmin(distances)]

In [128]:
def list_to_string(l):
    """Concatenate the strings in ``l`` into one string, separated by single spaces."""
    separator = " "
    return separator.join(l)

In [129]:
def wiki_birth(title):
    """Scrape birth/origin fields for a French Wikipedia page.

    Parameters
    ----------
    title : str
        Wikipedia page title, or ``MISSING`` when no page was found.

    Returns
    -------
    dict
        Keys "Naissance", "Pays d'origine", "Origine", "Nationalité" and
        "Pays". Each value is the stripped text of the table cell found next
        to a matching header, or ``MISSING`` when nothing was found.

    When no table on the page yields any field, the first sentence of the
    page summary is searched for a known nationality keyword and the first
    match is stored under "Nationalité". Failures of that fallback lookup are
    ignored (best effort); the all-``MISSING`` dict is returned in that case.
    """
    # Local import so the function carries its own dependency.
    from urllib.parse import quote

    cols = ["Naissance", "Pays d'origine", "Origine", "Nationalité", "Pays"]
    nats = ["franco", "français", "belge", "canadien", "libanais", "réunionnais"]
    dic = {w: MISSING for w in cols}

    if title == MISSING:
        return dic

    # Percent-encode the title so characters such as "?" or "#" stay part of
    # the page path instead of being read as a query string or fragment.
    url = f"https://fr.wikipedia.org/wiki/{quote(str(title))}"
    rq = get(url)
    if not rq.ok:
        return dic

    # Explicit parser: avoids bs4's "no parser specified" warning and keeps
    # the parse independent of which optional parsers are installed.
    soup = BeautifulSoup(rq.text, "html.parser")

    for table in soup.find_all("table"):
        for tr in table.find_all("tr"):
            th = tr.find("th")
            td = tr.find("td")
            if th is None or td is None:
                continue
            for w in cols:
                # Substring match on the header text: a header containing
                # "Pays d'origine" therefore fills "Pays" as well.
                if w in th.text:
                    dic[w] = td.text.strip()

    if any(value != MISSING for value in dic.values()):
        return dic

    # Nothing found in the tables: fall back to the summary's first sentence.
    wikipedia.set_lang("fr")
    try:
        summary = wikipedia.summary(title, sentences=1)
    except Exception:
        # Best effort only; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return dic

    for w in nats:
        if w in summary:
            dic["Nationalité"] = w
            break

    return dic

In [130]:
# Gather the "Artist" values of every chart, then keep each distinct name once
# in a single-column frame.
artist = [
    name
    for charts_by_month in dfs_chart.values()
    for chart in charts_by_month.values()
    for name in chart["Artist"].tolist()
]

artist = pd.DataFrame(list(set(artist)), columns=["Artist"])

In [131]:
# Resolve every artist name to a French Wikipedia page title (or MISSING);
# find_title_in_wikipedia performs one Wikipedia search per artist.
artist["Artist_wiki"] = artist["Artist"].apply(find_title_in_wikipedia)

In [132]:
# Scrape the birth/origin fields for each resolved page title; the result
# holds one dict per artist, as returned by wiki_birth.
birth_dic = artist["Artist_wiki"].apply(wiki_birth)



  lis = BeautifulSoup(html).find_all('li')


In [133]:
# Turn the per-artist dicts into one frame: one row per artist (numbered
# 0..n-1, in the order of birth_dic) and one column per scraped field.
# Building the frame once from the list of dicts replaces the former
# "one DataFrame per dict + concat" approach, which was much slower and
# raised ValueError when there was nothing to concatenate.
birth = pd.DataFrame(birth_dic.tolist())

In [134]:
# Attach the scraped fields to the artist table; rows are matched on the
# index of both frames.
artist = artist.merge(birth, left_index=True, right_index=True)

In [135]:
# The resolved page title is no longer needed; index the table by artist name
# so it can be joined onto the charts.
artist = artist.drop(columns=["Artist_wiki"]).set_index("Artist")

In [136]:
# Add the artist fields to every chart, keeping every chart row (left join on
# the "Artist" column against the artist index).
for charts_by_month in dfs_chart.values():
    for month, chart in charts_by_month.items():
        charts_by_month[month] = chart.join(artist, on="Artist", how="left")

# Export

In [137]:
# Write each chart to ../chart/<year>_<month>.csv (UTF-8 with BOM, no index column).
for year, charts_by_month in dfs_chart.items():
    for month, chart in charts_by_month.items():
        chart.to_csv(f"../chart/{year}_{month}.csv", encoding="utf-8-sig", index=False)

# Test

In [138]:
# Artists for which every scraped field still holds the MISSING sentinel.
scraped_fields = ["Naissance", "Pays d'origine", "Origine", "Nationalité", "Pays"]
nothing_found = (artist[scraped_fields] == MISSING).all(axis=1)
missing = artist[nothing_found]

In [139]:
# Stack all charts into one long frame with a fresh 0..n-1 index.
dfs = [
    chart
    for charts_by_month in dfs_chart.values()
    for chart in charts_by_month.values()
]

df = pd.concat(dfs, ignore_index=True)

In [140]:
# Without the rank, rows that repeat across charts become identical and are
# collapsed into one.
df = df.drop(columns=["Rank"]).drop_duplicates()

In [141]:
# Flag rows where every scraped field still holds the MISSING sentinel.
scraped_fields = ["Naissance", "Pays d'origine", "Origine", "Nationalité", "Pays"]
df["MISSING"] = df[scraped_fields].eq(MISSING).all(axis=1)

In [142]:
# Number of flagged rows per artist, largest first; show artists with more
# than two.
missing_per_artist = df.groupby("Artist")["MISSING"].sum()
tab = missing_per_artist.sort_values(ascending=False)
tab.loc[tab > 2]

Artist
Various Artists          14
Landy                    12
Eva                      11
Swedish House Mafia       6
Pigloo                    5
Titou Le Lapinou          4
Basto!                    4
Cauet                     4
Feder                     4
Charlotte Aux Fraises     3
T-rio                     3
Sasso                     3
Lynda                     3
Anaklein                  3
Name: MISSING, dtype: int64

In [146]:
# Inspect the rows of one artist that has no scraped information.
df.loc[df["Artist"] == "Landy"]

Unnamed: 0,Artist,Music,Naissance,Pays d'origine,Origine,Nationalité,Pays,MISSING
19228,Landy,Aucune Limite,,,,,,True
19257,Landy,Toi T'es Chelou,,,,,,True
19372,Landy,Enfants Terribles,,,,,,True
19376,Landy,Millions D'euros,,,,,,True
19410,Landy,Médusa,,,,,,True
19412,Landy,Ma Werss,,,,,,True
19436,Landy,V12,,,,,,True
19443,Landy,Prends Ta Paye,,,,,,True
19448,Landy,Filon,,,,,,True
19450,Landy,Le Même,,,,,,True


In [144]:
# Spot check: which page title does the lookup choose for this artist name?
find_title_in_wikipedia("Christine And The Queens")

'Redcar (artiste)'

In [145]:
# Raw search results for the same query, to compare with the title chosen above.
wikipedia.search("Christine And The Queens", results=10)

['Redcar (artiste)',
 'Russella',
 'Redcar',
 'Léo Walk',
 'Discographie de Redcar',
 'Chaleur humaine',
 'Chris (album)',
 'Christine (chanson)',
 'Pansexualité',
 'Les Paradis perdus (chanson)']