# Loading local data

In [1]:
from os import listdir
import pandas as pd

In [3]:
# Load every "<year>_<month>.csv" chart export found in ../chart into a nested
# dict: dfs_chart[year][month] -> DataFrame.
# sorted(): listdir() returns entries in arbitrary order; sorting makes the
# load order (and therefore the dict order) reproducible between runs.
files = sorted(listdir("../chart"))

dfs_chart = {}
for f in files:
    # Skip anything that is not a chart export (e.g. .ipynb_checkpoints or
    # .DS_Store); int() below would otherwise raise ValueError on it.
    if not f.endswith(".csv") or "_" not in f:
        continue

    year_part = f.split("_")[0]
    if not year_part.isdigit():
        continue

    year = int(year_part)
    month = f.split("_")[1].split(".")[0]

    dfs_chart.setdefault(year, {})[month] = pd.read_csv("../chart/" + f)

# Suppression des dernières colonnes

In [3]:
# Strip the origin-related columns from every loaded chart; the join on
# `artist` later in the notebook adds columns with these same names again.
birth_columns = ["Naissance", "Pays d'origine", "Origine", "Pays", "Nationalité"]
for monthly_charts in dfs_chart.values():
    for month, chart in monthly_charts.items():
        # errors="ignore" keeps the cell re-runnable and lets it work on charts
        # that never had these columns, instead of raising KeyError.
        monthly_charts[month] = chart.drop(columns=birth_columns, errors="ignore")

# Scraping chart

In [4]:
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
import numpy as np
from tqdm import tqdm

In [33]:
# Chart years to scrape: 2004 through 2022 inclusive.
years = np.arange(2004, 2023)
# Twelve week numbers spread evenly between week 1 and week 52 (truncated to
# integers); weeks[i] is the week scraped for the month label mois[i].
weeks = np.linspace(1, 52, 12).astype(int).tolist()
# Month labels (French, without accents) used as dictionary keys and in file names.
mois = [
    "Janvier", "Fevrier", "Mars", "Avril", "Mai", "Juin",
    "Juillet", "Aout", "Septembre", "Octobre", "Novembre", "Decembre",
]

In [34]:
# Scrape one French singles top-100 chart per (year, month label) from
# acharts.co into dfs_chart[year][month] -> DataFrame(Rank, Artist, Music).
dfs_chart = {y : {} for y in years}
for y in tqdm(years):
    # weeks[k] is the chart week fetched for the month label mois[k].
    for k, w in enumerate(weeks):
        url = f"https://acharts.co/france_singles_top_100/{y}/{w}"
        # timeout: without one, a stalled connection blocks the loop forever.
        rq = get(url, timeout=30)
        if not rq.ok:
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed (and to avoid bs4's warning).
        soup = BeautifulSoup(rq.text, "html.parser")

        chart = soup.find("table", {"id" : "ChartTable"})
        if chart is None:
            # Page served without a chart table: nothing to record for this week.
            continue

        data = []
        # [1:] skips the first row of the table (presumably the header row).
        for tr in chart.find_all("tr")[1:]:
            music_span = tr.find("span", {"itemprop" : "name"})
            rank_span = tr.find("span", {"itemprop" : "position"})
            artist_span = tr.find("span", {"itemprop" : "byArtist"})
            if music_span is None or rank_span is None or artist_span is None:
                # Row without the expected chart markup; skip it instead of
                # failing with AttributeError on `.text`.
                continue

            # [2:-1] drops the first two and the last character of the artist
            # text -- NOTE(review): presumably a separator prefix and trailing
            # whitespace; confirm against the page markup.
            data.append([rank_span.text, artist_span.text[2:-1], music_span.text])

        dfs_chart[y][mois[k]] = pd.DataFrame(data, columns=["Rank", "Artist", "Music"])

100%|██████████| 19/19 [02:54<00:00,  9.20s/it]


In [5]:
# Chart spelling -> spelling used for the Wikipedia lookup.
# NOTE(review): the replacement values look like French Wikipedia page titles; confirm.
artist_aliases = {
    "Djadja": "Djadja et Dinaz",
    "Lorie": "Lorie Pester",
    "-M-": "Matthieu Chedid",
    "Priscilla": "Priscilla Betti",
    "Brigitte": "Brigitte (groupe)",
    "I Am": "IAM",
}

for monthly_charts in dfs_chart.values():
    for chart in monthly_charts.values():
        # Only the Artist column is rewritten: a frame-wide replace would also
        # rename songs whose title equals one of the keys (e.g. a track named "Djadja").
        chart["Artist"] = chart["Artist"].replace(artist_aliases)

# Scraping birth

In [6]:
import wikipedia
from nltk.metrics.distance import edit_distance

In [7]:
# Sentinel stored wherever a Wikipedia title or origin field could not be found.
MISSING = " "

In [28]:
def find_title_in_wikipedia(title):
    """Pick the French Wikipedia page title that best matches ``title``.

    The first ten search results are considered. The first result whose title
    contains a music-related qualifier such as "(chanteur)" or "(groupe)" wins;
    if none does, the result with the smallest edit distance to ``title`` is
    returned (the earliest one on ties).

    Args:
        title: Artist name to search for.

    Returns:
        The chosen page title, or ``MISSING`` when the search returns nothing.
    """
    qualifiers = (
        "(chanteur)",
        "(chanteuse)",
        "(groupe)",
        "(rappeur)",
        "(rappeuse)",
        "(musicien)",
        "(chanteur français)",
    )

    wikipedia.set_lang("fr")
    candidates = wikipedia.search(title, results=10)
    if not candidates:
        return MISSING

    # A qualified page title takes priority over plain textual closeness.
    for candidate in candidates:
        if any(qualifier in candidate for qualifier in qualifiers):
            return candidate

    # min() keeps the first of several equally close candidates.
    return min(candidates, key=lambda candidate: edit_distance(title, candidate))

In [29]:
def list_to_string(l):
    """Concatenate the strings in ``l`` into one string separated by single spaces."""
    separator = " "
    return separator.join(l)

In [46]:
def wiki_birth(title):
    """Collect origin-related fields for a French Wikipedia page.

    The page is downloaded and every table row whose header cell contains one
    of the tracked labels contributes the text of its first data cell. When no
    field could be filled that way, the first sentence of the page summary is
    searched for a nationality keyword, which is then stored under
    "Nationalité".

    Args:
        title: Wikipedia page title, or ``MISSING`` when no page was found.

    Returns:
        dict mapping "Naissance", "Pays d'origine", "Origine", "Nationalité"
        and "Pays" to the extracted text; fields that were not found hold
        ``MISSING``.
    """
    from urllib.parse import quote

    cols = ["Naissance", "Pays d'origine", "Origine", "Nationalité", "Pays"]
    nats = ["francophone", "français", "belge", "canadien", "libanais", "réunionnais"]
    dic = {w : MISSING for w in cols}

    if title == MISSING:
        return dic

    # Percent-encode the title so characters such as "?" or "#" stay part of
    # the page path instead of being read as a query string or fragment.
    page = quote(title.replace(" ", "_"))
    url = f"https://fr.wikipedia.org/wiki/{page}"
    # timeout: without one, a stalled connection blocks the whole run.
    rq = get(url, timeout=30)

    if not rq.ok:
        return dic

    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and to avoid bs4's warning).
    soup = BeautifulSoup(rq.text, "html.parser")

    for table in soup.find_all("table"):
        for tr in table.find_all("tr"):
            th = tr.find("th")
            if th is None:
                continue

            for w in cols:
                # Substring match: a header containing "Pays d'origine" also
                # matches "Pays", so both keys receive that cell. A later
                # matching row overwrites an earlier one.
                if w in th.text:
                    td = tr.find("td")
                    if td is not None:
                        dic[w] = td.text.strip()

    if any(value != MISSING for value in dic.values()):
        return dic

    # Nothing found in the tables: fall back to the page summary.
    wikipedia.set_lang("fr")
    try:
        summary = wikipedia.summary(title, sentences=1)
    except Exception:
        # Best effort: a failed summary lookup leaves every field MISSING.
        # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
        return dic

    for w in nats:
        if w in summary:
            dic["Nationalité"] = w
            break

    return dic

In [22]:
# Collect each distinct artist name appearing in any chart, one row per artist.
artist_names = set()
for monthly_charts in dfs_chart.values():
    for chart in monthly_charts.values():
        # dropna(): an empty artist cell is NaN (a float), not a name that can
        # be looked up on Wikipedia.
        artist_names.update(chart["Artist"].dropna())

# sorted(): set iteration order changes between kernel sessions; a fixed order
# keeps the row positions of `artist` reproducible.
artist = pd.DataFrame(sorted(artist_names), columns=["Artist"])

In [23]:
# Resolve each chart artist name to a French Wikipedia page title
# (MISSING when the search returns no result). One search request per artist.
artist["Artist_wiki"] = artist["Artist"].apply(find_title_in_wikipedia)

In [47]:
# One dict of origin fields per artist, extracted from the resolved Wikipedia page.
birth_dic = artist["Artist_wiki"].apply(wiki_birth)



  lis = BeautifulSoup(html).find_all('li')


In [48]:
# One row per artist, one column per origin field returned by wiki_birth.
# Building the frame from the list of dicts in one call avoids creating a
# one-row DataFrame per artist, and also works when birth_dic is empty
# (pd.concat raises ValueError on an empty list).
birth = pd.DataFrame(birth_dic.tolist())

In [49]:
# Attach the origin fields to the artist table by pairing rows on their index.
artist = pd.merge(artist, birth, how="inner", left_index=True, right_index=True)

In [50]:
# The resolved page title is no longer needed; index the table by artist name
# so it can be joined onto the charts.
artist = artist.drop(columns=["Artist_wiki"]).set_index("Artist")

In [51]:
# Add the artist's origin fields to every chart row (left join: chart rows are
# kept even when the artist has no entry in `artist`).
for monthly_charts in dfs_chart.values():
    for month, chart in monthly_charts.items():
        monthly_charts[month] = chart.join(artist, on="Artist", how="left")

# Export

In [52]:
# Write each chart to ../chart/<year>_<month>.csv, without the row index.
for year, monthly_charts in dfs_chart.items():
    for month, chart in monthly_charts.items():
        destination = f"../chart/{year}_{month}.csv"
        chart.to_csv(destination, index=False, encoding="utf-8-sig")

# Test

In [53]:
# Artists for which none of the origin fields could be filled in.
origin_columns = ["Naissance", "Pays d'origine", "Origine", "Nationalité", "Pays"]
missing = artist[(artist[origin_columns] == MISSING).all(axis=1)]

In [54]:
# Stack every monthly chart into one long frame with a fresh 0..n-1 index.
dfs = [
    chart
    for monthly_charts in dfs_chart.values()
    for chart in monthly_charts.values()
]

df = pd.concat(dfs, ignore_index=True)

In [55]:
# Without the rank, a track that appears in several charts collapses to one row.
df = df.drop(columns=["Rank"]).drop_duplicates()

In [56]:
# Flag the rows for which every origin field still holds the MISSING sentinel.
sentinel_columns = ["Naissance", "Pays d'origine", "Origine", "Nationalité", "Pays"]
df["MISSING"] = (df[sentinel_columns] == MISSING).all(axis=1)

In [57]:
# The 20 artists with the most rows lacking any origin information.
df.groupby("Artist")["MISSING"].sum().sort_values(ascending=False).head(20)

Artist
Landy                    12
Eva                      11
Swedish House Mafia       6
Sound Of Legend           6
Justice                   5
Pigloo                    5
Le 6-9                    5
Basto!                    4
Soma Riba                 4
Blue                      4
Glk                       4
Dry                       4
Feder                     4
Cauet                     4
Titou Le Lapinou          4
Kore                      3
Charlotte Aux Fraises     3
Gullia                    3
Far*east Movement         3
Sasso                     3
Name: MISSING, dtype: int64

In [58]:
# Sanity check: after the alias replacement no row should still be credited to "I Am".
df.loc[df["Artist"] == "I Am"]

Unnamed: 0,Artist,Music,Naissance,Pays d'origine,Origine,Nationalité,Pays,MISSING
