In [2]:
from os import listdir
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
import numpy as np
from tqdm import tqdm
import wikipedia
from nltk.metrics.distance import edit_distance

In [3]:
# Sentinel stored wherever a Wikipedia lookup yields no value (empty string).
MISSING = ""

# Loading local data

In [4]:
# Load the previously scraped charts from disk so the website does not have to
# be scraped again on every run. Chart files are named "<year>_<month>.csv".
files = listdir("../chart")

dfs_chart = {}
for f in sorted(files):  # sorted -> same loading order on every machine
    # Skip anything that is not a chart export (e.g. ".DS_Store" or
    # ".ipynb_checkpoints"); such entries used to crash the parsing below.
    if not f.endswith(".csv") or "_" not in f:
        continue

    year, month_part = f.split("_")[:2]
    month = month_part.split(".")[0]

    # Nested mapping: dfs_chart[year][month] -> DataFrame of that chart
    dfs_chart.setdefault(year, {})[month] = pd.read_csv("../chart/" + f)

In [5]:
# Preview the first rows of the January 2006 chart.
dfs_chart["2006"]["Janvier"].head(3)

Unnamed: 0,Rank,Artist,Music,Artist_wiki,Naissance,Pays d'origine,Origine,Nationalité,Pays
0,1,Juanes,La Camisa Negra,Juanes,"9 août 1972 (50 ans)Medellin, Colombie",,,,
1,2,Madonna,Hung Up,Madonna,"16 août 1958 (64 ans)Bay City, Michigan (États...",,,Américaine,
2,3,Johnny Hallyday,Mon Plus Beau Noël,Johnny Hallyday,15 juin 1943 Paris 9e (France),,,Française,


In [6]:
# Load the artist table written to ../artist.csv by a previous run.
artist = pd.read_csv("../artist.csv")

In [7]:
# Display the artist table.
artist

Unnamed: 0,Artist,Artist_wiki,Naissance,Pays d'origine,Origine,Nationalité,Pays
0,Anuel Aa,Anuel AA,26 novembre 1992 (30 ans)Carolina (Porto Rico),,,portoricaine,
1,Childish Gambino,This Is America,,,,,
2,Tydiaz,Foire de Châlons,,,,,
3,Lucenzo,Lucenzo,"27 mai 1983 (39 ans)Bordeaux, France",,,,
4,Chris Garcia,Dany Garcia,29 novembre 1968 (54 ans)Belleville,,,américaine,
...,...,...,...,...,...,...,...
2267,Radio Killer,Radio Killer,,,,,
2268,Star Academy Maghreb,Star Academy (Maghreb),,,,,
2269,Mr. Oizo,Nonfilm,,,,,france
2270,Cee-lo Green,Cee Lo Green,"30 mai 1974 (48 ans)[1],[2]Atlanta, Géorgie, ...",,,,


# Suppression des dernières colonnes

In [76]:
# Drop the Wikipedia-derived columns so the Wikipedia scraping can be re-run.
wiki_cols = ["Naissance", "Pays d'origine", "Origine", "Pays", "Nationalité", "Artist_wiki"]
for k1 in dfs_chart.keys():
    for k2 in dfs_chart[k1].keys():
        # errors="ignore" keeps this cell idempotent: running it twice, or on
        # charts that never had these columns, is a no-op instead of a KeyError.
        dfs_chart[k1][k2] = dfs_chart[k1][k2].drop(columns=wiki_cols, errors="ignore")

In [78]:
# Preview the January 2006 chart again to check which columns remain.
dfs_chart["2006"]["Janvier"].head(3)

Unnamed: 0,Rank,Artist,Music
0,1,Juanes,La Camisa Negra
1,2,Madonna,Hung Up
2,3,Johnny Hallyday,Mon Plus Beau Noël


In [8]:
# Remove the scraped birth/origin columns from the artist table.
# errors="ignore" makes the cell safe to re-run once the columns are gone.
artist = artist.drop(
    columns=["Naissance", "Pays d'origine", "Origine", "Pays", "Nationalité"],
    errors="ignore",
)
artist.head(3)

Unnamed: 0,Artist,Artist_wiki
0,Anuel Aa,Anuel AA
1,Childish Gambino,This Is America
2,Tydiaz,Foire de Châlons


In [12]:
# Remove the Wikipedia page-title column; errors="ignore" keeps the cell
# re-runnable when the column has already been dropped.
artist = artist.drop(columns=["Artist_wiki"], errors="ignore")

# Scraping chart

In [72]:
# Years to scrape: 2004 to 2022 inclusive.
years = np.arange(2004, 2023)

# Twelve week numbers spread evenly over 1..52 (fractions truncated), one per
# entry of `mois`, in the same order.
weeks = np.linspace(1, 52, 12).astype(int).tolist()

# Month labels (unaccented French).
mois = [
    "Janvier", "Fevrier", "Mars", "Avril",
    "Mai", "Juin", "Juillet", "Aout",
    "Septembre", "Octobre", "Novembre", "Decembre",
]

In [73]:
# Scrape one chart per (year, month) from acharts.co. weeks[k] is the week
# number whose chart is stored under month mois[k].
dfs_chart = {y : {} for y in years}
for y in tqdm(years):
    for m, w in zip(mois, weeks):
        url = f"https://acharts.co/france_singles_top_100/{y}/{w}"
        # Without a timeout a single stalled connection hangs the whole run.
        rq = get(url, timeout=30)
        if not rq.ok:
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(rq.text, "html.parser")

        chart = soup.find("table", {"id" : "ChartTable"})
        if chart is None:
            # The page was served but contains no chart table: nothing to store.
            continue

        data = []
        for tr in chart.find_all("tr")[1:]:  # skip the first row (presumably the header)
            music = tr.find("span", {"itemprop" : "name"})
            position = tr.find("span", {"itemprop" : "position"})
            by_artist = tr.find("span", {"itemprop" : "byArtist"})
            if music is None or position is None or by_artist is None:
                # Not a chart entry row; skip it rather than raise AttributeError.
                continue
            # [2:-1] drops the first two characters and the last one around the
            # artist name (presumably separator/whitespace in the page markup).
            data.append([position.text, by_artist.text[2:-1], music.text])

        dfs_chart[y][m] = pd.DataFrame(data, columns=["Rank", "Artist", "Music"])

  0%|          | 0/19 [00:08<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Manual corrections: (artist name as it appears in the charts, replacement
# name). The replacements are presumably chosen to match French Wikipedia page
# titles for the lookup done later in the notebook.
names = [
    ("Djadja", "Djadja et Dinaz"),
    ("Lorie", "Lorie Pester"),
    ("-M-", "Matthieu Chedid"),
    ("Priscilla", "Priscilla Betti"),
    ("I Am", "IAM"),
    ("Sofiane", "Sofiane Zermani"),
    ("Justice", "Justice (groupe)"),  # closing parenthesis was missing
    ("Soma Riba", "Collectif Métissé"),
    ("Fresh", "Fresh la Peufra"),
    ("Rosalia", "Rosalía"),
    ("Italo Brothers", "ItaloBrothers"),
    ("Far*east Movement", "Far East Movement"),
    ("Odyssey", "Odyssey (groupe)"),
    ("1789", "1789 : Les Amants de la Bastille"),
    ("Clemence", "Clémence Saint-Preux"),
    ("C\x9cUr De Pirate", "Cœur de pirate"),  # key reproduces the mis-encoded scraped text
    ("Rose", "Rose (chanteuse)"),
    ("Laeti", "Laetitia Kerfa"),
    ("La Troupe", "Mozart, l'opéra rock"),
    ("Victoria", "Victoria Sio"),
]

artist_fixes = dict(names)
for k1 in dfs_chart.keys():
    for k2 in dfs_chart[k1].keys():
        chart_df = dfs_chart[k1][k2]
        # Restrict the replacement to the "Artist" column: a frame-wide replace
        # would also rename songs whose title equals one of the keys
        # (e.g. a track called "Rose" or "Victoria").
        chart_df["Artist"] = chart_df["Artist"].replace(artist_fixes)

# Scraping birth

In [48]:
def find_title_in_wikipedia(title):
    """Return the French Wikipedia page title that best matches an artist name.

    The name is searched on fr.wikipedia.org (10 results). A result carrying a
    music-related qualifier such as "(chanteur)" or "(groupe)" is returned as
    soon as its name without the qualifier is close enough to ``title``.
    Otherwise the result with the smallest edit distance to ``title`` is
    returned, provided it is close enough.

    "Close enough" means: edit distance divided by ``len(title)`` is below
    (qualified results) or at most (fallback) 0.7.

    Parameters
    ----------
    title : str
        Artist name to look up.

    Returns
    -------
    str
        The matching page title, or ``MISSING`` when ``title`` is empty or not
        a string, when the search returns nothing, or when no result is close
        enough.
    """
    pourcentage = 0.7  # maximum accepted edit distance, relative to len(title)
    words = ["(chanteur)", "(chanteuse)", "(groupe)", "(rappeur)", "(rappeuse)", "(musicien)", "(chanteur français)", "(france)", "(producteur)", "(artiste)", "(groupe de musique)"]

    # Nothing to search for (empty string, NaN read back from a CSV, ...).
    # Also protects the divisions by len(title) below.
    if not isinstance(title, str) or not title.strip():
        return MISSING

    wikipedia.set_lang("fr")
    results = wikipedia.search(title, results=10)

    wanted = title.lower()
    distance = []
    for element in results:
        candidate = element.lower()
        for w in words:
            if w in candidate:
                # Remove the qualifier itself. str.rstrip(w) strips any trailing
                # *characters* contained in w, so qualifiers containing a space
                # ate into the name ("amir (chanteur français)" -> "am").
                base = candidate.replace(w, "").strip()
                if edit_distance(base, wanted) / len(title) < pourcentage:
                    return element

        distance.append(edit_distance(wanted, candidate))

    # No search result: return MISSING explicitly instead of an implicit None,
    # which callers would not recognise as "no page".
    if not distance:
        return MISSING

    best = int(np.argmin(distance))
    return results[best] if distance[best] / len(title) <= pourcentage else MISSING

In [15]:
def wiki_birth(title):
    """Scrape birth/origin facts for one page of the French Wikipedia.

    Every table row of the page whose header cell (``<th>``) contains one of
    the tracked labels contributes the stripped text of its first ``<td>``.
    When no label yields a value, the first sentence of the page summary is
    scanned for a known nationality word as a fallback.

    Parameters
    ----------
    title : str
        Wikipedia page title. ``MISSING`` or a non-string value (None, NaN)
        yields the empty result without any network access.

    Returns
    -------
    dict
        Keys "Naissance", "Pays d'origine", "Origine", "Nationalité", "Pays".
        Each value is the scraped text, or ``MISSING`` when nothing was found,
        the page could not be fetched, or the request failed.
    """
    from urllib.parse import quote

    from requests import RequestException

    cols = ["Naissance", "Pays d'origine", "Origine", "Nationalité", "Pays"]
    nats = ["franco", "français", "belge", "canadien", "libanais", "réunionnais"]
    dic = {w : MISSING for w in cols}

    if not isinstance(title, str) or title == MISSING:
        return dic

    # Percent-encode the title so characters such as "?" or "#" stay part of
    # the page name instead of being read as a query string or fragment.
    url = f"https://fr.wikipedia.org/wiki/{quote(title)}"
    try:
        rq = get(url, timeout=30)
    except RequestException:
        # Same outcome as an HTTP error below: no data for this artist, but
        # the surrounding bulk scrape keeps going.
        return dic

    if not rq.ok:
        return dic

    soup = BeautifulSoup(rq.text, "html.parser")

    for table in soup.find_all("table"):
        for tr in table.find_all("tr"):
            th = tr.find("th")
            if th is None:
                continue
            td = tr.find("td")
            if td is None:
                continue
            for w in cols:
                # NOTE(review): substring test, so a "Pays d'origine" header
                # also fills the "Pays" entry.
                if w in th.text:
                    dic[w] = td.text.strip()

    # Something was scraped: done. (The original compared against " " while
    # MISSING is "", so the fallback below could never run.)
    if any(value != MISSING for value in dic.values()):
        return dic

    wikipedia.set_lang("fr")
    try:
        summary = wikipedia.summary(title, sentences=1)
    except Exception:
        # Best effort only (missing/ambiguous page, network error). Unlike a
        # bare `except`, this lets KeyboardInterrupt stop a long scrape.
        return dic

    for w in nats:
        if w in summary:
            dic["Nationalité"] = w
            break

    return dic

In [16]:
# Collect every artist name that appears in any chart, without duplicates.
all_names = []
for k1 in dfs_chart.keys():
    for k2 in dfs_chart[k1].keys():
        all_names += dfs_chart[k1][k2]["Artist"].tolist()

# dict.fromkeys de-duplicates like set() but keeps first-seen order, so the
# artist table (and the CSV written from it) gets a reproducible row order.
artist = pd.DataFrame(list(dict.fromkeys(all_names)), columns=["Artist"])

In [17]:
# Keep only the main artist of a collaboration ("A X B" -> "A").
artist["Artist"] = artist["Artist"].str.split(" X ").str[0]

# Truncating can recreate names that are already listed on their own. Drop
# those repeats so each artist is looked up once and stays a unique key, and
# rebuild a clean 0..n-1 index for the later index-based merge with the birth
# table.
artist = artist.drop_duplicates(subset="Artist").reset_index(drop=True)

# Find the Wikipedia page title of each artist.
artist["Artist_wiki"] = artist["Artist"].apply(find_title_in_wikipedia)

In [18]:
birth_dic = artist["Artist_wiki"].apply(wiki_birth) # Look up the birth information on each artist's Wikipedia page (one dict per artist)

In [19]:
# Turn the per-artist dicts found on Wikipedia into one DataFrame: one row per
# artist, one column per dict key. Built in a single call instead of one
# DataFrame per row plus a concat, and indexed like `birth_dic` so the rows
# line up with `artist` in the index-based merge that follows.
birth = pd.DataFrame(birth_dic.tolist(), index=birth_dic.index)

In [20]:
# Attach the scraped birth/origin columns to the artist table (rows are matched
# by index), then index the table by artist name.
artist = artist.merge(birth, left_index=True, right_index=True)
# Names can repeat once collaborations are truncated to their main artist; a
# duplicated key would multiply chart rows in the left merge below.
artist = artist.drop_duplicates(subset="Artist").set_index("Artist")

# Add the artist columns to every chart. Charts list collaborations as
# "A X B" while the artist table only holds "A", so match on the main artist;
# the chart's own "Artist" column is left unchanged.
artist_lookup = artist.reset_index().rename(columns={"Artist": "_main_artist"})
for k1 in dfs_chart.keys():
    for k2 in dfs_chart[k1].keys():
        chart_df = dfs_chart[k1][k2]
        main_artist = chart_df["Artist"].str.split(" X ").str[0]
        dfs_chart[k1][k2] = (
            chart_df.assign(_main_artist=main_artist)
            .merge(artist_lookup, on="_main_artist", how="left")
            .drop(columns="_main_artist")
        )

# CLEANING

In [21]:
# Blank out every value of 50 characters or more in the artist table, except
# in the "Naissance" and "Artist_wiki" columns.
# NOTE(review): the charts were merged with `artist` before this point, so this
# clean-up is not reflected in dfs_chart.
max_len = 50
cols = [c for c in artist.columns if c not in ("Naissance", "Artist_wiki")]

for col in cols:
    # Non-string cells (e.g. NaN) become MISSING instead of raising in len().
    artist[col] = artist[col].map(
        lambda x: x if isinstance(x, str) and len(x) < max_len else MISSING
    )

In [22]:
def cleanning(data, replace_words):
    """Lower-case a text column and collapse matching values to canonical labels.

    Parameters
    ----------
    data : pandas.Series
        Text values. The input Series is not modified.
    replace_words : iterable of (pattern, label)
        Applied in order. Every value that contains ``pattern`` is replaced
        *entirely* by ``label``. ``pattern`` is handed to
        ``Series.str.contains``, which interprets it as a regular expression
        by default.

    Returns
    -------
    pandas.Series
        A new Series with the same index: lower-cased values, with matching
        ones replaced by their label. Missing values stay missing.
    """
    cleaned = data.str.lower()  # .str.lower() already returns a new Series
    for pattern, label in replace_words:
        # na=False: a missing value never matches. Without it the mask holds
        # NaN and boolean indexing raises.
        hits = cleaned.str.contains(pattern, na=False)
        cleaned.loc[hits] = label

    return cleaned

In [23]:
# Substring -> canonical (feminine) nationality label. Order matters: the
# pairs are applied one after the other, in the order written here.
_nationality_labels = {
    "français": "française",
    "franco": "française",
    "canadien": "canadienne",
    "américain": "américaine",
    "états-unis": "américaine",
    "algérien": "algérienne",
    "brésil": "brésilienne",
    "france": "française",
    "marocain": "marocaine",
    "anglais": "britannique",
    "royaume-unis": "britannique",
    "britannique": "britannique",
    "espagn": "espagnole",
    "italie": "italienne",
    "japon": "japonaise",
    "corée": "coréenne",
    "israél": "israélienne",
    "royaume-uni": "britannique",
    "suède": "suédoise",
    "allemand": "allemande",
    "dominicain": "dominicaine",
}
replace_words = list(_nationality_labels.items())

# Normalise both nationality-like columns with the same rules.
for _col in ("Nationalité", "Origine"):
    artist[_col] = cleanning(artist[_col], replace_words)

In [24]:
# Substring -> canonical country label. Any value that mentions a country is
# reduced to that country's label; pairs are applied in the order written.
_country_labels = {
    "france": "france",
    "états-unis": "états-unis",
    "royaume-uni": "royaume-unis",
    "allemagne": "allemagne",
    "angleterre": "angleterre",
    "canada": "canada",
    "australie": "australie",
    "pays-bas": "pays-bas",
    "ghana": "ghana",
    "corée": "corée",
    "autriche": "autriche",
    "suède": "suède",
    "italie": "italie",
    "biélorussie": "biélorussie",
    "mauritanie": "mauritanie",
}
replace_words = list(_country_labels.items())

# Normalise both country columns with the same rules.
for _col in ("Pays", "Pays d'origine"):
    artist[_col] = cleanning(artist[_col], replace_words)

# Birthday case

# Export

In [27]:
# Write every chart to ../chart/<year>_<month>.csv, without the row index.
for k1, monthly_charts in dfs_chart.items():
    for k2, chart_df in monthly_charts.items():
        target = f"../chart/{k1}_{k2}.csv"
        chart_df.to_csv(target, encoding="utf-8-sig", index = False)

In [28]:
# Save the artist table; the index is written too since index=False is not passed.
artist.to_csv("../artist.csv", encoding="utf-8-sig")