In [2]:
import json
import pandas as pd
import pycountry

def clean_data(
    df,
    xmas_songs_csv='data/top_xmas_songs_with_ids.csv',
    output_csv='data/christmas_songs_ranked.csv',
    output_json='data/christmas_songs_ranked.json',
):
    """Rank the Christmas songs of a daily-charts frame by total streams.

    A row counts as a Christmas song when the last path segment of its
    ``url`` appears in the ``SongID`` column of ``xmas_songs_csv``, or when
    its ``title`` contains "christmas" (case-insensitive). Streams of the
    matching rows are summed per (ID, title, artist), sorted in descending
    order and numbered from 1.

    Only the ``url``, ``title``, ``artist`` and ``streams`` columns are read;
    any other column is ignored because the aggregation keeps none of them.
    The input frame itself is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Chart rows with at least ``url``, ``title``, ``artist``, ``streams``.
    xmas_songs_csv : str, optional
        CSV file with a ``SongID`` column listing known Christmas song IDs.
    output_csv : str, optional
        Path the ranking is written to as CSV (without the index).
    output_json : str, optional
        Path the ranking is written to as JSON (column name -> list of values).

    Returns
    -------
    pandas.DataFrame
        Columns ``rank``, ``ID``, ``title``, ``artist``, ``streams``, ordered
        by ``streams`` descending.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df`` or ``SongID``
        is missing from the reference CSV.
    """
    # Selecting the needed columns and copying yields an independent frame,
    # so nothing below can leak back into the caller's DataFrame.
    songs = df[['url', 'title', 'artist', 'streams']].copy()

    # The song ID is the last path segment of the URL. The .str accessor
    # propagates missing URLs as NaN instead of raising on a non-string value.
    songs['ID'] = songs['url'].str.rsplit('/', n=1).str[-1]

    known_ids = pd.read_csv(xmas_songs_csv)['SongID']
    in_reference_list = songs['ID'].isin(known_ids)
    # na=False makes rows without a title an explicit "no match".
    title_mentions_christmas = (
        songs['title'].str.lower().str.contains('christmas', regex=False, na=False)
    )
    xmas_rows = songs[in_reference_list | title_mentions_christmas]

    ranked = (
        xmas_rows.groupby(['ID', 'title', 'artist'], as_index=False)['streams']
        .sum()
        # A stable sort keeps tied songs in the (deterministic) groupby order.
        .sort_values(by='streams', ascending=False, kind='stable')
        .reset_index(drop=True)
    )
    # 1-based rank as the leading column.
    ranked.insert(0, 'rank', range(1, len(ranked) + 1))

    ranked.to_csv(output_csv, index=False)
    with open(output_json, 'w') as f:
        json.dump(ranked.to_dict('list'), f)

    return ranked

# Source of the raw daily charts: an absolute local Windows path.
# NOTE(review): consider a path relative to the project's data directory so the
# notebook can run on other machines.
DAILY_CHARTS_CSV = r'f:\Daten\Privat\Projekte\#01_Active_Projects\14_DJ_WS2024_Music-Scraper\daily_charts.csv'
df = pd.read_csv(DAILY_CHARTS_CSV)

# Hand clean_data a copy so the raw frame `df` stays available unchanged,
# then preview the first rows of the ranking.
df_clean = clean_data(df.copy())
df_clean.head()

Unnamed: 0,rank,ID,title,artist,streams
0,1,0bYg9bo50gSsH3LtXe2SQn,All I Want for Christmas Is You,Mariah Carey,1447467000.0
1,2,2FRnf9qhLbvw8fu4IBXx78,Last Christmas,Wham!,1091586000.0
2,3,0lLdorYw7lVrJydTINhWdI,It's Beginning to Look a Lot like Christmas,Michael Bublé,769008000.0
3,4,0lizgQ7Qw35od7CYaoMBZb,Santa Tell Me,Ariana Grande,736516700.0
4,5,2EjXfH91m7f8HiJN1yQg97,Rockin' Around The Christmas Tree,Brenda Lee,649636400.0
