In [1]:
import json
import pandas as pd
import pycountry

def clean_data(
    df,
    xmas_songs_path='data/top_xmas_songs_with_ids.csv',
    weather_path='data/aggregated_weekly_weather_data_with_weeks.csv',
    output_stem='data/xmas_streams_weekly_data_Q4',
):
    """Build and export the weekly Christmas-streams vs. temperature table.

    Pipeline:
      1. Flag chart rows whose song ID (last segment of 'url') appears in the
         'SongID' column of the Christmas-song list.
      2. Sum the flagged rows' 'streams' per day, then per week ending on
         Thursday ('W-THU'); weeks with no positive total are discarded.
      3. Average 'tavg' of the weather file per ('time', 'calendar_week').
      4. Right-join the weekly streams onto the weather weeks, so every
         weather week is kept even when it has no streams (streams = NaN).
      5. Average 'tavg' and 'streams' per calendar week across all years and
         keep calendar weeks greater than 39.
      6. Write the result to '<output_stem>.csv' and '<output_stem>.json'.

    Parameters
    ----------
    df : pandas.DataFrame
        Chart rows. Only the 'date', 'url' and 'streams' columns are read.
        The frame is not modified. 'region' is intentionally not used:
        streams are summed over all rows of a day regardless of region.
    xmas_songs_path : str
        CSV file with a 'SongID' column.
    weather_path : str
        CSV file from which 'time', 'calendar_week' and 'tavg' are read.
    output_stem : str
        Output path without extension; '.csv' and '.json' are appended.

    Returns
    -------
    pandas.DataFrame
        Columns 'calendar_week', 'tavg', 'streams', one row per calendar week
        above 39. 'streams' is NaN for weeks without any flagged streams.
    """
    # The song ID is the last path segment of the track URL.
    song_ids = df['url'].str.rsplit('/', n=1).str[-1]
    xmas_ids = pd.read_csv(xmas_songs_path)['SongID']
    is_xmas = song_ids.isin(xmas_ids)

    # Daily totals of the flagged songs, then weekly totals (weeks end on Thursday).
    dates = pd.to_datetime(df['date'])
    daily = df.loc[is_xmas, 'streams'].groupby(dates[is_xmas]).sum()
    weekly = daily.resample('W-THU').sum()
    # resample() creates empty weeks with a total of 0; drop them so they do
    # not pull the per-calendar-week mean towards zero.
    weekly = weekly[weekly > 0].reset_index()

    # One average temperature per weather week.
    weather = pd.read_csv(weather_path)
    weather['time'] = pd.to_datetime(weather['time'])
    weather = (
        weather.groupby(['time', 'calendar_week'])
        .agg({'tavg': 'mean'})
        .reset_index()
    )
    weather['year'] = weather['time'].dt.year

    # how='right': the weather weeks drive the result, streams may be missing.
    merged = pd.merge(weekly, weather, left_on='date', right_on='time', how='right')
    merged = merged.sort_values(by=['year', 'calendar_week'])

    # Average over all years per calendar week, then keep weeks above 39.
    result = (
        merged.groupby('calendar_week')
        .agg({'tavg': 'mean', 'streams': 'mean'})
        .reset_index()
    )
    result = result[result['calendar_week'] > 39]

    # Export.
    result.to_csv(output_stem + '.csv', index=False)
    # NaN is not valid JSON. Replacing it via DataFrame.where() on float
    # columns does not stick (None is coerced back to NaN), so build the
    # column lists by hand and map missing values to None (JSON null).
    data_dict = {
        column: [None if pd.isna(value) else value for value in result[column].tolist()]
        for column in result.columns
    }
    with open(output_stem + '.json', 'w') as f:
        json.dump(data_dict, f)
    return result

# Raw daily chart export; absolute path on the author's machine.
DAILY_CHARTS_CSV = r'f:\Daten\Privat\Projekte\#01_Active_Projects\14_DJ_WS2024_Music-Scraper\daily_charts.csv'
df = pd.read_csv(DAILY_CHARTS_CSV)

# Run the pipeline on a copy so the raw frame `df` stays available unchanged.
df_clean = clean_data(df.copy())
df_clean.head()

Unnamed: 0,calendar_week,tavg,streams
39,40,22.055766,
40,41,21.60248,24489.0
41,42,21.245658,23276.0
42,43,20.828713,53135.33
43,44,20.313138,1680446.0
