In [2]:
import json
import pandas as pd

def clean_data(df: pd.DataFrame, data_dir: str = 'data') -> pd.DataFrame:
    """Build the per-calendar-day Christmas-stream / temperature profile for Oct-Jan.

    Steps:
      1. Flag chart rows as Christmas songs, either because the last path
         segment of ``url`` is listed in ``top_xmas_songs_with_ids.csv``
         (column ``SongID``) or because ``title`` contains "christmas"
         (case-insensitive).
      2. Sum the streams of the flagged rows per date and keep dates with a
         positive total.
      3. Right-join onto the per-day mean ``tavg`` from
         ``aggregated_daily_weather_data.csv`` so every weather day is kept
         (days without Christmas streams get NaN streams), and keep only
         years after 2016.
      4. Average ``tavg`` and ``streams`` over the years for each
         (month, day) pair and keep October-December followed by January.
      5. Write the result to ``xmas_streams_weekly_data_Q4.csv`` and
         ``xmas_streams_weekly_data_Q4.json`` inside ``data_dir``.

    Args:
        df: Chart rows; the columns ``date``, ``url``, ``title`` and
            ``streams`` are read. The frame is not modified.
        data_dir: Directory that holds the two input CSV files and receives
            the two output files. Defaults to ``'data'``.

    Returns:
        DataFrame with columns ``time`` (label formatted ``DD.MM``),
        ``month``, ``day``, ``tavg`` and ``streams``, ordered October ->
        January.
    """
    # Build a fresh frame from only the columns that are used, so the caller's
    # DataFrame is never mutated (no in-place drop / column assignment).
    charts = pd.DataFrame({
        'date': pd.to_datetime(df['date']),
        # The song ID is the last path segment of the URL.
        'ID': df['url'].str.rsplit('/', n=1).str[-1],
        'title': df['title'],
        'streams': df['streams'],
    })

    # A row counts as a Christmas song if its ID is listed OR its title mentions
    # "christmas". na=False keeps the mask strictly boolean when a title is missing.
    xmas_ids = pd.read_csv(f'{data_dir}/top_xmas_songs_with_ids.csv')['SongID']
    listed = charts['ID'].isin(xmas_ids)
    titled = charts['title'].str.lower().str.contains('christmas', na=False, regex=False)
    is_xmas = listed | titled

    # Total Christmas streams per day; days whose total is not positive are dropped.
    xmas_streams = charts.loc[is_xmas].groupby('date')['streams'].sum().reset_index()
    xmas_streams = xmas_streams[xmas_streams['streams'] > 0]

    # Per-day mean temperature (the file may contain several rows per day).
    # Only 'tavg' is needed for the result, so only 'tavg' is aggregated.
    weather = pd.read_csv(f'{data_dir}/aggregated_daily_weather_data.csv')
    weather['time'] = pd.to_datetime(weather['time'])
    weather = weather.groupby('time')['tavg'].mean().reset_index()

    # Right join: keep every weather day, even those without Christmas streams.
    merged = pd.merge(xmas_streams, weather, left_on='date', right_on='time', how='right')
    merged = merged[merged['time'].dt.year > 2016]

    # Collapse the years: one row per (month, day) with the mean over all years.
    # assign() returns a new frame, so no columns are set on a filtered slice.
    profile = (
        merged.assign(month=merged['time'].dt.month, day=merged['time'].dt.day)
        .groupby(['month', 'day'])[['tavg', 'streams']]
        .mean()
        .reset_index()
    )

    # Human-readable label in DD.MM form, placed first to keep the column order
    # time, month, day, tavg, streams.
    day_label = profile['day'].astype(str).str.zfill(2)
    month_label = profile['month'].astype(str).str.zfill(2)
    profile.insert(0, 'time', day_label + '.' + month_label)

    # Season order: Oct-Dec first, then January. The groupby output is already
    # sorted by (month, day), so each part is in calendar order.
    season = pd.concat([profile[profile['month'] > 9], profile[profile['month'] < 2]])

    # Export. The file names are kept exactly as before (they say "weekly"
    # although the rows are per calendar day).
    season.to_csv(f'{data_dir}/xmas_streams_weekly_data_Q4.csv', index=False)

    # Replace missing values by None explicitly so the JSON file contains
    # `null` instead of the bare `NaN` token, which is not valid JSON.
    json_ready = {
        column: [None if pd.isna(value) else value for value in season[column].tolist()]
        for column in season.columns
    }
    with open(f'{data_dir}/xmas_streams_weekly_data_Q4.json', 'w') as f:
        json.dump(json_ready, f)

    return season

# Load the raw daily chart rows from the project's relative 'data/' directory
# (the same folder clean_data reads its inputs from) instead of an absolute,
# machine-specific drive path, so the notebook can be re-run on another machine.
# NOTE(review): assumes the notebook's working directory is the project root — confirm.
df = pd.read_csv('data/daily_charts.csv')

# Pass a copy so the raw frame stays available unchanged for later cells.
df_clean = clean_data(df.copy())
df_clean.head()

Unnamed: 0,time,month,day,tavg,streams
274,1.1,10,1,21.93253,2627.0
275,2.1,10,2,21.8544,
276,3.1,10,3,21.769454,
277,4.1,10,4,21.677555,
278,5.1,10,5,21.640234,
