Packages

In [1]:
import pandas as pd
import ast
import sklearn
import category_encoders as ce
import numpy as np
# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays.
sklearn.set_config(transform_output="pandas")

Data inladen in dataframes


In [2]:
# Read each semicolon-separated CSV export into its own DataFrame.
# The file names suggest one file per news brand; paths are relative to the
# notebook's working directory.
brabant_df = pd.read_csv(r'uva-relevance-windows-brabantsdagblad.csv', sep=';')
volkskrant_df = pd.read_csv(r'uva-relevance-windows-volkskrant.csv', sep=';')
trouw_df = pd.read_csv(r'uva-relevance-windows-trouw.csv', sep=';')
parool_df = pd.read_csv(r'uva-relevance-windows-parool.csv', sep=';')
nu_df = pd.read_csv(r'uva-relevance-windows-nu.csv', sep=';')
destentor_df = pd.read_csv(r'uva-relevance-windows-destentor.csv', sep=';')
ad_df = pd.read_csv(r'uva-relevance-windows-ad.csv', sep=';')

Kolomnamen
- 'article_id'
- 'brand'
- 'publication_timestamp'
- 'publication_weekday'
- 'title'
- 'first_paragraph'
- 'full_body'
- 'num_words'
- 'num_paragraphs'
- 'author_ids'
- 'labels'
- 'main_section'
- 'subsections'
- 'user_needs'
- 'lda_topics'
- 'iptc_topics_level_0'
- 'iptc_topics_level_1'
- 'time_indications_mentioned'
- 'relevance_window_in_hours'

In [3]:
def split_dictionary_column(df: pd.DataFrame, column_name: str) -> None:
    """Expand a column of dictionaries into one column per key, in place.

    Every cell of ``column_name`` is expected to hold a dictionary, either as
    a real ``dict`` or as its string representation (parsed with
    ``ast.literal_eval``). Missing cells (``None``/NaN/``pd.NA``) are treated
    as empty dictionaries. For each key that occurs in at least one row a new
    column is added holding that key's value, or ``0`` for rows without it.
    The source column is removed afterwards.

    New columns are created in the order in which the keys are first seen
    when reading the rows top to bottom, so the resulting column order is the
    same on every run.

    If a column named like the key already exists, the new column is named
    ``'<key>_'`` instead.

    Args:
        df: Frame to modify in place.
        column_name: Name of the column holding the dictionaries.

    Raises:
        ResourceWarning: If both ``'<key>'`` and ``'<key>_'`` are already
            taken. This is checked before anything is changed, so ``df`` is
            left untouched in that case.
        ValueError, SyntaxError: If a string cell cannot be parsed by
            ``ast.literal_eval``.
    """

    def as_dictionary(cell):
        # Already parsed (e.g. the function is run on freshly built data).
        if isinstance(cell, dict):
            return cell
        # Missing value: behave as if the row had no keys at all.
        if cell is None or cell is pd.NA or (isinstance(cell, float) and cell != cell):
            return {}
        return ast.literal_eval(cell)

    dictionaries = [as_dictionary(cell) for cell in df[column_name]]

    # dict.fromkeys removes duplicates while keeping first-seen order; a plain
    # set would give a different column order from one interpreter run to the next.
    keys = list(dict.fromkeys(
        key for dictionary in dictionaries for key in dictionary.keys()
    ))

    # Decide on all target column names first so that a name clash is
    # reported before the frame has been modified.
    taken = set(df.columns)
    targets = []
    for key in keys:
        if f'{key}_' in taken:
            raise ResourceWarning(f'te veel features met dezelfde naam ({key})')
        target = f'{key}_' if key in taken else key
        taken.add(target)
        targets.append((key, target))

    # Columns are inserted one at a time because the frame is modified in
    # place; with very many keys pandas may report a fragmented frame, which
    # callers can resolve by taking a copy of the frame afterwards.
    for key, target in targets:
        df[target] = [dictionary.get(key, 0) for dictionary in dictionaries]
    df.drop(columns=[column_name], inplace=True)



In [4]:
def split_date_time(df: pd.DataFrame, column_name: str) -> None:
    """Replace a timestamp column by an epoch column and a cyclic hour encoding.

    Works in place. The column ``column_name`` is parsed with
    ``pd.to_datetime`` and then removed; three columns are added:

    * ``date``     - the timestamp as whole seconds since the Unix epoch,
    * ``sin_time`` - sine of the hour of day mapped onto a 24-hour circle,
    * ``cos_time`` - cosine of the hour of day mapped onto a 24-hour circle.

    The hour is taken from ``.dt.hour``, i.e. minutes and seconds are cut
    off rather than rounded.

    Args:
        df: Frame to modify in place.
        column_name: Name of the column holding the timestamps.
    """
    df[column_name] = pd.to_datetime(df[column_name])

    # whole seconds since the epoch
    df["date"] = df[column_name].map(lambda moment: int(moment.timestamp()))

    # temporary helper column, removed again below
    df["hour"] = df[column_name].dt.hour

    # sine and cosine together keep 23:00 and 00:00 close to each other
    hour_angle = 2 * np.pi * df['hour'] / 24
    df['sin_time'] = np.sin(hour_angle)
    df['cos_time'] = np.cos(hour_angle)

    df.drop(columns=[column_name, 'hour'], inplace=True)


In [5]:
def target_encoding(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, base_column: str) -> None:
    """Target-encode one column of the train and test frames, in place.

    A ``ce.TargetEncoder`` is fitted on the training column and ``y_train``
    only; the fitted encoder is then used to overwrite ``base_column`` in
    both ``X_train`` and ``X_test``.

    Args:
        X_train: Training features; ``base_column`` is overwritten.
        X_test: Test features; ``base_column`` is overwritten.
        y_train: Target values belonging to ``X_train``.
        base_column: Name of the column to encode.
    """
    target_encoder = ce.TargetEncoder(cols=[base_column])
    target_encoder.fit(X_train[base_column], y_train)

    # Train first, then test; both use what was learned from the training data.
    for features in (X_train, X_test):
        features[base_column] = target_encoder.transform(features[base_column])

In [6]:
def combine(df: pd.DataFrame, column_1: str, column_2: str, new_name: str) -> None:
    """Merge two columns into one new column and drop the two originals, in place.

    The new column takes the value of ``column_1`` and falls back to
    ``column_2`` wherever ``column_1`` is missing.

    Args:
        df: Frame to modify in place.
        column_1: Preferred source column.
        column_2: Fallback source column.
        new_name: Name of the column that receives the merged values.
    """
    # Per the original author: only one of the two columns is expected to be
    # filled, the other one holding NaN.
    preferred = df[column_1]
    fallback = df[column_2]
    df[new_name] = preferred.fillna(fallback)

    df.drop(columns=[column_1, column_2], inplace=True)

In [7]:
def split_on_time_indication(df: pd.DataFrame, text_column_name: str, new_column_name: str, indications: list[str], index_of_extra_indication: int = None) -> None:
    """Add a 0/1 column telling whether a text mentions one of the given phrases.

    Works in place. For every row, ``new_column_name`` becomes ``1`` when the
    text in ``text_column_name`` contains at least one of the phrases as a
    substring (case-insensitive), and ``0`` otherwise. Rows whose text is not
    a string (for example a missing value) get ``0``. Frames without rows are
    handled as well.

    Args:
        df: Frame to modify in place.
        text_column_name: Column holding the text to search.
        new_column_name: Name of the integer flag column to create.
        indications: Phrases to look for; the list itself is not modified.
        index_of_extra_indication: When given, the element at this position
            of the row's ``'centered_weekday'`` list is searched for as well.
            The column ``'centered_weekday'`` must then exist in ``df``.

    Raises:
        KeyError: If ``text_column_name`` is missing, or if
            ``index_of_extra_indication`` is given and ``'centered_weekday'``
            is missing.
    """
    # Lower-case the fixed phrases once instead of once per row.
    lowered_indications = [indication.lower() for indication in indications]

    def mentions(text, extra_indication=None) -> int:
        if not isinstance(text, str):
            return 0
        lowered_text = text.lower()
        if any(indication in lowered_text for indication in lowered_indications):
            return 1
        if extra_indication is not None and extra_indication.lower() in lowered_text:
            return 1
        return 0

    texts = df[text_column_name]
    if index_of_extra_indication is None:
        flags = [mentions(text) for text in texts]
    else:
        # The extra phrase differs per row, so walk both columns together.
        flags = [
            mentions(text, row_weekdays[index_of_extra_indication])
            for text, row_weekdays in zip(texts, df['centered_weekday'])
        ]

    df[new_column_name] = pd.Series(flags, index=df.index, dtype='int64')

def weekday_index(weekday: str) -> int:
    """Return the position of a Dutch weekday name within the week.

    ``'maandag'`` maps to 0 and ``'zondag'`` to 6. The lookup is exact, so
    the name has to be written in lower case.

    Args:
        weekday: Dutch weekday name.

    Returns:
        Integer between 0 and 6.

    Raises:
        ValueError: If ``weekday`` is not one of the seven known names.
    """
    names_in_order = ('maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag', 'zondag')
    position_by_name = {name: position for position, name in enumerate(names_in_order)}

    position = position_by_name.get(weekday)
    if position is None:
        raise ValueError(f"'{weekday}' is niet bekend")
    return position
    

def weekday_to_cyclic(df: pd.DataFrame, column_name: str) -> None:
    """Replace a weekday-name column by a sine/cosine pair, in place.

    Each name in ``column_name`` is turned into its index via
    ``weekday_index`` and placed on a 7-day circle. The columns
    ``sin_weekday`` and ``cos_weekday`` are added; ``column_name`` and the
    temporary ``weekday`` column are removed.

    Args:
        df: Frame to modify in place.
        column_name: Column holding the weekday names.
    """
    # temporary helper column, removed again below
    df['weekday'] = df[column_name].map(weekday_index)

    # two coordinates on a circle keep the last and first day of the week adjacent
    weekday_angle = 2 * np.pi * df['weekday'] / 7
    df['sin_weekday'] = np.sin(weekday_angle)
    df['cos_weekday'] = np.cos(weekday_angle)

    df.drop(columns=[column_name, 'weekday'], inplace=True)

def convert_indices_to_weekdays(list_of_indices: list[int]) -> list[str]:
    """Translate weekday indices into Dutch weekday names.

    Index 0 is ``'maandag'`` and index 6 is ``'zondag'``. The indices are
    used for normal list indexing, so negative values count from the end.

    Args:
        list_of_indices: Iterable of integer indices.

    Returns:
        A new list with one weekday name per index, in the same order.

    Raises:
        IndexError: If an index lies outside the seven weekdays.
    """
    weekday_names = ['maandag', 'dinsdag', 'woensdag', 'donderdag', 'vrijdag', 'zaterdag', 'zondag']
    return [weekday_names[position] for position in list_of_indices]

def centered_weekday_names(df: pd.DataFrame, column_name: str) -> None:
    """Add, per row, the week's day names starting at that row's own weekday.

    Works in place and adds three columns:

    * ``weekday_indices``  - index of the weekday named in ``column_name``,
    * ``centered_indices`` - the seven indices ``(index + 0..6) % 7``,
    * ``centered_weekday`` - the matching weekday names.

    Position ``i`` of ``centered_weekday`` is therefore the weekday ``i`` days
    after the row's own weekday; negative positions count backwards from the
    end of the list and so give the days before it.

    Args:
        df: Frame to modify in place.
        column_name: Column holding the Dutch weekday names.
    """
    day_offsets = np.arange(7)

    def week_starting_at(start_index):
        return ((start_index + day_offsets) % 7)

    df['weekday_indices'] = df[column_name].map(weekday_index)
    df['centered_indices'] = df['weekday_indices'].map(week_starting_at)
    df['centered_weekday'] = df['centered_indices'].map(convert_indices_to_weekdays)




def time_indications(df: pd.DataFrame, text_column_name: str) -> None:
    """Add 0/1 columns for time references found in a text column, in place.

    Uses ``split_on_time_indication`` to create the columns ``'jaar'``,
    ``'maand'`` and ``'week'`` (phrases about years, months and weeks) and
    the day columns ``'-3 dagen'``, ``'-2 dagen'``, ``'Gister'``,
    ``'Vandaag'``, ``'Morgen'``, ``'+2 dagen'`` and ``'+3 dagen'``. Each day
    column also reacts to the name of the weekday that lies that many days
    from the weekday in ``'publication_weekday'``.

    Afterwards ``'publication_weekday'`` is replaced by ``'sin_weekday'`` and
    ``'cos_weekday'`` (via ``weekday_to_cyclic``) and the helper columns
    created along the way are removed.

    Args:
        df: Frame to modify in place; must contain ``'publication_weekday'``.
        text_column_name: Column holding the text to search.
    """
    # NOTE(review): phrases are matched as substrings, so 'morgen' also fires
    # on 'vanmorgen' and 'overmorgen', and 'gister' on 'eergisteren' - confirm
    # that this overlap between the day columns is intended.
    split_on_time_indication(df, text_column_name, 'jaar', [
        'dit jaar', 'vorig jaar', 'komend jaar', 'komende jaren',
        'afgelopen jaar', 'afgelopen jaren', 'aankomende jaar', 'aankomende jaren',
    ])
    # The last two phrases used to be the week phrases ('aankomende week',
    # 'aankomende weken'), which made the month flag react to weeks.
    split_on_time_indication(df, text_column_name, 'maand', [
        'deze maand', 'deze maanden', 'vorige maand', 'vorige maanden',
        'afgelopen maanden', 'afgelopen maand', 'komende maanden', 'komende maand',
        'aankomende maand', 'aankomende maanden',
    ])
    # 'afgelopen week' and 'komende week' are two separate phrases; without
    # the comma between them Python glues them into one string that never matches.
    split_on_time_indication(df, text_column_name, 'week', [
        'deze week', 'deze weken', 'vorige week', 'vorige weken',
        'afgelopen weken', 'afgelopen week', 'komende week', 'komende weken',
        'aankomende week', 'aankomende weken',
    ])

    # Adds 'centered_weekday', which the calls below index with their day offset.
    centered_weekday_names(df, 'publication_weekday')
    split_on_time_indication(df, text_column_name, '-3 dagen', [], -3)
    split_on_time_indication(df, text_column_name, '-2 dagen', ['eergister', 'eergisteren'], -2)
    split_on_time_indication(df, text_column_name, 'Gister', ['gister', 'gisteren'], -1)
    split_on_time_indication(df, text_column_name, 'Vandaag', ['vandaag', 'vanavond', 'vanmiddag', 'vanochtend', 'vanmorgen'], 0)
    split_on_time_indication(df, text_column_name, 'Morgen', ['morgen'], 1)
    split_on_time_indication(df, text_column_name, '+2 dagen', ['overmorgen'], 2)
    split_on_time_indication(df, text_column_name, '+3 dagen', [], 3)

    weekday_to_cyclic(df, 'publication_weekday')

    # Remove the helper columns added by centered_weekday_names.
    df.drop(columns=['weekday_indices', 'centered_indices', 'centered_weekday'], inplace=True)



In [8]:

# Reads the Parool CSV again; the same file is already loaded into parool_df
# in the data-loading cell. NOTE(review): presumably a reset after
# experimenting with the frame - confirm whether this cell is still needed.
parool_df = pd.read_csv(r'uva-relevance-windows-parool.csv', sep=';')

In [9]:
def preprocess_dataframe_fully_seperated_columns(uncopied_df: pd.DataFrame, filename: str, target_column: str, with_brand: bool) -> pd.DataFrame:
    """Build the feature table from a raw article frame and write it to disk.

    The input frame is copied first and is not modified. On the copy:

    1. the publication timestamp becomes an epoch column plus a cyclic
       hour encoding (``split_date_time``),
    2. time-reference flags are derived from ``'full_body'`` and the
       publication weekday becomes a cyclic encoding (``time_indications``),
    3. the dictionary columns ``'user_needs'``, ``'lda_topics'``,
       ``'iptc_topics_level_0'`` and ``'iptc_topics_level_1'`` are expanded
       into one column per key (``split_dictionary_column``),
    4. ``'labels'`` and ``'subsections'`` are merged into
       ``'subsections_labels'`` (``combine``),
    5. the identifier and raw text columns are dropped.

    The result is written as CSV, without the index, to
    ``'<filename>_ongesplitst'`` in the working directory and is returned.

    No train/test split and no target encoding happen here, and the target
    column stays part of the returned frame.

    Args:
        uncopied_df: Raw article data; left untouched.
        filename: Prefix of the output file name.
        target_column: Currently unused; kept so existing calls keep working.
        with_brand: Currently unused; kept so existing calls keep working.

    Returns:
        The preprocessed copy.
    """
    X = uncopied_df.copy()

    split_date_time(X, 'publication_timestamp')
    time_indications(X, 'full_body')

    for dictionary_column in ('user_needs', 'lda_topics', 'iptc_topics_level_0', 'iptc_topics_level_1'):
        split_dictionary_column(X, dictionary_column)

    combine(X, 'labels', 'subsections', 'subsections_labels')
    X.drop(columns=['time_indications_mentioned', 'article_id', 'full_body', 'title', 'first_paragraph'], inplace=True)

    # The file name follows the `filename` argument, so that runs for
    # different data sets do not overwrite each other's output.
    X.to_csv(f'{filename}_ongesplitst', index=False)

    return X
    
    
# new__parool_df = preprocess_dataframe_fully_seperated_columns(parool_df, 'parool', 'relevance_window_in_hours', False)
# preprocess_dataframe_fully_seperated_columns(brabant_df, 'brabant', 'relevance_window_in_hours', False)
# preprocess_dataframe_fully_seperated_columns(volkskrant_df, 'volkskrant', 'relevance_window_in_hours', False)
# preprocess_dataframe_fully_seperated_columns(ad_df, 'ad', 'relevance_window_in_hours', False)
# preprocess_dataframe_fully_seperated_columns(nu_df, 'nu', 'relevance_window_in_hours', False)
# preprocess_dataframe_fully_seperated_columns(destentor_df, 'destentor', 'relevance_window_in_hours', False)
# preprocess_dataframe_fully_seperated_columns(trouw_df, 'trouw', 'relevance_window_in_hours', False)


In [10]:

# Stack the frames of all sources underneath each other; ignore_index=True
# gives the combined frame a fresh, continuous row index.
compleet = pd.concat([ad_df, trouw_df, volkskrant_df, destentor_df, parool_df, nu_df, brabant_df], axis=0, ignore_index=True)

# Preprocess the combined frame. The call also writes the result to a CSV file;
# the returned frame is the last expression of the cell and is therefore displayed.
preprocess_dataframe_fully_seperated_columns(compleet, 'alle_kranten', 'relevance_window_in_hours', True)

  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0))
  df[key] = df[column_name].apply(lambda x: x.get(key, 0

Unnamed: 0,brand,num_words,num_paragraphs,author_ids,main_section,relevance_window_in_hours,date,sin_time,cos_time,jaar,...,met pensioen,studenten,gemeenschappen,sociale wetenschappen,werkgelegenheid,religieuze leider,arbeidswetgeving,leerplan,religieus festival en vakantie,subsections_labels
0,ad,43,2,['1ad8d704-fc8a-34e8-84cc-c988f31a567c'],regio,171.0,1714524150,0.000000,1.000000,0,...,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,"den-haag,112-nieuws-den-haag"
1,ad,60,2,['1ad8d704-fc8a-34e8-84cc-c988f31a567c'],regio,19.0,1714524213,0.000000,1.000000,0,...,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,"arnhem,112-nieuws-arnhem"
2,ad,173,4,['b163fc4b-3173-36bb-bd76-33de20eb10b6'],show,18.0,1714524329,0.000000,1.000000,0,...,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,
3,ad,58,2,['1ad8d704-fc8a-34e8-84cc-c988f31a567c'],regio,55.0,1714524906,0.000000,1.000000,0,...,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,"barendrecht,112-nieuws-barendrecht"
4,ad,504,14,['a44196e8-5c16-3698-99ff-b88e9fa68034'],regio,20.0,1714525206,0.258819,0.965926,0,...,0.0,0.0,0.0,0.4122,0.4046,0.0,0.0,0.0,0.0,opinie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213461,brabantsdagblad,326,7,['f298d4bf-3c65-3d9f-bda9-76dbcbc50c67'],sport,11.0,1730410727,-0.707107,0.707107,1,...,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,met samenvatting
213462,brabantsdagblad,125,6,['e3467224-255e-3245-a34a-e6421d897443'],regio,22.0,1730411733,-0.707107,0.707107,0,...,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,brabant
213463,brabantsdagblad,129,6,['e3467224-255e-3245-a34a-e6421d897443'],regio,10.0,1730412716,-0.500000,0.866025,0,...,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,update
213464,brabantsdagblad,234,6,['9dbbb436-12ef-3466-8de2-54345e798ba8'],nieuws,17.0,1730414422,-0.500000,0.866025,0,...,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,binnenland
