In [1]:
# import library
import pandas as pd
import numpy as np

In [2]:
# Columns needed downstream; edit this list to change what is kept.
keep_col = ['uuid','event_uuid','participant_uuid','participant_name','participant_type','appearance_type']
# Read only those columns so the rest of the CSV is never parsed into memory.
# `usecols` ignores the order of the list (columns come back in file order), so the frame is
# re-selected with `keep_col` to keep the column order fixed by the list above.
df = pd.read_csv("event_appearances.csv", usecols=keep_col)[keep_col]

In [3]:
# Optional check (left disabled): view the distinct values of participant_type
# participant_type_series = df['participant_type'].value_counts()
# print(participant_type_series)

In [4]:
# appearance_type_series = df['appearance_type'].value_counts()
# print(appearance_type_series)

In [5]:
# one-hot encoder for participant_type, appearance_type
def encoding_event_app(df: pd.DataFrame, col_name: str, num_selected: int) -> pd.DataFrame:
    '''
    Performs one-hot encoding on a dataframe's column for its values with most occurrences

    One indicator column is added per selected value. The column is named after the value
    itself and holds 1 where ``df[col_name]`` equals that value and 0 everywhere else.
    The encoded column ``col_name`` is kept; dropping it is left to the caller.

    The caller's dataframe is not modified: the indicator columns are added to a copy,
    which also avoids pandas' SettingWithCopyWarning when ``df`` is itself a selection
    of another frame.

    NOTE: if a selected value is equal to the name of an existing column, that column
    is replaced by the indicator column.

    :param pd.DataFrame df: dataframe to be processed (left untouched)
    :param str col_name: name of the encoded column
    :param int num_selected: number of values with most occurrences
    :return: copy of ``df`` with the indicator columns added
    :rtype: pd.DataFrame
    :raises KeyError: if ``col_name`` is not a column of ``df``
    '''
    encoded = df.copy()
    # Read the source column once so every comparison uses the original values,
    # even if an indicator column ends up replacing a column of the same name.
    source = encoded[col_name]
    # value_counts() sorts by frequency (descending), so head() keeps the most common values.
    top_values = source.value_counts().head(num_selected).index.tolist()

    for value in top_values:
        # 1 where the row holds this value, 0 otherwise
        encoded[value] = np.where(source == value, 1, 0)

    return encoded

In [6]:
num_participant_type = 2 # there are only 2: person, organization
# Add the indicator columns, then drop the source column they were derived from.
# `drop(columns=...)` returns a new frame, so nothing is mutated with inplace=True.
# NOTE: this cell rebinds `df`; re-running it without re-running the cells above raises a
# KeyError because 'participant_type' has already been dropped.
df = encoding_event_app(df, 'participant_type', num_participant_type).drop(columns='participant_type')

In [7]:
num_appearance_type = 5 # there are only 5: speaker, sponsor, exhibitor, organizer, contestant
# Add the indicator columns, then drop the source column they were derived from.
# `drop(columns=...)` returns a new frame, so nothing is mutated with inplace=True.
# NOTE: this cell rebinds `df`; re-running it without re-running the cells above raises a
# KeyError because 'appearance_type' has already been dropped.
df = encoding_event_app(df, 'appearance_type', num_appearance_type).drop(columns='appearance_type')

In [8]:
# df.info()

In [9]:
# df

In [10]:
# Write the encoded frame to disk; index=False keeps the row index out of the CSV.
df.to_csv(path_or_buf="event_appearances_cleaned.csv", index=False)