In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Remove the display limits so DataFrames render every column and every row
# (None = no limit). Full option names are spelled out: pandas resolves
# abbreviated names by pattern matching, which can break if a similarly named
# option is added in a later release.
pd.set_option('display.max_columns', None, 'display.max_rows', None)

# Default figure size as (width, height); presumably passed as matplotlib's
# figsize by plotting code that is not shown here.
default_figsize = (15, 5)

In [None]:
# Read ipos_cleaned.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="ipos_cleaned.csv")

In [None]:
# df.head()

In [None]:
# Columns whose missing entries are replaced by the column's own mean.
col_to_fillna = ['share_price_usd', 'valuation_price_usd', 'money_raised_usd']

for col in col_to_fillna:
    df[col] = df[col].fillna(df[col].mean())

# Discard every row that still contains a missing value in any column,
# then renumber the remaining rows from 0.
df = df.dropna(how="any").reset_index(drop=True)
df.head()

In [None]:
# Draft by Stardust: visualize how many values need to be labelled individually;
# the rest can be grouped into, say, continents.
# country_series = df['country_code'].value_counts()
# country_series.head(35)
# stock_exchange_series = df['stock_exchange_symbol'].value_counts()
# stock_exchange_series.head(40)

In [None]:
# Stardust ver one-hot for country_code, stock_exchange_symbol
def encoding_ipo(df: pd.DataFrame, col_name: str, num_selected: int) -> pd.DataFrame:
    '''
    Adds 0/1 indicator columns for the most frequent values of one column.

    The ``num_selected`` most frequent values of ``df[col_name]`` (ranked with
    ``value_counts``, so missing values are not counted) each get a column
    named after the value itself, holding 1 where the row has that value and
    0 elsewhere. Rows whose value is not among the selected ones are 0 in
    every indicator column.

    The dataframe is modified in place and the same object is returned. An
    indicator whose name equals an existing column overwrites that column;
    this includes the source column itself if one of its selected values
    equals ``col_name``.

    :param pd.DataFrame df: dataframe to be processed (mutated in place)
    :param str col_name: name of the column to encode
    :param int num_selected: how many of the most frequent values to encode
    :return: the same dataframe, with the indicator columns added
    :rtype: pd.DataFrame
    :raises KeyError: if ``col_name`` is not a column of ``df``
    '''
    # Snapshot the source values before adding any column. The loop assigns
    # into df, and if a selected value happens to equal col_name the source
    # column is replaced; comparing against the live column would then make
    # every later indicator all zeros.
    source = df[col_name].copy()
    top_values = source.value_counts().head(num_selected).index.tolist()

    for value in top_values:
        # 1 where the row holds this value, 0 otherwise.
        df[value] = np.where(source == value, 1, 0)

    return df

In [None]:
# How many of the most frequent values get their own indicator column.
num_country = 35         # 35 most frequent countries
num_stock_exchange = 40  # 40 most frequent stock exchanges

for column, top_n in (
    ('country_code', num_country),
    ('stock_exchange_symbol', num_stock_exchange),
):
    df = encoding_ipo(df, column, top_n)

In [None]:
# Remove the stock_symbol column from the frame.
df = df.drop(columns=['stock_symbol'])

In [None]:
# Preview the first five rows of the processed frame.
df.head(n=5)