In [1]:
import pandas as pd
pd.options.display.max_columns = None
from collections import Counter

In [2]:
def unique_keyword_search(df_col: pd.Series, num_selected: int) -> list:
    '''
    Performs unique keyword search on a dataframe's column for its most common keywords

    :param pd.Series df_col: column of a pd.DataFrame (e.g. df['col'])
    :param int num_selected: number of keywords
    :return: list of keywords in decreasing occurrence
    :rtype: list
    '''
    keywords = []
    for entry in df_col:
        words = entry.split(',')
        [keywords.append(word) for word in words]
    sorted_keywords = Counter(keywords).most_common()
    output_keywords = []
    for i in range(num_selected):
        output_keywords.append(sorted_keywords[i][0])
    return output_keywords

In [3]:
org_df=pd.read_csv("org_final_joined_new.csv")

In [4]:
org_df.info(max_cols=1000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018183 entries, 0 to 1018182
Data columns (total 135 columns):
 #   Column                                          Non-Null Count    Dtype  
---  ------                                          --------------    -----  
 0   Unnamed: 0                                      1018183 non-null  int64  
 1   uuid                                            1018183 non-null  object 
 2   partner_type                                    626902 non-null   object 
 3   country_code                                    1012303 non-null  object 
 4   region                                          1012303 non-null  object 
 5   city                                            1012303 non-null  object 
 6   status                                          1018183 non-null  object 
 7   category_list                                   1018183 non-null  object 
 8   category_groups_list                            1018183 non-null  object 
 9   num_funding_

In [5]:
new_df= org_df[["Unnamed: 0", "uuid"]].copy()

In [6]:
#D_country code
country_df= org_df[["country_code"]].copy()
country_df=pd.get_dummies(country_df, prefix=["country"], columns=["country_code"], drop_first=False)
new_df=new_df.join(country_df)

In [7]:
#E_Region
new_df["region"]= org_df["region"]

In [8]:
#F_CIty
city_df= org_df[["city"]].copy()
city_df=pd.get_dummies(city_df, prefix=["city"], columns=["city"], drop_first=False)
new_df=new_df.join(city_df)

In [9]:
#G_Status
status_df= org_df[["status"]].copy()
status_df=pd.get_dummies(status_df, prefix=["status"], columns=["status"], drop_first=False)


keep_col = ["status_operating", "status_closed"] # change the cols to keep
drop_row = [0, 1] # change the rows to drop
status_df = status_df[keep_col].drop(drop_row)
new_df=new_df.join(status_df)

In [10]:
#H:category_list & I: category_groups_list (p.s. there is no null cell)
category_df=org_df[["category_list","category_groups_list"]].copy()

#Concantante 2 columns
category_df["category_concat"]= category_df["category_list"]+ "," + category_df["category_groups_list"]

#keyword search
kw= unique_keyword_search(category_df["category_concat"], 100)
#kw[0]
name_arr= [[] for _ in range(100)] #When access name_arr[column][row]
counter=0 

for i in category_df["category_concat"]:    
    for j in range(len(kw)):
        if kw[j] in i: 
            #print(counter)
            name_arr[j].append(1)
        else: 
            name_arr[j].append(0)
    counter=counter+1

for i in range(100): 
    category_df["cat_"+kw[i]]=name_arr[i]
category_df= category_df.drop(columns=["category_list", "category_groups_list", "category_concat"])

new_df=new_df.join(category_df)

In [11]:
#O: Employee_count
def em_count_func(df): 
    if df["employee_count"]=="1-10" :
        return 1
    if df["employee_count"]=="11-50" :
        return 2
    if df["employee_count"]=="51-100" :
        return 3
    if df["employee_count"]=="101-250" :
        return 4
    if df["employee_count"]=="251-500" :
        return 5
    if df["employee_count"]=="501-1000" :
        return 6
    if df["employee_count"]=="1001-5000" :
        return 7
    if df["employee_count"]=="5001-10000" :
        return 8
    if df["employee_count"]=="10000+" :
        return 9
    if df["employee_count"]=="unknown" :
        return 0

Employee_count_df= org_df[["employee_count"]].copy()
Employee_count_df["employee_count_scaled"]= Employee_count_df.apply(em_count_func, axis=1)


keep_col = ["employee_count_scaled"] # change the cols to keep
drop_row = [0, 1] # change the rows to drop
Employee_count_df = Employee_count_df[keep_col].drop(drop_row)
new_df=new_df.join(Employee_count_df)

In [12]:
#P_primary_role
pri_role_df= org_df[["primary_role"]].copy()
pri_role_df=pd.get_dummies(pri_role_df, prefix=["pri_role"], columns=["primary_role"], drop_first=False)
new_df=new_df.join(pri_role_df)

In [13]:
#Q_num_exists 
num_exits_df= org_df[["num_exits"]].copy()
num_exits_df["num_exits"]=num_exits_df["num_exits"].fillna(0)

new_df=new_df.join(num_exits_df)

In [14]:
#R_lead_investor_name

dummy_df= org_df[["lead_investor_name"]].copy()
new_df=new_df.join(dummy_df)

In [15]:
#S_lead_investor_type

lead_invest_type_df=org_df[["lead_investor_type"]].copy()
lead_invest_type_df=pd.get_dummies(lead_invest_type_df, prefix=["investor_type"], columns=["lead_investor_type"], drop_first=False)
#new_df=new_df.join(lead_invest_type_df)

In [16]:
#T_lead_investor_roles
lead_investor_role_df= org_df[["lead_investor_roles"]].copy()
investor=[]
company=[]
school=[]


inv_type_name_arr= ["investor", "company", "school"]
inv_type_arr= [investor, company, school]
lead_investor_role_df["lead_investor_roles"]= lead_investor_role_df["lead_investor_roles"].fillna(0)

for i in lead_investor_role_df["lead_investor_roles"]:
    if i==0: 
        inv_type_arr[0].append(0)
        inv_type_arr[1].append(0)
        inv_type_arr[2].append(0)
        continue
    for j in range(len(inv_type_name_arr)):
        if inv_type_name_arr[j] in i: 
            inv_type_arr[j].append(1)
        else: 
            inv_type_arr[j].append(0)

lead_investor_role_df["inv_investor"]= investor
lead_investor_role_df["inv_company"]= company
lead_investor_role_df["inv_school"]= school

keep_col = ["inv_investor", "inv_company", "inv_school"] # change the cols to keep
drop_row = [0, 1] # change the rows to drop
lead_investor_role_df = lead_investor_role_df[keep_col].drop(drop_row)
new_df=new_df.join(lead_investor_role_df)


In [17]:
#V_lead_investor_country_code

lead_country_df= org_df[["lead_investor_country_code"]].copy()
lead_country_df=pd.get_dummies(lead_country_df, prefix=["investor_country"], columns=["lead_investor_country_code"], drop_first=False)
new_df=new_df.join(lead_country_df)

In [18]:
#W_Lead_investor_region
dummy_df= org_df[["lead_investor_region"]].copy()
new_df=new_df.join(dummy_df)

In [19]:
#X_lead_investor_city
lead_city_df= org_df[["lead_investor_city"]].copy()
lead_city_df=pd.get_dummies(lead_city_df, prefix=["investor_city"], columns=["lead_investor_city"], drop_first=False)
new_df=new_df.join(lead_city_df)

In [20]:
#Y_lead_investor_type

lead_investor_type_df= org_df[["lead_investor_investor_types"]].copy()
lead_investor_type_df=pd.get_dummies(lead_investor_type_df, prefix=["investor_type"], columns=["lead_investor_investor_types"], drop_first=False)
lead_investor_type_df
new_df=new_df.join(lead_investor_type_df)

In [21]:
#Z_lead_investor_investment_count
lead_investor_count_df= org_df[["lead_investor_investment_count"]].copy()
lead_investor_count_df["lead_investor_investment_count"]=lead_investor_count_df["lead_investor_investment_count"].fillna(0)

new_df=new_df.join(lead_investor_count_df)

In [22]:
#AA_lead_investor_total_funding_usd
lead_investor_fund_df= org_df[["lead_investor_total_funding_usd"]].copy()
lead_investor_fund_df["lead_investor_total_funding_usd"]=lead_investor_fund_df["lead_investor_total_funding_usd"].fillna(0)

new_df=new_df.join(lead_investor_fund_df)

In [23]:
#AD_fund_rd_type
fund_rd_type_df= org_df[["fund_rd_type"]].copy()
fund_rd_type_df["fund_rd_type"]=fund_rd_type_df["fund_rd_type"].fillna(0)

new_df=new_df.join(fund_rd_type_df)

In [24]:
#AE_country_code

fund_country_df= org_df[["fund_rd_country_code"]].copy()
fund_country_df=pd.get_dummies(fund_country_df, prefix=["fund_country"], columns=["fund_rd_country_code"], drop_first=False)
new_df=new_df.join(fund_country_df)

In [25]:
#AF& AG region and statecode
dummy_df= org_df[["fund_rd_state_code", "fund_rd_region"]].copy()
new_df=new_df.join(dummy_df)

In [26]:
#AH_city

fund_city_df= org_df[["fund_rd_city"]].copy()
fund_city_df=pd.get_dummies(fund_city_df, prefix=["fund_country"], columns=["fund_rd_city"], drop_first=False)
new_df=new_df.join(fund_city_df)


In [27]:
#AI_fund_rd_raised_amount_usd (Discovered that if there is no entry for "type", there will be nan)

fund_raised_df= org_df[["fund_rd_raised_amount_usd"]].copy()
fund_raised_df=fund_raised_df.fillna(fund_raised_df.mean())

new_df=new_df.join(fund_raised_df)


In [28]:
# AN_ fund_rd_post_money_valuation_usd

fund_posted_df= org_df[["fund_rd_post_money_valuation_usd"]].copy()
fund_posted_df=fund_posted_df.fillna(fund_posted_df.mean())
new_df=new_df.join(fund_posted_df)

In [29]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018183 entries, 0 to 1018182
Columns: 5865 entries, Unnamed: 0 to fund_rd_post_money_valuation_usd
dtypes: float64(11), int64(101), object(7), uint8(5746)
memory usage: 6.4+ GB
