In [1]:
import pandas as pd
import csv

## Get all currently used industry labels as a list

In [2]:
# Load the raw startup table from disk, then show its schema and first rows.
raw_csv_path = "../data/raw/raw_data.csv"
df = pd.read_csv(raw_csv_path)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4204 entries, 0 to 4203
Data columns (total 4 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   startup_ID                   4204 non-null   int64 
 1   description_startupdetector  583 non-null    object
 2   startup_description          4072 non-null   object
 3   industry                     4204 non-null   object
dtypes: int64(1), object(3)
memory usage: 131.5+ KB


Unnamed: 0,startup_ID,description_startupdetector,startup_description,industry
0,4330,The company develops software for the analysis...,,manufacturing_tech
1,4707,The company is developing an information and p...,,education_tech
2,5255,The company develops bicycle accessories that ...,,Sport
3,5256,The company is developing sensor technology th...,,Industry
4,5257,The company is developing an as yet unknown mo...,,tourism


In [3]:
# Collect every distinct industry label used in the raw data.
# The 'industry' column holds comma-separated labels: turn each cell into a
# list, then give every label its own row (the other columns are repeated).
df['split_column'] = df['industry'].str.split(pat=',')
df_exploded = df.explode(column='split_column')

# Distinct labels across all rows, and how many of them there are.
unique_values = df_exploded['split_column'].unique()
print(len(unique_values))

192


In [4]:
# Order the labels alphabetically, ignoring case, and display the result.
unique_values = list(unique_values)
unique_values.sort(key=str.lower)
unique_values

['additive_manufacturing',
 'Advertising',
 'agrar_tech',
 'AgrarTech',
 'anti_counterfeit',
 'Architectural technology',
 'architectural_technology',
 'Artificial intelligence',
 'artificial_intelligence',
 'Augmented Reality',
 'augmented_reality',
 'automotive',
 'autonomous_driving',
 'aviation',
 'battery',
 'Battery',
 'bio_informatics',
 'bio_tech',
 'BioTech',
 'blockchain',
 'Blockchain',
 'Blockchain / Crypto',
 'Building automation',
 'building_automation',
 'carbon_capture',
 'charging_station',
 'chem_tech',
 'ChemTech',
 'clean_tech',
 'Cloud computing',
 'cloud_computing',
 'Collaborative Tech',
 'collaborative_tech',
 'communication_technology',
 'computer_vision',
 'condition_monitoring',
 'construction',
 'Construction Tech',
 'construction_tech',
 'consultancy',
 'Consultancy',
 'consumer_goods',
 'cosmetics_luxury',
 'Crafts / DIY',
 'crafts_diy',
 'Creative / Design',
 'creative_design',
 'Cyber Security',
 'cyber_security',
 'Data analytics',
 'Data management',
 

In [6]:
# Persist the label list as a one-column CSV with an 'INDUSTRY' header row.
fields = ['INDUSTRY']
# csv.writer expects one sequence per row, so wrap every label in a list.
rows = [[item] for item in unique_values]

# newline='' lets the csv module control line endings. The explicit encoding
# makes the file independent of the platform's locale default, so labels with
# non-ASCII characters are written the same way on every machine.
with open('../data/industry_tags_unique.csv', 'w', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    write.writerow(fields)
    write.writerows(rows)

In [7]:
# Build a one-column frame holding the industry labels.
# Note: this rebinds `df`, the name the raw startup table was loaded into.
df = pd.DataFrame({'unique_values': unique_values})

# Normalise a label into a matching key: lowercase, alphanumeric characters only.
def transform_industry_for_matching(industry):
    """Return a normalised key for comparing industry labels.

    The label is trimmed, lowercased, and every character that is not
    alphanumeric (spaces, underscores, slashes, ampersands, ...) is removed,
    so that e.g. 'agrar_tech' and 'AgrarTech' both become 'agrartech'.

    Parameters
    ----------
    industry : str
        The label text to normalise.

    Returns
    -------
    str
        The normalised key; empty if the label has no alphanumeric characters.
    """
    lowered = industry.strip().lower()
    return ''.join(filter(str.isalnum, lowered))
# Add the normalised matching key for every label and preview the frame.
df['industry_match'] = df['unique_values'].map(transform_industry_for_matching)

df.head()

Unnamed: 0,unique_values,industry_match
0,additive_manufacturing,additivemanufacturing
1,Advertising,advertising
2,agrar_tech,agrartech
3,AgrarTech,agrartech
4,anti_counterfeit,anticounterfeit


In [8]:
# Load the nuntium tech labels and give them the same normalised matching key
# (derived from the 'Label' column) that the industry labels carry.
tech_labels_path = '../data/gruendermotor-tech-labels.csv'
n_df = pd.read_csv(tech_labels_path)
n_df['industry_match'] = n_df['Label'].map(transform_industry_for_matching)
n_df.head()

Unnamed: 0,Label,Type,molecule,industry_match
0,Unknown,MOLECULE,,unknown
1,Med & Health,MOLECULE,,medhealth
2,MedTech,ATOM,Med & Health,medtech
3,HealthTech,ATOM,Med & Health,healthtech
4,PharmaTech,ATOM,Med & Health,pharmatech


In [10]:
# Look up the 'molecule' of every industry label via the normalised key.
# The left join keeps all industry labels; a label with no match, or whose
# match has an empty molecule, ends up with a missing value.
molecule_lookup = n_df[['molecule', 'industry_match']]
m_df = df.merge(molecule_lookup, how='left', on='industry_match')

# Export the merge result (the row index is written as the first column).
m_df.to_csv('../data/industry_label_merge.csv')
m_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   unique_values   192 non-null    object
 1   industry_match  192 non-null    object
 2   molecule        153 non-null    object
dtypes: object(3)
memory usage: 4.6+ KB


In [11]:
# Load the filled-in version of the label merge table and show its schema.
# NOTE(review): no cell shown here writes this file — presumably it was
# completed outside the notebook; confirm its provenance.
filled_merge_path = '../data/industry_label_merge_filled.csv'
industry_match = pd.read_csv(filled_merge_path)
industry_match.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   unique_values   192 non-null    object
 1   industry_match  192 non-null    object
 2   molecule        192 non-null    object
dtypes: object(3)
memory usage: 4.6+ KB


In [12]:
# Attach the new industry tag ('molecule') to every exploded startup/label row
# by matching the single split-out label against the mapping's 'unique_values'.
label_to_molecule = industry_match[['unique_values', 'molecule']]
new = df_exploded.merge(
    label_to_molecule,
    how='left',
    left_on='split_column',
    right_on='unique_values',
)
new.tail()

Unnamed: 0,startup_ID,description_startupdetector,startup_description,industry,split_column,unique_values,molecule
6898,5486,,ZURÜCK gives textiles from the hotel industry ...,"recycling,creative_design,textile_tech",recycling,recycling,Sustainability & GreenTech
6899,5486,,ZURÜCK gives textiles from the hotel industry ...,"recycling,creative_design,textile_tech",creative_design,creative_design,Retail & Living
6900,5486,,ZURÜCK gives textiles from the hotel industry ...,"recycling,creative_design,textile_tech",textile_tech,textile_tech,Materials & Deeptech
6901,5184,,"µWind designs, produces and sells small wind t...","smart_home,energy_tech",smart_home,smart_home,Retail & Living
6902,5184,,"µWind designs, produces and sells small wind t...","smart_home,energy_tech",energy_tech,energy_tech,Energy & Climate


In [13]:
# Show the frame before and after reducing it to one row per startup.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after each report.
new.info()
# Keep only the first row of each startup_ID, i.e. the first of its labels.
new = new.drop_duplicates(subset=['startup_ID'], keep='first')
new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6903 entries, 0 to 6902
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   startup_ID                   6903 non-null   int64 
 1   description_startupdetector  953 non-null    object
 2   startup_description          6771 non-null   object
 3   industry                     6903 non-null   object
 4   split_column                 6903 non-null   object
 5   unique_values                6903 non-null   object
 6   molecule                     6903 non-null   object
dtypes: int64(1), object(6)
memory usage: 377.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 4204 entries, 0 to 6901
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   startup_ID                   4204 non-null   int64 
 1   description_startupdetector  583 non-null    ob

In [14]:
# Preview the last five rows of the current `new` frame.
new.tail(n=5)

Unnamed: 0,startup_ID,description_startupdetector,startup_description,industry,split_column,unique_values,molecule
6894,2967,,ZU master students Jan Schmiedgen and Christia...,"collaborative_tech,sustainability",collaborative_tech,collaborative_tech,People & Learning
6896,2947,,ZU student Jeanette Orminski and ZU alumnus Ti...,hr_tech,hr_tech,hr_tech,People & Learning
6897,852,,Zündholz is a brand strategy and design agency...,user_interface,user_interface,user_interface,Data & Computing
6898,5486,,ZURÜCK gives textiles from the hotel industry ...,"recycling,creative_design,textile_tech",recycling,recycling,Sustainability & GreenTech
6901,5184,,"µWind designs, produces and sells small wind t...","smart_home,energy_tech",smart_home,smart_home,Retail & Living


In [16]:
# Drop the helper columns, expose 'molecule' under the name 'industry',
# and save the result as raw_data.csv in the small_labels folder.
helper_columns = ['industry', 'split_column', 'unique_values']
raw_data = new.drop(columns=helper_columns).rename(columns={"molecule": "industry"})
# NOTE(review): to_csv writes the row index as an extra leading column by
# default — confirm that readers of this file expect that column.
raw_data.to_csv('../data/small_labels/raw_data.csv')