In [1]:
import pandas as pd

# Load the CSV file
file_path = 'data/student_data_unprocessed.csv'
df = pd.read_csv(file_path)

# Display the first few rows to understand the structure of the data
df.head()


Unnamed: 0,Name,Email,EMBA,Resume,Evaluating Potential Markets/Choosing Target Market,Developing Pricing Model,Customer Decision Journey,Pilot Project Design and Implementation,Investment Pitch Deck,Hiring Plan,...,Deep Tech track ranking,Digital Tech track ranking,Life Sciences track ranking,Digital Health track ranking,tracks preference comment,Deep Tech Companies Preferences,Digital Tech Companies Preferences,Life Sciences Companies Preferences,Digital Health Companies Preferences,Contacted Startup?
0,"Chang, Wei Lin",wc2623@stern.nyu.edu,No,https://nyu.qualtrics.com/WRQualtricsSurveyEng...,5,4,3,3,5,3,...,2,3,1,4,,"ATOMICS, Free Form Fibers, RISE Robotics, Schn...","dataspan.ai, Hoox, Simulacra SDS, Trust Science","Cellinfinity Bio, IDP Pharma, Koi Biotherapeut...","Atmo Biosciences, GrayMatters Health, nSight S...",
1,"McDougall, Dawn",dem9907@stern.nyu.edu,No,https://nyu.qualtrics.com/WRQualtricsSurveyEng...,3,2,1,5,5,6,...,1,2,4,3,,"ATOMICS, Avol, Enigma Aerospace, RISE Robotics...","Banyan Infrastructure, CLIKA, CTGT, Lore Machine","Apricity Health, DoriNano, Olfera, SEED Therap...","GrayMatters Health, LUCID, Sama Therapeutics, ...",
2,"Akers, Travis",tda2296@stern.nyu.edu,No,https://nyu.qualtrics.com/WRQualtricsSurveyEng...,6,4,4,5,5,3,...,1,3,4,2,I have class conflicts with the 10/18 and 2/7 ...,"Avol, Cyanotype Bio, Enigma Aerospace, REEV, R...","CTGT, dataspan.ai, Elm AI, Lore Machine, PLATM...","BreakBio Corp, CartaBio, Ceramedix, DoriNano, ...","Banquet Health, Fort Health, LUCID, nSight Sur...",
3,"Gradelski, Mia",mig301@stern.nyu.edu,No,https://nyu.qualtrics.com/WRQualtricsSurveyEng...,5,1,4,4,4,2,...,3,1,4,2,,"Navaflex, REEV, Robochef, Werewool","CTGT, DataHive, Hoox, PLATMA, syd.life, Trust ...","Rasayana Therapeutics, Rejuvenation Technologi...","AdviNOW Medical, Banquet Health, Fort Health, ...",
4,"Lahanis, Niko",ngl6694@stern.nyu.edu,No,https://nyu.qualtrics.com/WRQualtricsSurveyEng...,6,5,1,5,1,1,...,2,1,4,3,Digital Tech and Deep Tech are my 1a and 1b pr...,"ATOMICS, Relyion Energy, Verne, Wright Electric","CLIKA, CTGT, DataHive, PLATMA, syd.life, Trust...","Apricity Health, BlueWhale Bio, BreakBio Corp,...","Banquet Health, nSight Surgical, SOAP Health, ...",


In [5]:
# Step 1: Create the required columns
output_df = pd.DataFrame()

# List of startups in current cohort - MUST match the order they appear in startups_data sheet
startup_names = [
    'ATOMICS', 'Free Form Fibers', 'RISE Robotics', 'Avol', 'Cyanotype Bio', 'Enigma Aerospace', 
    'REEV', 'Robochef', 'Werewool', 'Navaflex', 'Verne', 'Wright Electric', 'dataspan.ai', 'Hoox', 
    'Simulacra SDS', 'Trust Science', 'Banyan Infrastructure', 'CLIKA', 'CTGT', 'Lore Machine', 
    'Elm AI', 'PLATMA', 'DataHive', 'syd.life', 'Apricity Health', 'DoriNano', 'Olfera', 
    'SEED Therapeutics', 'BreakBio Corp', 'CartaBio', 'Ceramedix', 'Rasayana Therapeutics', 
    'Rejuvenation Technologies', 'Cellinfinity Bio', 'IDP Pharma', 'Koi Biotherapeutics', 
    'AdviNOW Medical', 'Banquet Health', 'Fort Health', 'LUCID', 'nSight Surgical', 'SOAP Health'
]

# Extract student name
output_df['student name'] = df['Name']

# Track rankings columns (Deep Tech, Digital Tech, Life Sciences, Digital Health)
# These are scaled to a range of 0.25 to 1 based on the rank value
def scale_ranking(rank):
    return 1.25 - (0.25 * rank)

output_df['Deep tech rank'] = df['Deep Tech track ranking'].apply(scale_ranking)
output_df['Digital tech rank'] = df['Digital Tech track ranking'].apply(scale_ranking)
output_df['Life sciences rank'] = df['Life Sciences track ranking'].apply(scale_ranking)
output_df['Digital health rank'] = df['Digital Health track ranking'].apply(scale_ranking)

# Step 2: Normalize skill values by dividing by 7 (assuming certain columns correspond to skills)
# Columns containing skill values (manual inspection required based on column names)
skill_columns = [
    'Evaluating Potential Markets/Choosing Target Market',
    'Developing Pricing Model',
    'Customer Decision Journey',
    'Pilot Project Design and Implementation',
    'Investment Pitch Deck',
    'Hiring Plan',
    'Marketing Plan & Execution',
    'Business Development & Sales',
    'Scaling Operations'
]
for col in skill_columns:
    output_df[col] = df[col] / 7

# Step 3: Generate startup preference columns
# Assuming preference columns are under the following structure:
preference_columns = [
    'Deep Tech Companies Preferences',
    'Digital Tech Companies Preferences',
    'Life Sciences Companies Preferences',
    'Digital Health Companies Preferences'
]

# Create columns for each startup
for startup in startup_names:
    output_df[startup] = 0.1  # Default value

# Function to assign weights based on preference rank
def assign_weights(pref_list, no_of_companies):
    weights = {}
    for idx, company in enumerate(pref_list):
        weight = (no_of_companies - idx) * (1 / no_of_companies)
        weights[company.strip()] = weight
    return weights

# Fill the startup preferences with scaled weights based on the student's preference
for i, row in df.iterrows():
    for col in preference_columns:
        if row[col]:
            company_list = row[col].split(',')
            company_weights = assign_weights(company_list, len(company_list))
            for company, weight in company_weights.items():
                if company in output_df.columns:
                    output_df.at[i, company] = weight

# Round all floating point values to 2 decimal places
output_df = output_df.round(2)

# Save the updated output to CSV
output_file_rounded = 'data/processed_student_data.csv'
output_df.to_csv(output_file_rounded, index=False)

output_file_rounded

'data/processed_student_data.csv'

In [6]:
output_df.head

<bound method NDFrame.head of        student name  Deep tech rank  Digital tech rank  Life sciences rank  \
0    Chang, Wei Lin            0.75               0.50                1.00   
1   McDougall, Dawn            1.00               0.75                0.25   
2     Akers, Travis            1.00               0.50                0.25   
3    Gradelski, Mia            0.50               1.00                0.25   
4     Lahanis, Niko            0.75               1.00                0.25   
5    Johnston, Maya            1.00               0.75                0.25   
6       Yin, Chunge            0.50               0.25                1.00   
7     Renshaw, Lena            0.75               1.00                0.25   
8       Hill, Sarah            0.50               1.00                0.25   
9         Weng, Shu            0.50               1.00                0.25   
10     Garg, Naresh            0.25               1.00                0.50   
11   Malhotra, Rhea            0.5