# Feature Engineering

Code authored by: Shaw Talebi

### imports

In [1]:
import pandas as pd
from sklearn.cluster import KMeans

from utils import (
    load_clean_data,
    compute_openai_embedding, 
    summarize_all_responses,
    classify_all_company_sizes, 
    classify_all_job_categories, 
    create_category_boolean_columns, 
    greater_than_or_equal_x,
)

### students

In [2]:
# load the cleaned 'students' data; the cells below treat it as a pandas DataFrame
df_students = load_clean_data('students')

#### join reason

In [3]:
# Summarize every free-text answer in 'course_join_question'
# (the helper logs one "Summarizing response for: ..." line per answer).
col_name = 'course_join_question'
summarized_responses = summarize_all_responses(df_students, col_name)

# Place the summaries in a new column directly to the right of the raw answers.
summary_col = 'course_join_question_summarized'
insert_at = df_students.columns.get_loc(col_name) + 1
df_students.insert(insert_at, summary_col, summarized_responses[summary_col])

Summarizing response for:   Understand Gen AI usecases...
Summarizing response for: I would like to get practical experience in building and deploying ai applications. I have some grou...
Summarizing response for: New to the field, I will learn as much as I can so I can grow and buikd a career in AI...
Summarizing response for: I would like to start to have a grasp of how I can use AI to build products....
Summarizing response for: Learn more about AI customization. Building my own program using different tools . Learning also bas...
Summarizing response for: Two things: (1) Build AI applications to streamline my work, primarily for corporate finance / bizop...
Summarizing response for: Uplevel my knowledge on AI and ideally be able to build something using AI....
Summarizing response for: Ai app building intro...
Summarizing response for: Gain hands-on experience interfacing with LLMs, networking...
Summarizing response for: Being able to build end to end AI applications....
Summarizi

In [4]:
col_name = 'course_join_question_summarized'

# Embed the summarized answers.
# NOTE(review): the second argument is an empty string; its meaning is defined
# in utils.compute_openai_embedding — confirm.
embeddings = compute_openai_embedding(df_students[col_name], "")

# Group the embeddings into N clusters; random_state=0 fixes the seed.
N = 5
kmeans = KMeans(n_clusters=N, random_state=0, n_init="auto")
kmeans.fit(embeddings)

# Store cluster ids shifted by one, so they run 1..N instead of 0..N-1.
df_students['course_join_question_cluster'] = kmeans.labels_ + 1

# Derive boolean columns from the cluster id
# (presumably one per cluster — confirm in utils).
create_category_boolean_columns(df_students, 'course_join_question_cluster')

#### company size

In [5]:
# Ask the LLM helper (model 'gpt-4.1') to guess a size for each company.
result = classify_all_company_sizes(df_students, 'company', 'gpt-4.1')

# Each entry of the result becomes a new column, placed directly to the right
# of 'company' in the order the helper returned them.
company_col_position = df_students.columns.get_loc('company')
first_new_position = company_col_position + 1
for insert_at, (column_name, column_values) in enumerate(result.items(), start=first_new_position):
    df_students.insert(insert_at, column_name, column_values)

# add company size categories to df
# (presumably one boolean column per category — confirm in utils)
create_category_boolean_columns(df_students, 'company_size_category')

Loaded 98 existing company classifications


#### job title

In [6]:
# use an LLM to guess a job category from each job title
# (the company size category is passed along as well; judging by the log
# output it is used as context, e.g. "Data Scientist at smb")
result = classify_all_job_categories(df_students, 'job_title', 'company_size_category', 'gpt-4.1')

# Find the position of the 'job_title' column
job_title_col_position = df_students.columns.get_loc('job_title')

# Add each key from result dict as a new column right after the job_title column
for i, (key, values) in enumerate(result.items()):
    df_students.insert(
        job_title_col_position + 1 + i,  # Position to insert (after job_title column + previous result columns)
        key,                             # Column name
        values                           # Column values
    )

# add job title categories to df
create_category_boolean_columns(df_students, 'job_category')

Classifying job title for: Student at unknown...
Classifying job title for: Product Manager at unknown...
Classifying job title for: Intern - CS Undergrad at smb...
Classifying job title for: Data Analyst/Economist at enterprise...
Classifying job title for: VP Finance at enterprise...
Classifying job title for: Head of Product at smb...
Classifying job title for: Sr. Product Manager at enterprise...
Classifying job title for: Data Scientist at smb...
Classifying job title for: Program Manager at unknown...
Classifying job title for: Technical Project Manager at enterprise...
Classifying job title for: CIO at smb...
Classifying job title for: Solutions Engineer at enterprise...
Classifying job title for: Technology Analyst at enterprise...
Classifying job title for: Data Scientist at smb...
Classifying job title for: Data Analyst at enterprise...
Classifying job title for: Software Engineer at smb...
Classifying job title for: ML Engineer at enterprise...
Classifying job title for: Pro

#### source

In [7]:
# add source categories to df
# (same helper as used above for the cluster, company-size and job-category columns)
create_category_boolean_columns(df_students, 'source')

### activity

In [8]:
# load the cleaned 'activity' data (joined to the students data on 'name' further down)
df_activity = load_clean_data('activity')

In [9]:
# Flag different levels of engagement: every column is checked against every
# threshold (2 columns x 2 thresholds = 4 helper calls).
# NOTE(review): if the intent was one threshold per column (3 for projects,
# 1 for posts), the two lists should be zipped instead — confirm.
col_name_list = ["projects_submitted", "community_posts"]
x_list = [3, 1]

column_threshold_pairs = [(column, threshold) for column in col_name_list for threshold in x_list]
for col_name, x in column_threshold_pairs:
    greater_than_or_equal_x(df_activity, col_name, x)

### reviews

In [10]:
# load the cleaned 'reviews' data (joined on 'name' further down)
df_reviews = load_clean_data('reviews')

In [11]:
# create boolean column for 10/10 reviews (rating >= 10)
# NOTE(review): presumably this is what adds the 'rating_gte_10' column that is
# filled in after the joins below — confirm in utils
greater_than_or_equal_x(df_reviews, "rating", 10)

### join dataframes

In [12]:
# keep only rows whose 'name' appears in both the students and the activity data
df_students_activity = pd.merge(df_students, df_activity, on='name', how='inner')

In [13]:
# quick check of (rows, columns) after the inner join
df_students_activity.shape

(92, 41)

In [14]:
# attach the review data, matching on 'name'; how='outer' keeps rows found on
# either side, so rows without a matching review get missing review columns
df_combined = pd.merge(df_students_activity, df_reviews, on='name', how='outer')

In [15]:
# opt in to pandas' future behaviour so fillna does not silently downcast
pd.set_option('future.no_silent_downcasting', True)

# rows with no review after the outer join count as "not a 10/10"
rating_gte_10 = df_combined['rating_gte_10'].fillna(False)
df_combined['rating_gte_10'] = rating_gte_10.astype(bool)

# record which rows actually carry a rating
has_rating = df_combined["rating"].notna()
df_combined["rating_exists"] = has_rating

In [16]:
# save the combined students/activity/reviews table as CSV
# (index=False leaves the DataFrame index out of the file)
new_filename = 'data/2-clean/students_activity_reviews.csv'
df_combined.to_csv(new_filename, index=False)