In [2]:
import pandas as pd

In [3]:
# Load the raw layoffs table
layoffs_df = pd.read_csv('layoffs.csv')

# Report the size of the raw dataframe before any cleaning
print(f"Number of initial feature columns: {len(layoffs_df.columns)}")
print(f"Number of initial observations: {len(layoffs_df)}")

# Cast the text features to pandas' string dtype and parse the dates
text_features = ['company', 'location', 'industry', 'stage', 'country']
layoffs_df = layoffs_df.astype({feature: 'string' for feature in text_features})
layoffs_df['date'] = pd.to_datetime(layoffs_df['date'])

# Row filter: keep observations whose stage is not the literal 'Unknown'
stage_is_known = layoffs_df['stage'] != 'Unknown'

# In one pass:
#   1. apply the stage filter,
#   2. drop 'location' (implicitly covered by "country") and 'company'
#      (company names should not be important for our analysis),
#   3. remove every observation that still has at least one missing feature.
layoffs_df = (
    layoffs_df[stage_is_known]
    .drop(columns=['location', 'company'])
    .dropna()
)

# Chunking dates into monthly bins
# Each date is replaced by the number of calendar months between its
# (year, month) and the (year, month) of the earliest date in the data,
# so the earliest month maps to 0. The bins are one month wide, not quarters.
min_date = layoffs_df['date'].min()
years = layoffs_df['date'].dt.year
months = layoffs_df['date'].dt.month

year_offset = years.sub(min_date.year)
month_offset = months.sub(min_date.month)
chunks = year_offset.mul(12).add(month_offset)

# Overwrite the date column with its month offset
layoffs_df['date'] = chunks

# Chunking stages
# Collapse the fine-grained stage labels into four coarse groups.
mapping = {
    'Seed': 'Early Stage',
    'Series A': 'Early Stage',
    'Series B': 'Growth Stage',
    'Series C': 'Growth Stage',
    'Series D': 'Growth Stage',
    'Series E': 'Growth Stage',
    'Series F': 'Growth Stage',
    'Series G': 'Growth Stage',
    'Series H': 'Growth Stage',
    'Series I': 'Growth Stage',
    'Series J': 'Growth Stage',
    'Post-IPO': 'Post-IPO',
    'Private Equity': 'Post-IPO',
    'Acquired': 'Acquired / Subsidiary',
    'Subsidiary': 'Acquired / Subsidiary',
}
stage_groups = layoffs_df['stage'].map(mapping.get)

# Fail loudly on stage labels the mapping does not cover. `mapping.get`
# returns None for them, and because missing rows have already been removed
# by this point, such a row would stay in the data with a missing stage.
# One-hot encoding ignores missing values, so the row would end up with
# all-zero stage indicators and be indistinguishable from the reference
# (dropped-first) stage group.
unmapped = layoffs_df.loc[stage_groups.isna(), 'stage'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unmapped stage value(s): {list(unmapped)}; "
        "add them to `mapping` or filter them out before grouping"
    )

layoffs_df['stage'] = stage_groups

# One-hot encoding
# Expand the remaining categorical (text) columns into 0/1 integer indicator
# columns; drop_first=True leaves out the first level of each encoded column.
layoffs_df = pd.get_dummies(
    data=layoffs_df,
    drop_first=True,
    dtype=int,
)

# Report the size of the dataframe after cleaning and encoding
print(f"Number of feature columns after cleaning: {len(layoffs_df.columns)}")
print(f"Number of observations after cleaning: {len(layoffs_df)}")

# Save the cleaned table; index=False keeps the row index out of the file
layoffs_df.to_csv('cleaned_layoffs.csv', index=False)

Number of initial feature columns: 9
Number of initial observations: 3101
Number of feature columns after cleaning: 69
Number of observations after cleaning: 1182
