# FDA Recalls Data Preprocessing
**By Lorena Dorado & Parisa Kamizi**
- This notebook preprocesses the FDA recall data to create the final dataset for modeling

In [1]:
# Import Libraries
import re
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import TruncatedSVD

# NLP
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

True

# Data Cleaning

In [2]:
# Load the data
file_path = '../data/recalls_details.xlsx'
df = pd.read_excel(file_path)

# Display basic information
print(f"Original data shape: {df.shape}")
print("\nColumn names:")
print(df.columns.tolist())

Original data shape: (95082, 17)

Column names:
['FEI Number', 'Recalling Firm Name', 'Product Type', 'Product Classification', 'Status', 'Distribution Pattern', 'Recalling Firm City', 'Recalling Firm State', 'Recalling Firm Country', 'Center Classification Date', 'Reason for Recall', 'Product Description', 'Event ID', 'Event Classification', 'Product ID', 'Center', 'Recall Details']


In [3]:
# Drop ID variables
drop_ID_cols = [
    # ID variables
    "FEI Number", "Event ID", "Product ID", "Recall Details"]

df = df.drop(columns=drop_ID_cols)

In [4]:
# Handle missing values in Distribution Pattern
df['Distribution Pattern'] = df['Distribution Pattern'].fillna('Unknown')

In [5]:
# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
Recalling Firm Name           0
Product Type                  0
Product Classification        0
Status                        0
Distribution Pattern          0
Recalling Firm City           0
Recalling Firm State          0
Recalling Firm Country        0
Center Classification Date    0
Reason for Recall             0
Product Description           0
Event Classification          0
Center                        0
dtype: int64


In [6]:
# Display cleaned data information
print(f"Cleaned data shape: {df.shape}")
print("\nColumn names:")
print(df.columns.tolist())

Cleaned data shape: (95082, 13)

Column names:
['Recalling Firm Name', 'Product Type', 'Product Classification', 'Status', 'Distribution Pattern', 'Recalling Firm City', 'Recalling Firm State', 'Recalling Firm Country', 'Center Classification Date', 'Reason for Recall', 'Product Description', 'Event Classification', 'Center']


# Data Preparation and Train-Test Split

In [7]:
# Define the target variable
X = df.drop(columns=['Event Classification'])
y = df['Event Classification']

In [8]:
# Create a stratified train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"\nTraining set class distribution: {y_train.value_counts(normalize=True)}")
print(f"\nTest set class distribution: {y_test.value_counts(normalize=True)}")

Training set shape: (76065, 12)
Test set shape: (19017, 12)

Training set class distribution: Event Classification
Class II     0.708065
Class I      0.211516
Class III    0.080418
Name: proportion, dtype: float64

Test set class distribution: Event Classification
Class II     0.708103
Class I      0.211495
Class III    0.080402
Name: proportion, dtype: float64


# Feature Engineering on Training Data

In [9]:
# Create a copy to work with for the training data
X_train_processed = X_train.copy()

#### Feature engineering for 'Center Classification Date'

In [10]:
# Convert to datetime
X_train_processed['Center Classification Date'] = pd.to_datetime(X_train_processed['Center Classification Date'], errors='coerce')

# Extract temporal features
X_train_processed['Classification Year'] = X_train_processed['Center Classification Date'].dt.year
X_train_processed['Classification Month'] = X_train_processed['Center Classification Date'].dt.month
X_train_processed['Classification Day'] = X_train_processed['Center Classification Date'].dt.day
X_train_processed['Classification DayOfWeek'] = X_train_processed['Center Classification Date'].dt.dayofweek

# Add cyclical encoding for month, day, and day of week
X_train_processed['Month_sin'] = np.sin(2 * np.pi * X_train_processed['Classification Month']/12)
X_train_processed['Month_cos'] = np.cos(2 * np.pi * X_train_processed['Classification Month']/12)
X_train_processed['Day_sin'] = np.sin(2 * np.pi * X_train_processed['Classification Day']/31)
X_train_processed['Day_cos'] = np.cos(2 * np.pi * X_train_processed['Classification Day']/31)
X_train_processed['DayOfWeek_sin'] = np.sin(2 * np.pi * X_train_processed['Classification DayOfWeek']/7)
X_train_processed['DayOfWeek_cos'] = np.cos(2 * np.pi * X_train_processed['Classification DayOfWeek']/7)

# Standardize year (continuous variable)
min_year = X_train_processed['Classification Year'].min()
X_train_processed['Years_Since_First'] = X_train_processed['Classification Year'] - min_year

#### Feature engineering for 'Recalling Firm Name'

In [11]:
# Extract business structure from company name
def extract_business_structure(firm_name):
    """Extract business structure from company name"""
    if not isinstance(firm_name, str):
        return 'Unknown'
    
    # Convert to uppercase for consistent pattern matching
    name = firm_name.upper()
    
    # Define patterns to search for, in order of specificity
    if any(x in name for x in [' LLC', ', LLC', ' L.L.C.', ', L.L.C.', 'LIMITED LIABILITY COMPANY']):
        return 'LLC'
    elif any(x in name for x in [' LP', ', LP', ' L.P.', ', L.P.', 'LIMITED PARTNERSHIP']):
        return 'LP'
    elif any(x in name for x in [' LLP', ', LLP', ' L.L.P.', ', L.L.P.', 'LIMITED LIABILITY PARTNERSHIP']):
        return 'LLP'
    elif any(x in name for x in [' CORP', ', CORP', ' CORPORATION', ', CORPORATION', ' CORP.', ', CORP.']):
        return 'Corporation'
    elif any(x in name for x in [' INC', ', INC', ' INC.', ', INC.', 'INCORPORATED']):
        return 'Inc'
    elif any(x in name for x in [' CO', ', CO', ' CO.', ', CO.', 'COMPANY']):
        return 'Company'
    elif any(x in name for x in ['ASSOCIATION', 'ASSOC.', 'ASSN']):
        return 'Association'
    elif any(x in name for x in ['PARTNERS', 'PARTNERSHIP']):
        return 'Partnership'
    elif any(x in name for x in ['FOUNDATION', 'NON-PROFIT', 'NONPROFIT', 'CHARITY']):
        return 'Non-Profit'
    elif any(x in name for x in [' PLC', ', PLC', 'PUBLIC LIMITED COMPANY']):
        return 'PLC'
    elif any(x in name for x in [' AG', ', AG', 'AKTIENGESELLSCHAFT']):
        return 'AG'
    elif any(x in name for x in [' SA', ', SA', 'SOCIEDAD ANONIMA']):
        return 'SA'
    elif any(x in name for x in [' GmbH', ', GmbH']):
        return 'GmbH'
    elif any(x in name for x in [' LTD', ', LTD', ' LIMITED', ', LIMITED']):
        return 'Ltd'
    else:
        # If no business structure indicators found, sole proprietorship or classify as unknown
        return 'Other'

In [12]:
# Create new column with business structure
X_train_processed['Business_Structure'] = X_train_processed['Recalling Firm Name'].apply(extract_business_structure)

#### Feature engineering for 'Recalling Firm Country'

In [13]:
# Create binary US indicator
X_train_processed['Is_US'] = (X_train_processed['Recalling Firm Country'] == 'United States').astype(int)

#### Create encoders during preprocessing of training data

In [14]:
# Create dictionary
categorical_encoders = {}
categorical_features = ['Product Classification', 'Product Type', 'Status', 'Business_Structure']

# Create and fit encoders on training data
for feature in categorical_features:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
    encoder.fit(X_train_processed[[feature]])
    categorical_encoders[feature] = encoder
    
    # Transform the training data
    feature_array = encoder.transform(X_train_processed[[feature]])
    feature_names = [f"{feature.replace(' ', '')}_{cat}" for cat in encoder.categories_[0][1:]]
    
    # Create DataFrame with proper column names and index
    encoded_df = pd.DataFrame(
        feature_array, 
        columns=feature_names,
        index=X_train_processed.index
    )
    
    # Concatenate with the processed dataframe
    X_train_processed = pd.concat([X_train_processed, encoded_df], axis=1)

#### Clean and Normalize Text Data

In [15]:
# Define text cleaning function
def text_cleaner(text):
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase 
    text = str(text).lower()
    
    # Remove control characters and encoding artifacts
    text = re.sub(r'\*x[0-9a-f]{4}\*', ' ', text)
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', ' ', text)
    
    # Remove special characters but preserve important regulatory codes
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # Remove numbers but keep important regulatory codes intact
    text = re.sub(r'\b\d+\b(?!\s*cfr|\s*usc|\s*fda)', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Normalize common terms
    text = re.sub(r'\b(upc|sku|item|code|number)s?\b', 'product_code', text)
    text = re.sub(r'\b(possible|potential|may|might|could)\b', 'potential', text)
    
    # Group measurement units
    text = re.sub(r'\b\d+\s*(oz|ml|mg|g)\b', 'quantity_measure', text)
    
    # Remove FDA-specific stopwords in addition to standard ones
    stop_words = set(stopwords.words('english') + 
                    stopwords.words('spanish') + 
                    stopwords.words('french') + 
                    stopwords.words('german'))
    
    fda_stopwords = {'recalled', 'recalling', 'firm', 'product', 'products', 'recall', 
                     'various', 'due', 'manufactured'}
    all_stopwords = stop_words.union(fda_stopwords)
    
    tokens = text.split()
    filtered_tokens = [w for w in tokens if w not in all_stopwords]
    
    return ' '.join(filtered_tokens)

#### Text Feature Engineering

In [16]:
# Process the text data in training set
X_train_text = X_train.copy()  # This should have the same index as X_train
X_train_text['reason_cleaned'] = X_train_text['Reason for Recall'].apply(text_cleaner)
X_train_text['description_cleaned'] = X_train_text['Product Description'].apply(text_cleaner)
X_train_text['combined_text'] = X_train_text['reason_cleaned'] + ' ' + X_train_text['description_cleaned']

In [17]:
# Create TF-IDF features with n-grams (fit on training data)
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    min_df=5,                   # Ignore terms that appear in fewer than 5 documents
    max_df=0.7,                 # Ignore terms that appear in more than 70% of documents
    ngram_range=(1, 2),         # Include unigrams and bigrams
    use_idf=True,
    sublinear_tf=True           # Apply sublinear tf scaling (1 + log(tf))
)

In [18]:
# Fit and transform on training data
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train_text['combined_text'])

In [19]:
# Get the feature names for interpretability
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Number of TF-IDF features created: {len(feature_names)}")

Number of TF-IDF features created: 5000


In [20]:
# Apply dimensionality reduction (fit on training data)
n_components = min(300, tfidf_matrix_train.shape[1] - 1)
svd = TruncatedSVD(n_components=n_components, random_state=42)
tfidf_svd_train = svd.fit_transform(tfidf_matrix_train)

In [21]:
# Calculate explained variance
explained_var = svd.explained_variance_ratio_.sum()
print(f"Explained variance from {n_components} SVD components: {explained_var:.2f}")

Explained variance from 300 SVD components: 0.48


In [22]:
# Create DataFrame with the reduced features with X_train index
train_text_features_df = pd.DataFrame(
    tfidf_svd_train, 
    columns=[f'text_svd_{i}' for i in range(n_components)],
    index=X_train.index  # X_train index
)

# Create pre-processed TRAINING dataset for modeling

In [23]:
# FINAL CLEANUP - Drop all original columns now that feature engineering is complete
original_cols_to_drop = [
    # Original columns that have been transformed
    "Center", "Center Classification Date",
    "Recalling Firm Name", "Distribution Pattern",
    "Recalling Firm City", "Recalling Firm State", "Recalling Firm Country",
    "Product Type", "Status", "Business_Structure",
    # Text columns that have been processed
    "Reason for Recall", "Product Description",
    # Original categorical column now fully dummy encoded
    "Product Classification",
    # Intermediate columns
    "Classification Year", "Classification Month", "Classification Day", "Classification DayOfWeek"
]


In [24]:
# Filter to only include existing columns
cols_to_drop = [col for col in original_cols_to_drop if col in X_train_processed.columns]

In [25]:
# Drop columns
X_train_baseline = X_train_processed.drop(columns=cols_to_drop)

In [26]:
# Add the text features to the processed dataframe - use index-based joining
X_train_final = pd.concat([X_train_baseline, train_text_features_df], axis=1)
print(f"Training dataset shape: {X_train_final.shape}")

Training dataset shape: (76065, 329)


#### Process the TEST set using the same transformations learned from TRAINING

In [27]:
# Process the test data using the same steps and parameters learned from training
X_test_processed = X_test.copy()

In [28]:
# Convert to datetime
X_test_processed['Center Classification Date'] = pd.to_datetime(X_test_processed['Center Classification Date'], errors='coerce')

# Extract temporal features
X_test_processed['Classification Year'] = X_test_processed['Center Classification Date'].dt.year
X_test_processed['Classification Month'] = X_test_processed['Center Classification Date'].dt.month
X_test_processed['Classification Day'] = X_test_processed['Center Classification Date'].dt.day
X_test_processed['Classification DayOfWeek'] = X_test_processed['Center Classification Date'].dt.dayofweek

# Add cyclical encoding
X_test_processed['Month_sin'] = np.sin(2 * np.pi * X_test_processed['Classification Month']/12)
X_test_processed['Month_cos'] = np.cos(2 * np.pi * X_test_processed['Classification Month']/12)
X_test_processed['Day_sin'] = np.sin(2 * np.pi * X_test_processed['Classification Day']/31)
X_test_processed['Day_cos'] = np.cos(2 * np.pi * X_test_processed['Classification Day']/31)
X_test_processed['DayOfWeek_sin'] = np.sin(2 * np.pi * X_test_processed['Classification DayOfWeek']/7)
X_test_processed['DayOfWeek_cos'] = np.cos(2 * np.pi * X_test_processed['Classification DayOfWeek']/7)

# Standardize year (continuous variable)
# Use the same reference year from training
X_test_processed['Years_Since_First'] = X_test_processed['Classification Year'] - min_year

In [29]:
# Extract business structure for test set
X_test_processed['Business_Structure'] = X_test_processed['Recalling Firm Name'].apply(extract_business_structure)

In [30]:
# Country indicator for test set
X_test_processed['Is_US'] = (X_test_processed['Recalling Firm Country'] == 'United States').astype(int)

In [31]:
# Use encoders previously fit on training data
for feature in categorical_features:
    if feature in X_test_processed.columns:
        # Transform using the encoder fit on training data
        encoder = categorical_encoders[feature]
        feature_array = encoder.transform(X_test_processed[[feature]])
        
        # Get the same feature names used in training
        feature_names = [f"{feature.replace(' ', '')}_{cat}" for cat in encoder.categories_[0][1:]]
        
        # Create DataFrame with proper column names and index
        encoded_df = pd.DataFrame(
            feature_array, 
            columns=feature_names,
            index=X_test_processed.index
        )
        
        # Concatenate with the processed dataframe
        X_test_processed = pd.concat([X_test_processed, encoded_df], axis=1)

In [32]:
# Text features - process using the same vectorizer and SVD fit on training
X_test_text = X_test.copy()
X_test_text['reason_cleaned'] = X_test_text['Reason for Recall'].apply(text_cleaner)
X_test_text['description_cleaned'] = X_test_text['Product Description'].apply(text_cleaner)
X_test_text['combined_text'] = X_test_text['reason_cleaned'] + ' ' + X_test_text['description_cleaned']

In [33]:
# Transform using the vectorizer fit on training data
tfidf_matrix_test = tfidf_vectorizer.transform(X_test_text['combined_text'])

In [34]:
# Apply the SVD transformation fit on training data
tfidf_svd_test = svd.transform(tfidf_matrix_test)
print(f"X_test_text shape: {X_test_text.shape}, tfidf_matrix_test shape: {tfidf_matrix_test.shape}")

X_test_text shape: (19017, 15), tfidf_matrix_test shape: (19017, 5000)


In [35]:
# Create DataFrame with the reduced features with X_test index
test_text_features_df = pd.DataFrame(
    tfidf_svd_test, 
    columns=[f'text_svd_{i}' for i in range(n_components)],
    index=X_test.index  # X_test index
)

In [36]:
# Cleanup original columns (same as test set)
X_test_baseline = X_test_processed.drop(columns=cols_to_drop)

In [37]:
# Combine structured and text features with index-based joining
X_test_final = pd.concat([X_test_baseline, test_text_features_df], axis=1)

In [38]:
# Check that test has same columns as train
missing_cols = set(X_train_final.columns) - set(X_test_final.columns)
for col in missing_cols:
    X_test_final[col] = 0

In [39]:
# Check columns are in the same order
X_test_final = X_test_final[X_train_final.columns]
print(f"Test dataset shape: {X_test_final.shape}")

Test dataset shape: (19017, 329)


In [40]:
# Check both datasets have the same columns in the same order
assert all(X_train_final.columns == X_test_final.columns), "Column mismatch between train and test"
print("Training and test columns match")

Training and test columns match


# Save the processed datasets

Create BASELINE datasets (without text features)

In [41]:
# Add the target variable to the baseline training dataset
train_baseline = X_train_baseline.copy()
train_baseline['Event Classification'] = y_train.values

In [42]:
# Add the target variable to the baseline test dataset
test_baseline = X_test_baseline.copy()
test_baseline['Event Classification'] = y_test.values

print(f"Baseline Training set shape: {train_baseline.shape}")
print(f"Baseline Test set shape: {test_baseline.shape}")
print("\n Baseline Column names:")
print(test_baseline.columns.tolist())

Baseline Training set shape: (76065, 30)
Baseline Test set shape: (19017, 30)

 Baseline Column names:
['Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Years_Since_First', 'Is_US', 'ProductClassification_Class II', 'ProductClassification_Class III', 'ProductType_Devices', 'ProductType_Drugs', 'ProductType_Food/Cosmetics', 'ProductType_Tobacco', 'ProductType_Veterinary', 'Status_Ongoing', 'Status_Terminated', 'Business_Structure_Association', 'Business_Structure_Company', 'Business_Structure_Corporation', 'Business_Structure_Inc', 'Business_Structure_LLC', 'Business_Structure_LLP', 'Business_Structure_LP', 'Business_Structure_Ltd', 'Business_Structure_Non-Profit', 'Business_Structure_Other', 'Business_Structure_PLC', 'Business_Structure_SA', 'Event Classification']


Create HYBRID datasets (with text features)

In [43]:
# Add the target variable to the hybrid training dataset
train_hybrid = X_train_final.copy()
train_hybrid['Event Classification'] = y_train.values

In [44]:
# Add the target variable to the hybrid test dataset
test_hybrid = X_test_final.copy()
test_hybrid['Event Classification'] = y_test.values

print(f"Hybrid Training set shape: {train_hybrid.shape}")
print(f"Hybrid Test set shape: {test_hybrid.shape}")
print("\n Hybrid Column names:")
print(test_hybrid.columns.tolist())

Hybrid Training set shape: (76065, 330)
Hybrid Test set shape: (19017, 330)

 Hybrid Column names:
['Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'DayOfWeek_sin', 'DayOfWeek_cos', 'Years_Since_First', 'Is_US', 'ProductClassification_Class II', 'ProductClassification_Class III', 'ProductType_Devices', 'ProductType_Drugs', 'ProductType_Food/Cosmetics', 'ProductType_Tobacco', 'ProductType_Veterinary', 'Status_Ongoing', 'Status_Terminated', 'Business_Structure_Association', 'Business_Structure_Company', 'Business_Structure_Corporation', 'Business_Structure_Inc', 'Business_Structure_LLC', 'Business_Structure_LLP', 'Business_Structure_LP', 'Business_Structure_Ltd', 'Business_Structure_Non-Profit', 'Business_Structure_Other', 'Business_Structure_PLC', 'Business_Structure_SA', 'text_svd_0', 'text_svd_1', 'text_svd_2', 'text_svd_3', 'text_svd_4', 'text_svd_5', 'text_svd_6', 'text_svd_7', 'text_svd_8', 'text_svd_9', 'text_svd_10', 'text_svd_11', 'text_svd_12', 'text_svd_13', 'text_svd_14', 'te

In [45]:
# Baseline datasets
train_baseline.to_csv('../data/train_baseline.csv', index=False)
test_baseline.to_csv('../data/test_baseline.csv', index=False)

In [46]:
# Hybrid datasets
train_hybrid.to_csv('../data/train_hybrid.csv', index=False)
test_hybrid.to_csv('../data/test_hybrid.csv', index=False)