In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

Reading the data and separating it into the feature matrix and the classification target

In [3]:
# Load the raw table from disk
df = pd.read_csv('data.csv', low_memory=False)

# The label column is the target; it and the patient identifier are
# both excluded from the feature matrix
y = df['cancer_type']
X = df.drop(columns=['cancer_type', 'patient_id'])

Converting string columns into integer category codes (label encoding, not one-hot)

In [5]:
def encode_categorical_cols(cols, df=None):
    """Integer-encode the given columns and return the result as a new frame.

    For every column in ``cols`` missing values are replaced with the string
    "missing", and each distinct value is then mapped to an integer code in
    order of first appearance (first value seen -> 0, next new value -> 1, ...).

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to encode.
    df : pandas.DataFrame, optional
        Frame to encode. When omitted, the notebook-level ``X`` is looked up
        at call time (not at definition time, so a later reassignment of ``X``
        is respected).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns replaced by their integer
        codes. The input frame is left untouched, so re-running the calling
        cell does not encode already-encoded data.
    """
    if df is None:
        df = X  # notebook-level feature frame, resolved when the function is called

    encoded = df.copy()
    for col in cols:
        # Give missing entries their own category instead of the -1 sentinel
        filled = encoded[col].fillna("missing")
        # categories=unique() fixes the code order to order of first appearance
        encoded[col] = pd.Categorical(filled, categories=filled.unique()).codes

    return encoded

# Every feature column stored with the generic `object` dtype is treated as nominal
nominal_cols = [name for name in X.columns if X[name].dtype == 'object']

# Integer-encode the nominal columns, then zero-fill whatever is still missing
X = encode_categorical_cols(nominal_cols).fillna(0)

# Map the string class labels onto integer ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

Splitting the features into train, validation, and test datasets. Then oversampling minority classes and normalizing data.

In [8]:
# Hold out 20% of the rows as the test set, stratified on the label
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y, shuffle=True
)
# 25% of the remaining 80% -> 60% train / 20% validation overall
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp, shuffle=True
)

# Normalizing numerical features.
# The scaler is fitted on the training split only and then applied to the
# validation and test splits, so no held-out statistics leak into training.
# Scaling X itself after the split would leave the splits unnormalized.
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
# Work on explicit copies so the assignments below never write into X
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print('Distribution in y_train:', np.bincount(y_train))
# SMOTE interpolates between nearest neighbours, so it runs after scaling;
# the seed makes the synthetic samples reproducible. Only the training
# split is oversampled.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train) # Oversamples minority classes

print('Distribution in y_train:', np.bincount(y_train))
print('Distribution in y_val:', np.bincount(y_val))
print('Distribution in y_test:', np.bincount(y_test))

Distribution in y_train: [719  68  11  99]
Distribution in y_train: [719 719 719 719]
Distribution in y_val: [240  23   3  33]
Distribution in y_test: [240  23   4  33]
