## Cell 1 : Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


## Cell 2 : Load dataset

In [None]:
# Read the raw CSV into a DataFrame; the path is relative to the
# directory the notebook is executed from.
df = pd.read_csv(
    filepath_or_buffer="../data/lung_data_nov18_d070819.csv",
)


## Cell 3 : Select 15 variables

In [None]:
# The 15 columns kept for the cleaned dataset: 14 predictors followed by
# the target column.
selected_vars = [
    "age",
    "sex",
    "race7",
    "educat",
    "cig_stat",
    "pack_years",
    "cig_years",
    "cigpd_f",
    "cig_stop",
    "rsmoker_f",
    "emphys_f",
    "bronchit_f",
    "bmi_curr",
    "lung_fh",
    "lung_cancer",  # target
]

# Work on an independent copy so later assignments never touch the
# originally loaded frame.
df = df.loc[:, selected_vars].copy()

print("Initial shape:", df.shape)


## Cell 4 : Replace special missing codes

In [None]:
# Placeholder strings that stand for "no value" in the raw file.
# Matching is exact: only cells equal to one of these strings are replaced.
missing_codes = [
    ".M",
    ".N",
    ".U",
    ".NA",
    "NA",
    " ",
    "",
]
df = df.replace(to_replace=missing_codes, value=pd.NA)


## Cell 5 : Convert numeric columns

In [None]:
# Columns that are parsed as numbers.
numeric_cols = ["age", "pack_years", "cig_years", "cigpd_f", "cig_stop", "bmi_curr"]

# errors="coerce" turns any value that cannot be parsed into NaN instead
# of raising, so leftover non-numeric entries become missing values.
df[numeric_cols] = df[numeric_cols].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)


## Cell 6 : Convert categorical columns to category type

In [None]:
# Columns that are stored with the pandas "category" dtype.
cat_cols = [
    "sex",
    "race7",
    "educat",
    "cig_stat",
    "rsmoker_f",
    "emphys_f",
    "bronchit_f",
    "lung_fh",
]

# Cast every listed column in one call; each column gets its own set of
# categories derived from the values it contains.
df = df.astype({name: "category" for name in cat_cols})


## Cell 7 : Handle missing values

In [None]:
# NOTE(review): "lung_cancer" appears in neither numeric_cols nor cat_cols,
# so missing values in the target are not imputed here — confirm that is
# intended.

# Numeric missing → median (the median is computed over non-missing values)
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical missing → mode (most common)
for col in cat_cols:
    modes = df[col].mode()
    if modes.empty:
        # Every value in this column is missing, so there is no mode and
        # indexing the empty result would raise. Leave the column as it is
        # and report it instead of aborting the whole cleaning run.
        print("No mode available (all values missing), column left as-is:", col)
        continue
    # mode() can return several values on ties; take the first one.
    df[col] = df[col].fillna(modes.iloc[0])


## Cell 8 : One-hot encode categorical variables

In [None]:
# Expand each categorical column into indicator columns. drop_first=True
# omits the indicator for the first category of every column, so k
# categories produce k-1 indicator columns.
df = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

print("After encoding:", df.shape)


## Cell 9 : Save cleaned dataset

In [None]:
# Write the cleaned frame to the current working directory; index=False
# keeps the row index out of the file.
df.to_csv(
    path_or_buf="lung_15_variable_cleaned.csv",
    index=False,
)
print("Cleaned dataset saved as: lung_15_variable_cleaned.csv")
