Reading the csv file

In [1]:
import pandas as pd

In [2]:
df=pd.read_csv("Data/lung_cancer_mortality_data_large_v2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3250000 entries, 0 to 3249999
Data columns (total 18 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   id                           int64  
 1   age                          float64
 2   gender                       object 
 3   country                      object 
 4   diagnosis_date               object 
 5   cancer_stage                 object 
 6   beginning_of_treatment_date  object 
 7   family_history               object 
 8   smoking_status               object 
 9   bmi                          float64
 10  cholesterol_level            int64  
 11  hypertension                 int64  
 12  asthma                       int64  
 13  cirrhosis                    int64  
 14  other_cancer                 int64  
 15  treatment_type               object 
 16  end_treatment_date           object 
 17  survived                     int64  
dtypes: float64(2), int64(7), object(9)
memory 

In [3]:
import pandas as pd

# Load the first 100,000 rows from the CSV file
df = pd.read_csv("lung_cancer_mortality_data_large_v2.csv", nrows=100000)

# Identify categorical columns
categorical_columns = ['gender', 'country', 'cancer_stage', 'family_history', 
                       'smoking_status', 'treatment_type']

# Perform one-hot encoding on categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Check the result
df_encoded.info()

# Optionally, you can save the result to a new CSV file
#df_encoded.to_csv("lung_cancer_mortality_encoded.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 55 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   id                             100000 non-null  int64  
 1   age                            100000 non-null  float64
 2   diagnosis_date                 100000 non-null  object 
 3   beginning_of_treatment_date    100000 non-null  object 
 4   bmi                            100000 non-null  float64
 5   cholesterol_level              100000 non-null  int64  
 6   hypertension                   100000 non-null  int64  
 7   asthma                         100000 non-null  int64  
 8   cirrhosis                      100000 non-null  int64  
 9   other_cancer                   100000 non-null  int64  
 10  end_treatment_date             100000 non-null  object 
 11  survived                       100000 non-null  int64  
 12  gender_Female                  

Applying feature engineering

In [11]:
import pandas as pd
import featuretools as ft

# Step 1: Load the first 100,000 rows from the CSV file
df = pd.read_csv("lung_cancer_mortality_data_large_v2.csv", nrows=700000)

# Step 2: Identify categorical columns to perform one-hot encoding
categorical_columns = ['gender', 'country', 'cancer_stage', 'family_history', 
                       'smoking_status', 'treatment_type']

# Step 3: Perform one-hot encoding on the specified categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Step 4: Apply Featuretools for automated feature engineering

# Create an EntitySet for Featuretools
es = ft.EntitySet(id="lung_cancer_data")

# Add the dataframe to the EntitySet using `add_dataframe` method
es = es.add_dataframe(dataframe_name="mortality", dataframe=df_encoded, index="id")

# Step 5: Define compatible aggregation and transformation primitives to increase features

# Aggregation primitives
agg_primitives = ["count", "mean", "mode", "sum", "min", "max", "std", "median", 
                  "skew", "num_true"]

# Transformation primitives
trans_primitives = ["multiply_numeric", "divide_numeric", "time_since", "day", 
                    "month", "year", "add_numeric", "subtract_numeric", 
                    "absolute", "percentile", "is_weekend", "is_leap_year"]

# Run DFS to generate new features using the extended primitives and set max_depth for deeper features
feature_matrix, feature_defs = ft.dfs(entityset=es, 
                                      target_dataframe_name="mortality", 
                                      agg_primitives=agg_primitives,
                                      trans_primitives=trans_primitives,
                                      max_features=250,  # Limit to 250 features
                                      max_depth=2)       # Increase depth for more complex features

# Step 6: Ensure 'survived' is the target variable
# Remove 'survived' from the feature matrix to prevent data leakage during feature generation
if 'survived' in feature_matrix.columns:
    feature_matrix = feature_matrix.drop('survived', axis=1)

# Add the original 'survived' column back to the feature matrix
feature_matrix['survived'] = df_encoded['survived']

# Step 7: Save the final feature matrix to a CSV file
feature_matrix.to_csv("lung_cancer_mortality_featured.csv", index=False)

# Check the new feature matrix structure
print(feature_matrix.info())


  agg_primitives: ['count', 'max', 'mean', 'median', 'min', 'mode', 'num_true', 'skew', 'std', 'sum']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.


<class 'pandas.core.frame.DataFrame'>
Index: 700000 entries, 1 to 700000
Columns: 225 entries, age to survived
dtypes: bool(43), boolean(6), category(9), float64(161), int64(6)
memory usage: 942.2 MB
None
