In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the raw lung cancer CSV into a DataFrame
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
file_path = r'C:\Users\Srinjoy RayChaudhuri\Hackathon work\lung cancer data set.csv.csv'
data = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Preview the leading rows of the raw frame to see its columns and sample values
initial_preview = data.head()
print("Initial data snapshot:\n", initial_preview)

In [None]:
# 1. Remove identifier columns
# 'index' and 'Patient Id' only label individual rows, so they carry no predictive
# signal and are left out of the working frame.
identifier_columns = ['index', 'Patient Id']
data = data.drop(columns=identifier_columns)

In [None]:
# 2. Handle missing values
# Numeric columns are imputed with their mean; every other column (text,
# categorical, ...) is imputed with its most frequent value.
for column in data.columns:
    series = data[column]
    if not series.isna().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(series):
        fill_value = series.mean()
    else:
        modes = series.mode()
        if modes.empty:
            continue  # column is entirely missing, so there is no mode to fill with
        fill_value = modes.iloc[0]

    # Assign the result back to the frame. Calling fillna(inplace=True) on a
    # selected column is chained assignment: it warns on recent pandas and leaves
    # the frame unchanged once Copy-on-Write is enabled.
    data[column] = series.fillna(fill_value)

In [None]:
# 3. Encode the target column
# 'Level' is the target variable; its text categories are replaced by the integer
# codes assigned by a LabelEncoder fitted on that column.
le = LabelEncoder()
encoded_level = le.fit_transform(data['Level'])
data['Level'] = encoded_level

# Show which integer code each original class label received, so the encoded
# values can be interpreted later
class_codes = le.transform(le.classes_)
level_mapping = dict(zip(le.classes_, class_codes))
print("Label Encoding Classes for 'Level':", level_mapping)


In [None]:
# 4. Standardize the numerical features
# Build the scaler with its default behavior spelled out: center each column on
# its mean and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Identify the numeric feature columns to scale.
# 'Level' holds the label-encoded target: it is an integer column, so a plain
# dtype selection would pick it up and replace the class codes with z-scores.
# It is therefore removed from the selection (errors='ignore' keeps this cell
# working if 'Level' is absent or was not selected by dtype).
numeric_features = (
    data.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('Level', errors='ignore')
)
# Fit the scaler on the feature columns and write the standardized values back
data[numeric_features] = scaler.fit_transform(data[numeric_features])

In [None]:
# 5. Verify the preprocessed data
# Preview the leading rows after dropping, imputing, encoding and scaling
processed_preview = data.head()
print("Preprocessed data snapshot:\n", processed_preview)

In [None]:
# 6. Save the preprocessed dataset for easy access in future work
# The output gets its own file name. Reusing the raw file's name
# ("lung cancer data set.csv.csv") would overwrite the raw input whenever the
# notebook runs from the data folder, and the next run would then start from
# already-preprocessed data.
output_path = "lung cancer data set preprocessed.csv"
data.to_csv(output_path, index=False)
print(f"Preprocessed dataset saved as '{output_path}'.")