In [139]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [140]:
# Load the lung cancer dataset from the CSV file (path is relative to the
# notebook's working directory) and list the dtype pandas inferred per column.
df=pd.read_csv("lung cancer data.csv")
print(df.dtypes)

GENDER                   object
AGE                       int64
SMOKING                   int64
YELLOW_FINGERS            int64
ANXIETY                   int64
PEER_PRESSURE             int64
CHRONIC DISEASE           int64
FATIGUE                   int64
ALLERGY                   int64
WHEEZING                  int64
ALCOHOL CONSUMING         int64
COUGHING                  int64
SHORTNESS OF BREATH       int64
SWALLOWING DIFFICULTY     int64
CHEST PAIN                int64
LUNG_CANCER              object
dtype: object


In [141]:
# Store the two text columns with pandas' dedicated 'string' dtype
# instead of the generic object dtype they were read with.
for text_col in ('GENDER', 'LUNG_CANCER'):
    df[text_col] = df[text_col].astype('string')


In [125]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    string
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [126]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [127]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
count,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0
mean,62.673139,1.563107,1.569579,1.498382,1.501618,1.504854,1.673139,1.556634,1.556634,1.556634,1.579288,1.640777,1.469256,1.556634
std,8.210301,0.496806,0.495938,0.500808,0.500808,0.500787,0.469827,0.497588,0.497588,0.497588,0.494474,0.480551,0.499863,0.497588
min,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,57.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,62.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
75%,69.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,87.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [128]:
### DATA CLEANING

# Remove whitespace from column names
df.columns = df.columns.str.strip()

# Clean the 'GENDER' and 'LUNG_CANCER' text columns
df['GENDER'] = df['GENDER'].str.strip()
df['LUNG_CANCER'] = df['LUNG_CANCER'].str.strip()

# Recode the 1/2 indicator columns to 0/1.
# The replacement is limited to the indicator columns on purpose: applying it
# to the whole frame would also rewrite an AGE of 1 or 2.
indicator_cols = df.columns.difference(['GENDER', 'AGE', 'LUNG_CANCER'])
df[indicator_cols] = df[indicator_cols].replace({2: 1, 1: 0})

# Map categorical values to binary for modeling
df['LUNG_CANCER'] = df['LUNG_CANCER'].map({'YES': 1, 'NO': 0})
df['GENDER'] = df['GENDER'].map({'M': 1, 'F': 0})

df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,0,1,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,0,1,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,1,0,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,0,0,1,0,1,1,0,0,0


In [129]:
# Checking for Missing Values and Duplicates

# A notebook cell only echoes its last expression, so the two checks are
# printed explicitly; otherwise their results are silently discarded.
print("Missing values per column:")
print(df.isna().sum())
print("Duplicate rows:", df.duplicated().sum())

# Remove exact duplicate rows (keeps the first occurrence of each)
df.drop_duplicates(inplace=True)

In [130]:
# Class Balance Check
# NOTE(review): the check below is commented out, so this cell currently
# produces no output. Uncomment the two lines to print the relative
# frequency of each LUNG_CANCER class.

#print("\nClass balance:")
#print(df['LUNG_CANCER'].value_counts(normalize=True))

In [131]:
# Pairwise correlation matrix of all columns (features and target).
df.corr()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
GENDER,1.0,-0.01312,0.041131,-0.202506,-0.152032,-0.261427,-0.189925,-0.07902,0.150174,0.121047,0.434264,0.120228,-0.052893,-0.048959,0.361547,0.053666
AGE,-0.01312,1.0,-0.07341,0.025773,0.050605,0.037848,-0.003431,0.021606,0.037139,0.052803,0.052049,0.168654,-0.009189,0.003199,-0.035806,0.106305
SMOKING,0.041131,-0.07341,1.0,-0.020799,0.153389,-0.030364,-0.149415,-0.037803,-0.030179,-0.147081,-0.052771,-0.138553,0.051761,0.042152,0.106984,0.034878
YELLOW_FINGERS,-0.202506,0.025773,-0.020799,1.0,0.558344,0.313067,0.015316,-0.099644,-0.14713,-0.058756,-0.273643,0.020803,-0.109959,0.333349,-0.099169,0.189192
ANXIETY,-0.152032,0.050605,0.153389,0.558344,1.0,0.210278,-0.006938,-0.181474,-0.159451,-0.174009,-0.152228,-0.218843,-0.155678,0.47882,-0.123182,0.144322
PEER_PRESSURE,-0.261427,0.037848,-0.030364,0.313067,0.210278,1.0,0.042893,0.094661,-0.066887,-0.037769,-0.132603,-0.068224,-0.214115,0.327764,-0.074655,0.195086
CHRONIC DISEASE,-0.189925,-0.003431,-0.149415,0.015316,-0.006938,0.042893,1.0,-0.099411,0.134309,-0.040546,0.010144,-0.160813,-0.01176,0.068263,-0.048895,0.143692
FATIGUE,-0.07902,0.021606,-0.037803,-0.099644,-0.181474,0.094661,-0.099411,1.0,-0.001841,0.152151,-0.181573,0.148538,0.407027,-0.115727,0.013757,0.160078
ALLERGY,0.150174,0.037139,-0.030179,-0.14713,-0.159451,-0.066887,0.134309,-0.001841,1.0,0.166517,0.378125,0.206367,-0.01803,-0.037581,0.24544,0.333552
WHEEZING,0.121047,0.052803,-0.147081,-0.058756,-0.174009,-0.037769,-0.040546,0.152151,0.166517,1.0,0.261061,0.353657,0.042289,0.108304,0.142846,0.249054


In [132]:
# Calculate Correlation with Target Variable

# Pairwise correlations between all columns
corr_matrix = df.corr()

# Correlation of every feature with 'LUNG_CANCER' (the target's own entry is
# dropped), sorted from lowest to highest and turned back into a two-column frame
target_correlation = (
    corr_matrix['LUNG_CANCER']
    .drop(labels='LUNG_CANCER')
    .sort_values()
    .reset_index()
)

# Bar chart of the per-feature correlation with the target
fig = px.bar(
    target_correlation,
    x='index',
    y='LUNG_CANCER',
    title='Feature Correlations with Target Variable',
)

# Readable axis titles
fig.update_layout(yaxis_title='Correlation Coefficient', xaxis_title='Attributes')
fig.show()


In [133]:
# Plot Age Distribution

# Histogram of AGE with the bin counts written on the bars
hist_fig = px.histogram(
    df,
    x='AGE',
    opacity=0.7,
    text_auto=True,
    title='Age Distribution',
)

# Thin blue outline around each bar so adjacent bins are distinguishable
hist_fig.update_traces(marker={'line': {'color': 'blue', 'width': 1}})
hist_fig.show()


In [134]:
# Scatter Plot of Age vs. Coughing

# One point per row: AGE on x, the COUGHING indicator on y, coloured by LUNG_CANCER
scatter_fig = px.scatter(
    df,
    x='AGE',
    y='COUGHING',
    color='LUNG_CANCER',
    title='Age vs. Coughing with Lung Cancer Diagnosis',
)

# Axis titles and colour bar caption
scatter_fig.update_layout(
    xaxis_title='AGE',
    yaxis_title='Coughing Indicator',
    coloraxis_colorbar={'title': "Lung Cancer Diagnosis"},
)
scatter_fig.show()


In [135]:
### Train-Test Split, Feature Scaling, Resampling and Feature Selection

# Target column and the remaining columns as predictors
y = df['LUNG_CANCER']
X = df.drop(columns='LUNG_CANCER')

# Hold out 30% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=52, stratify=y
)

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample the training split with SMOTE (the test split is left untouched)
smote = SMOTE(random_state=52)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Recursive Feature Elimination with a logistic regression estimator,
# removing one feature per step until 10 remain
rfe_selector = RFE(
    estimator=LogisticRegression(), n_features_to_select=10, step=1
).fit(X_train_resampled, y_train_resampled)
selected_features = X.columns[rfe_selector.support_]

# Display selected features
print("\nSelected features:", selected_features)

# Keep only the selected columns in both splits
X_train_selected = X_train_resampled[:, rfe_selector.support_]
X_test_selected = X_test_scaled[:, rfe_selector.support_]




Selected features: Index(['ANXIETY', 'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE', 'ALLERGY',
       'WHEEZING', 'ALCOHOL CONSUMING', 'COUGHING', 'SWALLOWING DIFFICULTY',
       'CHEST PAIN'],
      dtype='object')


In [136]:
## Model Training and Evaluation

# Candidate classifiers, all created with the same random_state
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=52)),
    ('Random Forest', RandomForestClassifier(random_state=52)),
    ('SVM', SVC(random_state=52)),
    ('XGBoost', XGBClassifier(random_state=52)),
])

# Fit each model on the resampled training data and report test-set metrics.
# `name`, `model` and `y_pred` remain bound to the last model after the loop.
for name in models:
    model = models[name]
    y_pred = model.fit(X_train_selected, y_train_resampled).predict(X_test_selected)
    print(f"\n{name} Results:")
    print(classification_report(y_test, y_pred))





Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.69      0.82      0.75        11
           1       0.97      0.94      0.96        72

    accuracy                           0.93        83
   macro avg       0.83      0.88      0.85        83
weighted avg       0.93      0.93      0.93        83


Random Forest Results:
              precision    recall  f1-score   support

           0       0.78      0.64      0.70        11
           1       0.95      0.97      0.96        72

    accuracy                           0.93        83
   macro avg       0.86      0.80      0.83        83
weighted avg       0.92      0.93      0.92        83


SVM Results:
              precision    recall  f1-score   support

           0       0.78      0.64      0.70        11
           1       0.95      0.97      0.96        72

    accuracy                           0.93        83
   macro avg       0.86      0.80      0.83        83
weight

In [137]:
# Visualize the confusion matrix for Random Forest model

# After the training loop, `name` and `y_pred` hold the values from its final
# iteration (XGBoost), so the Random Forest model is selected explicitly and
# its predictions are recomputed from the already-fitted estimator.
rf_name = 'Random Forest'
rf_pred = models[rf_name].predict(X_test_selected)

cm = confusion_matrix(y_test, rf_pred)
fig = px.imshow(cm, text_auto=True, aspect='auto', color_continuous_scale='Blues')
fig.update_layout(title_text=f'Confusion Matrix - {rf_name}', xaxis_title='Predicted', yaxis_title='Actual')
fig.show()


In [138]:
# Prepare features and target variable
target = df['LUNG_CANCER']
features = df.drop(columns='LUNG_CANCER')

# 70/30 train/test split, stratified on the target
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.3, random_state=52, stratify=target
)

# Standardise features; the scaler is fitted on the training split only
scaler = StandardScaler()
features_train_scaled = scaler.fit_transform(features_train)
features_test_scaled = scaler.transform(features_test)

# Oversample the training split with SMOTE
smote_handler = SMOTE(random_state=52)
features_train_resampled, target_train_resampled = smote_handler.fit_resample(
    features_train_scaled, target_train
)

# Recursive Feature Elimination down to 10 features
feature_selector = RFE(estimator=LogisticRegression(), n_features_to_select=10)
feature_selector.fit(features_train_resampled, target_train_resampled)
selected_features = features.columns[feature_selector.support_]

# Keep only the selected columns in both splits
features_train_selected = features_train_resampled[:, feature_selector.support_]
features_test_selected = features_test_scaled[:, feature_selector.support_]

# Classifiers to compare, all created with the same random_state
model_dict = {
    'Logistic Regression': LogisticRegression(random_state=52),
    'Random Forest': RandomForestClassifier(random_state=52),
    'SVM': SVC(random_state=52),
    'XGBoost': XGBClassifier(random_state=52),
}

# Fit every model, print its test-set report, and plot the confusion matrix
# for the Random Forest model only
for model_name, model_instance in model_dict.items():
    predictions = model_instance.fit(
        features_train_selected, target_train_resampled
    ).predict(features_test_selected)
    print(f"\n{model_name} Evaluation:")
    print(classification_report(target_test, predictions))

    # Every model other than Random Forest stops here
    if model_name != 'Random Forest':
        continue

    cm_matrix = confusion_matrix(target_test, predictions)
    fig = px.imshow(cm_matrix, text_auto=True, aspect='auto', color_continuous_scale='Blues')
    fig.update_layout(
        title_text=f'Confusion Matrix - {model_name}',
        xaxis_title='Predicted',
        yaxis_title='Actual',
    )
    fig.show()



Logistic Regression Evaluation:
              precision    recall  f1-score   support

           0       0.69      0.82      0.75        11
           1       0.97      0.94      0.96        72

    accuracy                           0.93        83
   macro avg       0.83      0.88      0.85        83
weighted avg       0.93      0.93      0.93        83


Random Forest Evaluation:
              precision    recall  f1-score   support

           0       0.78      0.64      0.70        11
           1       0.95      0.97      0.96        72

    accuracy                           0.93        83
   macro avg       0.86      0.80      0.83        83
weighted avg       0.92      0.93      0.92        83




SVM Evaluation:
              precision    recall  f1-score   support

           0       0.78      0.64      0.70        11
           1       0.95      0.97      0.96        72

    accuracy                           0.93        83
   macro avg       0.86      0.80      0.83        83
weighted avg       0.92      0.93      0.92        83


XGBoost Evaluation:
              precision    recall  f1-score   support

           0       0.78      0.64      0.70        11
           1       0.95      0.97      0.96        72

    accuracy                           0.93        83
   macro avg       0.86      0.80      0.83        83
weighted avg       0.92      0.93      0.92        83

