In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
df= pd.read_csv('lung cancer survey.csv')

In [3]:
print(df.head())

  GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0      M   69        1               2        2              1   
1      M   74        2               1        1              1   
2      F   59        1               1        1              2   
3      M   63        2               2        2              1   
4      F   63        1               2        1              1   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                1         2         1         2                  2         2   
1                2         2         2         1                  1         1   
2                1         2         1         2                  1         2   
3                1         1         1         1                  2         1   
4                1         1         1         2                  1         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN LUNG_CANCER  
0                    2                      

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [5]:
print(df.describe())

              AGE     SMOKING  YELLOW_FINGERS     ANXIETY  PEER_PRESSURE  \
count  309.000000  309.000000      309.000000  309.000000     309.000000   
mean    62.673139    1.563107        1.569579    1.498382       1.501618   
std      8.210301    0.496806        0.495938    0.500808       0.500808   
min     21.000000    1.000000        1.000000    1.000000       1.000000   
25%     57.000000    1.000000        1.000000    1.000000       1.000000   
50%     62.000000    2.000000        2.000000    1.000000       2.000000   
75%     69.000000    2.000000        2.000000    2.000000       2.000000   
max     87.000000    2.000000        2.000000    2.000000       2.000000   

       CHRONIC DISEASE    FATIGUE     ALLERGY     WHEEZING  ALCOHOL CONSUMING  \
count       309.000000  309.000000  309.000000  309.000000         309.000000   
mean          1.504854    1.673139    1.556634    1.556634           1.556634   
std           0.500787    0.469827    0.497588    0.497588           0.4

In [26]:
# Preprocess the data
# Encode the 'GENDER' and 'LUNG_CANCER' columns since they are categorical
gender_encoder = LabelEncoder()
lung_cancer_encoder = LabelEncoder()
df['GENDER'] = gender_encoder.fit_transform(df['GENDER'])
df['LUNG_CANCER'] = lung_cancer_encoder.fit_transform(df['LUNG_CANCER'])

In [9]:
print(df.head())

   GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0       1   69        1               2        2              1   
1       1   74        2               1        1              1   
2       0   59        1               1        1              2   
3       1   63        2               2        2              1   
4       0   63        1               2        1              1   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                1         2         1         2                  2         2   
1                2         2         2         1                  1         1   
2                1         2         1         2                  1         2   
3                1         1         1         1                  2         1   
4                1         1         1         2                  1         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN  LUNG_CANCER  
0                    2               

In [11]:
# separate the features and the target variable
x = df.drop(columns=['LUNG_CANCER'])
y = df['LUNG_CANCER']

In [13]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [14]:
# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000) # Increase max_iter if the model doesn't converge
model.fit(X_train, y_train)

In [15]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [17]:
#Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Accuracy: 0.967741935483871
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.98      0.98      0.98        60

    accuracy                           0.97        62
   macro avg       0.74      0.74      0.74        62
weighted avg       0.97      0.97      0.97        62

Confusion Matrix:
 [[ 1  1]
 [ 1 59]]


In [28]:
# Fit the encoder on a list that includes 'Male'
gender_encoder.fit(['Female', 'Male'])  # Or fit it on the entire 'GENDER' column of your original dataset

# Example: Predicting for a new patient (Example Input data)
new_patient = pd.DataFrame({
    'GENDER' : [gender_encoder.transform(['Male'])[0]],
    'AGE' : [65],
    'SMOKING' : [1],
    'YELLOW_FINGERS' : [0],
    'ANXIETY' : [1],
    'PEER_PRESSURE' : [1],
    'CHRONIC DISEASE' : [1],
    'FATIGUE ' : [1],
    'ALLERGY ' : [0],
    'WHEEZING' : [1],
    'ALCOHOL CONSUMING' : [1],
    'COUGHING' : [1],
    'SHORTNESS OF BREATH' : [1],
    'SWALLOWING DIFFICULTY' : [0],
    'CHEST PAIN' : [1]
})

# Predict lung cancer probability for the new patient
predicted_lung_cancer = model.predict(new_patient)
predicted_lung_cancer_label = lung_cancer_encoder.inverse_transform(predicted_lung_cancer)[0]
print("Predicted Lung Cancer:", predicted_lung_cancer_label)

Predicted Lung Cancer: 0
