## Import Libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

## Load the Data

In [5]:
# Read the survey CSV; the relative path is resolved against the current
# working directory.
csv_path = 'survey lung cancer.csv'
data = pd.read_csv(csv_path)

# Preview the top of the table
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


## Data Preprocessing

In [7]:
# Check for missing values
print(data.isnull().sum())

# Text label -> integer code for the two non-numeric columns.
BINARY_ENCODINGS = {
    'GENDER': {'M': 0, 'F': 1},
    'LUNG_CANCER': {'NO': 0, 'YES': 1},  # target variable
}

for column, codes in BINARY_ENCODINGS.items():
    # Re-running this cell must be a no-op: Series.map on an already-encoded
    # column would turn every value into NaN because 0/1 are not dict keys.
    if pd.api.types.is_numeric_dtype(data[column]):
        continue

    encoded = data[column].map(codes)

    # Series.map yields NaN for any value missing from the dict. Fail loudly
    # here rather than letting NaNs leak into the features or the target.
    unmapped = data.loc[encoded.isna() & data[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in {column!r}: {list(unmapped)}")

    data[column] = encoded

# Display the first few rows again to confirm changes
data.head()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64


Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,0,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,1,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,0,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,1,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


## Split the Data into Features and Target

In [9]:
# Target vector: the encoded LUNG_CANCER column.
y = data['LUNG_CANCER']

# Feature matrix: every remaining column.
X = data.drop(columns='LUNG_CANCER')

## Split into Training and Test Sets

In [11]:
# Hold out 25% of the rows for testing and train on the remaining 75%.
# random_state=42 is an arbitrary seed that makes the shuffle reproducible.
# NOTE(review): the split is not stratified on y; if the target is imbalanced,
# consider stratify=y (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Feature Scaling

In [13]:
# Learn the scaling statistics from the training split only, so that nothing
# about the test rows influences the fitted scaler.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Train the Logistic Regression Model

In [15]:
# Build the logistic-regression classifier and train it on the scaled
# training features (fit() returns the estimator, so the calls chain).
logreg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Class predictions for the scaled test rows
y_pred_logreg = logreg.predict(X_test_scaled)

## Train Random Forest Classifier and predict

In [17]:
# Build the random-forest classifier with a fixed seed and train it on the
# scaled training features (fit() returns the estimator, so the calls chain).
rf = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Class predictions for the scaled test rows
y_pred_rf = rf.predict(X_test_scaled)

## Train Support Vector Machine (SVM) and predict

In [19]:
# Build the support-vector classifier and train it on the scaled training
# features (fit() returns the estimator, so the calls chain).
svm = SVC(random_state=42).fit(X_train_scaled, y_train)

# Class predictions for the scaled test rows
y_pred_svm = svm.predict(X_test_scaled)

## Train K-Nearest Neighbors (KNN) and predict

In [21]:
# Build the k-nearest-neighbours classifier with its default settings and
# train it on the scaled training features (fit() returns the estimator).
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Class predictions for the scaled test rows
y_pred_knn = knn.predict(X_test_scaled)

## Train Decision Tree Model and predict

In [23]:
# Build the decision-tree classifier with a fixed seed and train it on the
# scaled training features (fit() returns the estimator, so the calls chain).
dt = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Class predictions for the scaled test rows
y_pred_dt = dt.predict(X_test_scaled)

## Evaluate the Performance of the Models

In [25]:
# Function to print model performance
def print_model_performance(y_test, y_pred, model_name, pos_label=1):
    """Print accuracy, precision, recall and F1 score for one model.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation rows.
    y_pred : array-like
        Labels predicted by the model for the same rows, in the same order.
    model_name : str
        Name shown in the "Performance of ..." header line.
    pos_label : int or str, default 1
        Class treated as the positive class for precision, recall and F1.
        The default reproduces the previously hard-coded value.

    Returns
    -------
    None
        The metrics are printed with four decimals, followed by blank lines.
    """
    print(f"Performance of {model_name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    # These three metrics are computed with respect to one positive class,
    # so they all receive the same pos_label.
    class_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, metric in class_metrics:
        print(f"{label}: {metric(y_test, y_pred, pos_label=pos_label):.4f}")

    print("\n")

# Report the test-set metrics of every trained model, in a fixed order
evaluated_models = [
    ("Logistic Regression", y_pred_logreg),
    ("Random Forest", y_pred_rf),
    ("Support Vector Machine", y_pred_svm),
    ("K-Nearest Neighbors", y_pred_knn),
    ("Decision Tree", y_pred_dt),
]
for display_name, predicted_labels in evaluated_models:
    print_model_performance(y_test, predicted_labels, display_name)

Performance of Logistic Regression:
Accuracy: 0.9744
Precision: 0.9733
Recall: 1.0000
F1 Score: 0.9865


Performance of Random Forest:
Accuracy: 0.9744
Precision: 0.9863
Recall: 0.9863
F1 Score: 0.9863


Performance of Support Vector Machine:
Accuracy: 0.9615
Precision: 0.9730
Recall: 0.9863
F1 Score: 0.9796


Performance of K-Nearest Neighbors:
Accuracy: 0.9231
Precision: 0.9589
Recall: 0.9589
F1 Score: 0.9589


Performance of Decision Tree:
Accuracy: 0.9615
Precision: 0.9861
Recall: 0.9726
F1 Score: 0.9793




## Create a DataFrame with Actual and Predicted Values

In [27]:
# Integer class code -> original text label
class_labels = {0: 'NO', 1: 'YES'}

# Prediction arrays keyed by the column name they get in the results table
predictions_by_model = {
    'Logistic Regression': y_pred_logreg,
    'Random Forest': y_pred_rf,
    'Support Vector Machine': y_pred_svm,
    'K-Nearest Neighbors': y_pred_knn,
    'Decision Tree': y_pred_dt,
}

# Give the test split a fresh 0..n-1 index so it lines up row-by-row with the
# Series built from the prediction arrays below.
X_test_reset = X_test.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)

# One text-labelled column for the ground truth, then one per model
label_columns = {'Actual': y_test_reset.map(class_labels)}
for column_name, predicted in predictions_by_model.items():
    label_columns[column_name] = pd.Series(predicted).map(class_labels)
results_df = pd.DataFrame(label_columns)

# When GENDER is among the features, decode it back to M/F and put all test
# features in front of the label columns.
if 'GENDER' in X_test.columns:
    X_test_reset['GENDER'] = X_test_reset['GENDER'].map({0: 'M', 1: 'F'})
    results_df = pd.concat([X_test_reset, results_df], axis=1)

# Class counts for the ground truth and for each model's predictions
for column_name in label_columns:
    print(results_df[column_name].value_counts())

# Display the first few rows of the DataFrame
results_df.head()

YES    73
NO      5
Name: Actual, dtype: int64
YES    75
NO      3
Name: Logistic Regression, dtype: int64
YES    73
NO      5
Name: Random Forest, dtype: int64
YES    74
NO      4
Name: Support Vector Machine, dtype: int64
YES    73
NO      5
Name: K-Nearest Neighbors, dtype: int64
YES    72
NO      6
Name: Decision Tree, dtype: int64


Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,...,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,Actual,Logistic Regression,Random Forest,Support Vector Machine,K-Nearest Neighbors,Decision Tree
0,F,61,1,2,2,2,1,1,2,2,...,2,1,2,1,YES,YES,YES,YES,YES,YES
1,M,53,2,2,2,2,2,1,2,1,...,1,1,2,2,YES,YES,YES,YES,YES,YES
2,M,73,1,1,1,1,2,1,2,1,...,2,2,2,2,YES,YES,YES,YES,YES,YES
3,M,70,1,2,1,2,2,2,2,2,...,2,1,2,2,YES,YES,YES,YES,YES,YES
4,M,65,1,2,2,1,1,2,1,2,...,2,2,2,2,YES,YES,YES,YES,YES,YES


In [28]:
# Write the results table to disk without the row index
output_file = 'Lung_cancer_predictions.csv'
results_df.to_csv(output_file, index=False)

# Tell the reader which file was written
print(f"DataFrame saved as '{output_file}'")

DataFrame saved as 'Lung_cancer_predictions.csv'
