In [29]:
# =====================> Dataset Cleaning <=====================

import pandas as pd

# Load the dataset
myDataset = pd.read_csv('lung cancer.csv')

# Display the first few rows of the dataset
print("Before cleaning:")
print(myDataset)

# Check for missing values and handle them
missing_values = myDataset.isnull().sum()
print("\nMissing values before handling:")
print(missing_values)

# Drop rows with missing values
myDataset = myDataset.dropna()

# Check for duplicate rows and remove them
duplicate_rows = myDataset.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_rows)
myDataset = myDataset.drop_duplicates()

# Display the cleaned dataset
print("\nAfter cleaning:")
print(myDataset.head())

# Save the cleaned dataset to a new CSV file
myDataset.to_csv('Cleaned_lung cancer.csv', index=False)

print("\nDataset cleaning completed!")

Before cleaning:
    GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0        M   69        1               2        2              1   
1        M   74        2               1        1              1   
2        F   59        1               1        1              2   
3        M   63        2               2        2              1   
4        F   63        1               2        1              1   
..     ...  ...      ...             ...      ...            ...   
304      F   56        1               1        1              2   
305      M   70        2               1        1              1   
306      M   58        2               1        1              1   
307      M   67        2               1        2              1   
308      M   62        1               1        1              2   

     CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  \
0                  1         2         1         2                  2   
1                  2

In [35]:
import pandas as pd
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv('lung cancer.csv')

# Separate features and target variable
X = data.drop('LUNG_CANCER', axis=1)
y = data['LUNG_CANCER']

# Encode the "Gender" feature
le = LabelEncoder()
X["GENDER"] = le.fit_transform(X["GENDER"])

# Feature selection using chi-square test
chi2_selector = SelectKBest(chi2, k=10)

# Fit the selector
chi2_selector.fit(X, y)

# Get selected features
selected_features = X.columns[chi2_selector.get_support(indices=True)]

print("Selected features using chi-square test:", selected_features)


Selected features using chi-square test: Index(['AGE', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE', 'ALLERGY ',
       'WHEEZING', 'ALCOHOL CONSUMING', 'COUGHING', 'SWALLOWING DIFFICULTY',
       'CHEST PAIN'],
      dtype='object')


In [37]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the data assuming it's in a CSV file named 'lung_cancer.csv'
data = pd.read_csv('lung cancer.csv')

# Separate features and target variable
X = data.drop('LUNG_CANCER', axis=1)  # All columns except 'Lung Cancer'
y = data['LUNG_CANCER']

# Handle categorical features (e.g., Gender) using label encoding
categorical_features = ['GENDER']  # Add other categorical features if needed
le = LabelEncoder()
for feature in categorical_features:
    X[feature] = le.fit_transform(X[feature])

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the features (now numerical)
scaler.fit(X)

# Transform the features using the fitted scaler
X_scaled = scaler.transform(X)

# Combine the scaled features and target variable
data_scaled = pd.concat([pd.DataFrame(X_scaled, columns=X.columns), y], axis=1)

# Print a sample of the scaled data
print(data_scaled.head())


     GENDER       AGE   SMOKING  YELLOW_FINGERS   ANXIETY  PEER_PRESSURE  \
0  0.952579  0.771850 -1.135292        0.869300  1.003241      -1.003241   
1  0.952579  1.381829  0.880830       -1.150351 -0.996769      -1.003241   
2 -1.049781 -0.448107 -1.135292       -1.150351 -0.996769       0.996769   
3  0.952579  0.039876  0.880830        0.869300  1.003241      -1.003241   
4 -1.049781  0.039876 -1.135292        0.869300 -0.996769      -1.003241   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0        -1.009756  0.696833 -1.120480  0.892475           0.892475  0.852207   
1         0.990338  0.696833  0.892475 -1.120480          -1.120480 -1.173424   
2        -1.009756  0.696833 -1.120480  0.892475          -1.120480  0.852207   
3        -1.009756 -1.435063 -1.120480 -1.120480           0.892475 -1.173424   
4        -1.009756 -1.435063 -1.120480  0.892475          -1.120480  0.852207   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAI

In [38]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (default test_size is 0.25)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Print the sizes of the training and testing sets
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (231, 15)
Testing set size: (78, 15)


In [39]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Load the data assuming it's in a CSV file named 'lung_cancer.csv'
data = pd.read_csv('lung cancer.csv')

# Separate features and target variable
X = data.drop('LUNG_CANCER', axis=1)  # All columns except 'Lung Cancer'
y = data['LUNG_CANCER']

# Handle categorical features (e.g., Gender) using label encoding
categorical_features = ['GENDER']  # Add other categorical features if needed
le = LabelEncoder()
for feature in categorical_features:
    X[feature] = le.fit_transform(X[feature])

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the features (now numerical)
scaler.fit(X)

# Transform the features using the fitted scaler
X_scaled = scaler.transform(X)

# Split data into training and testing sets (75% training, 25% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

# Create an SVC object with a linear kernel
clf = SVC(kernel='linear', random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Evaluate the model using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9743589743589743


In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Load the data assuming it's in a CSV file named 'lung_cancer.csv'
data = pd.read_csv('lung cancer.csv')

# Separate features and target variable
X = data.drop('LUNG_CANCER', axis=1)  # All columns except 'Lung Cancer'
y = data['LUNG_CANCER']

# Handle categorical features (e.g., Gender) using label encoding
categorical_features = ['GENDER']  # Add other categorical features if needed
le = LabelEncoder()
for feature in categorical_features:
    X[feature] = le.fit_transform(X[feature])

# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on the features (now numerical)
scaler.fit(X)

# Transform the features using the fitted scaler
X_scaled = scaler.transform(X)

# Split data into training and testing sets (75% training, 25% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

# Create a Random Forest Classifier object
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Evaluate the model using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9743589743589743


In [41]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

          NO       0.80      0.80      0.80         5
         YES       0.99      0.99      0.99        73

    accuracy                           0.97        78
   macro avg       0.89      0.89      0.89        78
weighted avg       0.97      0.97      0.97        78


In [42]:
# ... your code for data loading, preprocessing, splitting, model training ...

# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Evaluate the model using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Get the classification report
report = classification_report(y_test, y_pred)
print(report)

# Feature Importance (if applicable)
# ... analyze feature importance scores from your model ...

# Analysis and Conclusions (written as comments)
print("\nAnalysis and Conclusions:")
print(f"The model achieved an accuracy of {accuracy:.2f}.")
print(f"The classification report reveals:\n{report}")
print("However, the dataset size (100 samples) limits the generalizability of these results.")
print("Feature importance analysis (if applicable) can provide insights into the most predictive features for lung cancer.")
print("This initial exploration highlights the potential for using machine learning for lung cancer prediction, but a larger dataset is crucial for building a robust model.")


Accuracy: 0.9743589743589743
              precision    recall  f1-score   support

          NO       0.80      0.80      0.80         5
         YES       0.99      0.99      0.99        73

    accuracy                           0.97        78
   macro avg       0.89      0.89      0.89        78
weighted avg       0.97      0.97      0.97        78


Analysis and Conclusions:
The model achieved an accuracy of 0.97.
The classification report reveals:
              precision    recall  f1-score   support

          NO       0.80      0.80      0.80         5
         YES       0.99      0.99      0.99        73

    accuracy                           0.97        78
   macro avg       0.89      0.89      0.89        78
weighted avg       0.97      0.97      0.97        78

However, the dataset size (100 samples) limits the generalizability of these results.
Feature importance analysis (if applicable) can provide insights into the most predictive features for lung cancer.
This initial 

In [43]:
import pandas as pd

file = pd.read_csv("lung cancer.csv")

file.to_csv("lung cancer.tab")



In [None]:
#updated code by Saad Mohamed Saad