In [None]:
import pandas as pd
import numpy as np

In [None]:
# Upload the .xlsx data file from the local machine into the Google Colab runtime
# so that the following cells can read it; `uploaded` keeps what files.upload() returns.
from google.colab import files
uploaded = files.upload()

Saving cancer patient data sets.xlsx to cancer patient data sets.xlsx


In [None]:
# Read the uploaded Excel workbook into a DataFrame
data_file = 'cancer patient data sets.xlsx'
cancer_data = pd.read_excel(data_file)

In [None]:
# Sanity check: display the first rows to confirm the workbook was loaded successfully
cancer_data.head()

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,Low
1,P10,17,1,3,1,5,3,4,2,2,...,1,3,7,8,6,2,1,7,2,Medium
2,P100,35,1,4,5,6,5,5,4,6,...,8,7,9,2,1,4,6,7,2,High
3,P1000,37,1,7,7,7,7,6,7,7,...,4,2,3,1,4,5,6,7,5,High
4,P101,46,1,6,8,7,7,7,6,7,...,3,2,4,1,4,2,4,2,3,High


#### **Data preparation and feature extraction process**

The data preparation and feature extraction process is carried out in three steps:

*   Data Munging
*   Feature Scaling
*   Feature Importance



**1.   Data Munging**




In [None]:
# 'Patient Id' is only a row identifier, so it is removed before modelling
drop_cols = ['Patient Id']

# Rebind the name to the trimmed frame rather than mutating it in place
cancer_data = cancer_data.drop(columns=drop_cols)

In [None]:
# Count the missing values in every column
missing_per_column = cancer_data.isna().sum()
print(missing_per_column)

Age                         0
Gender                      0
Air Pollution               0
Alcohol use                 0
Dust Allergy                0
OccuPational Hazards        0
Genetic Risk                0
chronic Lung Disease        0
Balanced Diet               0
Obesity                     0
Smoking                     0
Passive Smoker              0
Chest Pain                  0
Coughing of Blood           0
Fatigue                     0
Weight Loss                 0
Shortness of Breath         0
Wheezing                    0
Swallowing Difficulty       0
Clubbing of Finger Nails    0
Frequent Cold               0
Dry Cough                   0
Snoring                     0
Level                       0
dtype: int64


In [None]:
# Impute any missing numeric entries with the mean of their column.
# numeric_only=True is needed because 'Level' still holds text labels at this point:
# without it, recent pandas versions raise a TypeError, and older versions emit a
# FutureWarning while silently skipping the non-numeric column.
cancer_data = cancer_data.fillna(cancer_data.mean(numeric_only=True))

  cancer_data.fillna(cancer_data.mean(), inplace=True)


In [None]:
# Ordinal encoding of the target labels: Low -> 0, Medium -> 1, High -> 2
ordinal_map = {label: code for code, label in enumerate(['Low', 'Medium', 'High'])}

# Swap every text label in 'Level' for its numeric code
encoded_level = cancer_data['Level'].replace(ordinal_map)
cancer_data['Level'] = encoded_level

In [None]:
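# Alternative encoding, kept disabled for reference: Series.map would also work, but the
# trailing .fillna(0) would silently turn any unexpected label into 0 ('Low').
# cancer_data['Level'] = cancer_data['Level'].map({'Low': 0, 'High': 2, 'Medium': 1}).fillna(0)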

In [None]:
# Inspect the first 10 rows to confirm that 'Level' now holds the numeric codes
cancer_data.head(10)

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,0
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,1
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,2
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,2
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,2
5,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,2
6,52,2,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,0
7,28,2,3,1,4,3,2,3,4,3,...,3,2,2,4,2,2,3,4,3,0
8,35,2,4,5,6,5,6,5,5,5,...,1,4,3,2,4,6,2,4,1,1
9,46,1,2,3,4,2,4,3,3,3,...,1,2,4,6,5,4,2,1,5,1


**2. Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Create the standardization scaler; it is fitted and applied to the data in the next cell
scaler = StandardScaler()

In [None]:
# Columns to standardize.
# NOTE(review): the target 'Level' is part of this list, so it is standardized as well;
# the Naive Bayes section later bins it back into Low/Medium/High using hard-coded edges.
# Consider leaving the target unscaled — TODO confirm intent before changing, since the
# bin edges downstream depend on it.
cols_to_scale = ['Age', 'Gender', 'Air Pollution', 'Alcohol use', 'Dust Allergy', 'OccuPational Hazards', 'Genetic Risk', 'chronic Lung Disease',
                 'Balanced Diet', 'Obesity', 'Smoking', 'Passive Smoker', 'Chest Pain', 'Coughing of Blood', 'Fatigue', 'Weight Loss', 'Shortness of Breath',
                 'Wheezing', 'Swallowing Difficulty', 'Clubbing of Finger Nails', 'Frequent Cold', 'Dry Cough', 'Snoring', 'Level']

# NOTE(review): the scaler is fitted on the whole dataset, before any train/test split,
# so rows that later end up in the test set contribute to the scaling statistics.
cancer_data[cols_to_scale] = scaler.fit_transform(cancer_data[cols_to_scale])

In [None]:
# Inspect the first 10 rows to confirm that the columns have been standardized
cancer_data.head(10)

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,-0.347848,-0.819903,-0.906679,-0.214954,-0.08334,-0.398718,-0.743202,-1.288162,-1.16704,-0.218941,...,-0.381548,0.065746,-0.98076,-0.870694,-0.328743,-1.224625,-0.838618,-0.41855,0.728655,-1.303135
1,-1.681238,-0.819903,-0.413919,-1.360357,-0.08334,-0.873383,-0.272821,-1.288162,-1.16704,-1.160623,...,-1.273014,-0.387677,1.208436,2.069186,0.993281,-0.805663,-1.384593,1.544171,-0.628245,-0.076078
2,-0.181174,-0.819903,0.078842,0.166847,0.421751,0.075946,0.19756,-0.205673,0.70697,1.193582,...,1.847119,1.426018,2.084114,-0.870694,-1.210093,0.03226,1.345283,1.544171,-0.628245,1.15098
3,-0.014501,-0.819903,1.557123,0.930449,0.926842,1.025275,0.667941,1.418061,1.175473,1.193582,...,0.064186,-0.841101,-0.542921,-1.360675,0.111931,0.451222,1.345283,1.544171,1.407105,1.15098
4,0.735531,-0.819903,1.064362,1.31225,0.926842,1.025275,1.138323,0.876816,1.175473,1.193582,...,-0.381548,-0.841101,-0.105081,-1.360675,0.111931,-0.805663,0.253332,-0.909231,0.050205,1.15098
5,-0.181174,-0.819903,0.078842,0.166847,0.421751,0.075946,0.19756,-0.205673,0.70697,1.193582,...,1.847119,1.426018,2.084114,-0.870694,-1.210093,0.03226,1.345283,1.544171,-0.628245,1.15098
6,1.235553,1.219657,-0.906679,-0.214954,-0.08334,-0.398718,-0.743202,-1.288162,-1.16704,-0.218941,...,-0.381548,0.065746,-0.98076,-0.870694,-0.328743,-1.224625,-0.838618,-0.41855,0.728655,-1.303135
7,-0.764533,1.219657,-0.413919,-1.360357,-0.588431,-0.873383,-1.213584,-0.746918,-0.230035,-0.689782,...,-0.381548,-0.841101,-0.98076,0.109266,-0.769418,-0.805663,-0.292643,0.07213,0.050205,-1.303135
8,-0.181174,1.219657,0.078842,0.166847,0.421751,0.075946,0.667941,0.335572,0.238468,0.2519,...,-1.273014,0.065746,-0.542921,-0.870694,0.111931,0.870183,-0.838618,0.07213,-1.306694,-0.076078
9,0.735531,-0.819903,-0.906679,-0.596755,-0.588431,-1.348047,-0.272821,-0.746918,-0.698537,-0.689782,...,-1.273014,-0.841101,-0.105081,1.089226,0.552606,0.03226,-0.838618,-1.399911,1.407105,-0.076078


**3. Feature Importance**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows for testing
y = cancer_data['Level']
X = cancer_data.drop(columns=['Level'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed keeps the split reproducible
)

In [None]:
# Random forest fitted on the training split; used below to rank the features by importance
rf_settings = {'n_estimators': 100, 'random_state': 42}
rf = RandomForestRegressor(**rf_settings)
rf.fit(X_train, y_train)

In [None]:
# Pair each training column with the forest's importance score, most important first
importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)
print(importance)

                     Feature    Importance
9                    Obesity  3.653293e-01
13         Coughing of Blood  1.589634e-01
17                  Wheezing  1.058862e-01
3                Alcohol use  9.236907e-02
14                   Fatigue  6.585301e-02
22                   Snoring  5.539344e-02
11            Passive Smoker  3.701970e-02
6               Genetic Risk  1.903763e-02
5       OccuPational Hazards  1.847293e-02
19  Clubbing of Finger Nails  1.359359e-02
18     Swallowing Difficulty  1.279569e-02
8              Balanced Diet  1.245290e-02
2              Air Pollution  1.021692e-02
12                Chest Pain  9.230961e-03
4               Dust Allergy  7.370297e-03
16       Shortness of Breath  6.841814e-03
21                 Dry Cough  2.541469e-03
7       chronic Lung Disease  2.324966e-03
10                   Smoking  1.874643e-03
15               Weight Loss  1.124607e-03
20             Frequent Cold  6.811756e-04
0                        Age  6.262758e-04
1          

#### **Feature subset that shows maximum correlation with 'Level' variable**



*   Using correlation matrix



In [None]:
# Pairwise Pearson correlation between all columns (the target 'Level' included)
corr_matrix = cancer_data.corr(method='pearson')

In [None]:
# Rank every column by the strength (absolute value) of its correlation with 'Level'
level_corr = corr_matrix['Level']
corr_with_target = level_corr.abs().sort_values(ascending=False)

In [None]:
# Show the top 10 features with their correlation coefficient.
# Position 0 is skipped: it is presumably 'Level' itself (self-correlation).
top_10_corr = corr_with_target[1:11]
print(f"The top 10 important features which show maximum correlation with the 'Level' variable:\n\n{top_10_corr}")

The top 10 important features which show maximum correlation with the 'Level' variable:

Obesity                 0.827435
Coughing of Blood       0.782092
Alcohol use             0.718710
Dust Allergy            0.713839
Balanced Diet           0.706273
Passive Smoker          0.703594
Genetic Risk            0.701303
OccuPational Hazards    0.673255
Chest Pain              0.645461
Air Pollution           0.636038
Name: Level, dtype: float64


In [None]:
# Select the 10 features most strongly correlated with 'Level' (based on the correlation analysis).
# The target is removed by name instead of assuming it always sorts into position 0,
# so 'Level' can never end up among the predictors (e.g. on a tie in the ranking).
top_features = corr_with_target.drop(labels='Level').head(10).index

# Predictors: the selected feature subset; target: the 'Level' column
X = cancer_data[top_features]
y = cancer_data['Level']

In [None]:
# Preview the selected feature subset (up to the first 15 rows)
X.head(15)

Unnamed: 0,Obesity,Coughing of Blood,Alcohol use,Dust Allergy,Balanced Diet,Passive Smoker,Genetic Risk,OccuPational Hazards,Chest Pain,Air Pollution
0,-0.218941,-0.353971,-0.214954,-0.08334,-1.16704,-0.949961,-0.743202,-0.398718,-1.069735,-0.906679
1,-1.160623,-0.766045,-1.360357,-0.08334,-1.16704,-0.084393,-0.272821,-0.873383,-1.069735,-0.413919
2,1.193582,1.294323,0.166847,0.421751,0.70697,-0.517177,0.19756,0.075946,-0.192184,0.078842
3,1.193582,1.294323,0.930449,0.926842,1.175473,1.213959,0.667941,1.025275,1.124143,1.557123
4,1.193582,1.706397,1.31225,0.926842,1.175473,1.213959,1.138323,1.025275,1.124143,1.064362
5,1.193582,1.294323,0.166847,0.421751,0.70697,-0.517177,0.19756,0.075946,-0.192184,0.078842
6,-0.218941,-0.353971,-0.214954,-0.08334,-1.16704,-0.949961,-0.743202,-0.398718,-1.069735,-0.906679
7,-0.689782,-1.590192,-1.360357,-0.588431,-0.230035,-0.084393,-1.213584,-0.873383,-0.630959,-0.413919
8,0.2519,0.058102,0.166847,0.421751,0.238468,0.781175,0.667941,0.075946,0.685368,0.078842
9,-0.689782,-0.353971,-0.596755,-0.588431,-0.698537,-0.517177,-0.272821,-1.348047,-0.192184,-0.906679


#### **Suitable machine learning model that can predict chances of getting cancer given the above variables using Naive Bayes**

To train the model, we use the feature subset obtained above: the ten (10) features that have the highest correlation with the 'Level' variable. Keeping the input to ten features also makes it easier to demonstrate later how the model works on the web page.

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Hold out 20% of the selected-feature data for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed keeps the split reproducible
)

In [None]:
# Bin edges used to turn the continuous (standardized) target back into three classes
bin_edges = [-np.inf, -0.5, 0.5, np.inf]
level_labels = ['Low', 'Medium', 'High']


def _to_level_labels(level_values):
    """Bin continuous 'Level' values into the Low / Medium / High categories."""
    return pd.cut(level_values, bins=bin_edges, labels=level_labels)


# Categorical targets for the training and testing splits
y_train_cat = _to_level_labels(y_train)
y_test_cat = _to_level_labels(y_test)

# Fit a Gaussian Naive Bayes classifier on the training split
nb_model = GaussianNB()
nb_model.fit(X_train, y_train_cat)

In [None]:
# Predict the risk label of every test row with the baseline (untuned) Naive Bayes model
y_pred = nb_model.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test_cat, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.745


#### **Fine tuning the model as a process of optimization.**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Define parameter grid for GridSearchCV
param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
}

In [None]:
# Grid search over the parameter grid for the Gaussian Naive Bayes classifier,
# scored with 5-fold cross-validation
grid = GridSearchCV(estimator=nb_model, param_grid=param_grid, cv=5)

In [None]:
# Run the cross-validated search; only the training split is used, the test split stays untouched
grid.fit(X_train, y_train_cat)

In [None]:
# Print the hyperparameter combination that the grid search selected
print("Best hyperparameters: ", grid.best_params_)

Best hyperparameters:  {'var_smoothing': 0.1}


In [None]:
# Use the tuned model to make predictions on the test data
# (this overwrites the baseline predictions previously stored in y_pred)
y_pred = grid.predict(X_test)

In [None]:
# Test-set accuracy of the tuned model, for comparison with the baseline accuracy
grid_accuracy = accuracy_score(y_true=y_test_cat, y_pred=y_pred)
print("Accuracy after fine tuning: ", grid_accuracy)

Accuracy after fine tuning:  0.75


In [None]:
# Predict the risk level for a single hand-written sample.
# The values are raw ratings given in the order of `top_features` (assumed, since the
# original call passed them positionally — confirm the order when changing the sample).
raw_sample = pd.DataFrame([[2, 3, 4, 1, 4, 2, 1, 4, 2, 6]], columns=top_features)

# The model was trained on standardized features, so the raw sample must be standardized
# with the same statistics first. `scaler` was fitted on `cols_to_scale`, so look up the
# position of each selected feature to pick the matching mean and scale.
scaler_positions = [cols_to_scale.index(feature) for feature in top_features]
scaled_sample = (raw_sample - scaler.mean_[scaler_positions]) / scaler.scale_[scaler_positions]

# Passing a DataFrame (not a bare list) keeps the feature names the model was fitted with
y_predq = grid.predict(scaled_sample)
print(y_predq[0])

Medium




#### **Deploy the model**

In [None]:
import pickle

In [None]:
# Save the trained model to a .pkl file.
# The tuned model (the grid search's refitted best estimator) is saved; saving `nb_model`
# would deploy the untuned baseline and discard the fine-tuning step above.
# NOTE: the model expects standardized inputs, so whatever loads this file must apply
# the same scaling to incoming data before calling predict.
with open('nb_model.pkl', 'wb') as file:
    pickle.dump(grid.best_estimator_, file)

In [None]:
# Download the pickle file from the Colab runtime to the local machine for further use
from google.colab import files
files.download('nb_model.pkl')