In [9]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, classification_report

In [10]:
# Load the raw patient records from the CSV file that sits next to the notebook
# and preview the first five rows.
filename = "cancer patient data sets.csv"
df = pd.read_csv(filename)
df.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [11]:
# Keep the column labels (in file order) and the per-column dtypes around for
# later cells, then show the dtypes.
names = df.columns.tolist()
types = df.dtypes
print(types)

index                        int64
Patient Id                  object
Age                          int64
Gender                       int64
Air Pollution                int64
Alcohol use                  int64
Dust Allergy                 int64
OccuPational Hazards         int64
Genetic Risk                 int64
chronic Lung Disease         int64
Balanced Diet                int64
Obesity                      int64
Smoking                      int64
Passive Smoker               int64
Chest Pain                   int64
Coughing of Blood            int64
Fatigue                      int64
Weight Loss                  int64
Shortness of Breath          int64
Wheezing                     int64
Swallowing Difficulty        int64
Clubbing of Finger Nails     int64
Frequent Cold                int64
Dry Cough                    int64
Snoring                      int64
Level                       object
dtype: object


In [13]:
# Detecting and Removing Outliers (IQR fences)
q1 = df.quantile(0.25, numeric_only=True)
q3 = df.quantile(0.75, numeric_only=True)

# Calculate the IQR and the lower/upper fences for each numeric column
iqr = q3 - q1
lower_outlier_bound = q1 - 1.5 * iqr
upper_outlier_bound = q3 + 1.5 * iqr

# Columns to check: skip the first two columns ("index" and "Patient Id", which
# are row identifiers) and any column without a numeric quantile (e.g. "Level").
# The previous condition `i < len(names[i])` compared the column position with
# the length of the column's name, so the set of checked columns was arbitrary.
outlier_cols = [col for col in names[2:] if col in upper_outlier_bound.index]

# A row is an outlier when any checked column lies outside [lower, upper].
# Building a single mask also avoids dropping the same index label twice,
# which raises KeyError with DataFrame.drop.
below = df[outlier_cols].lt(lower_outlier_bound[outlier_cols], axis="columns")
above = df[outlier_cols].gt(upper_outlier_bound[outlier_cols], axis="columns")
is_outlier = (below | above).any(axis="columns")

# Copy so that later cleaning steps never touch the raw frame `df`.
df_clean = df.loc[~is_outlier].copy()
df_clean.head(20)

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High
5,5,P102,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
6,6,P103,52,2,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
7,7,P104,28,2,3,1,4,3,2,3,...,3,2,2,4,2,2,3,4,3,Low
8,8,P105,35,2,4,5,6,5,6,5,...,1,4,3,2,4,6,2,4,1,Medium
9,9,P106,46,1,2,3,4,2,4,3,...,1,2,4,6,5,4,2,1,5,Medium


In [14]:
# Preprocessing
# CLEANING
# Remove duplicate records.
# NOTE(review): "index" and "Patient Id" are still present here, so only rows
# that are identical in every column (identifiers included) are removed.
df_clean = df_clean.drop_duplicates()

# Drop variables (row identifiers carry no predictive information)
to_drop = ["index",
           "Patient Id"]
df_clean = df_clean.drop(columns=to_drop)

# Transform
# Impute missing feature values with the column mean. Every column except the
# last one ("Level", the target) is treated as a feature. The assignment form
# is used because `df[col].fillna(..., inplace=True)` acts on a temporary
# object and is not guaranteed to modify `df_clean`.
names = list(df_clean.columns)
feature_cols = names[:-1]
df_clean[feature_cols] = df_clean[feature_cols].fillna(df_clean[feature_cols].mean())

# Encode the target labels as integer class codes.
le = LabelEncoder()
df_clean['Level'] = le.fit_transform(df_clean['Level'])

df_clean.head(5)

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,33,1,2,4,5,4,3,2,2,4,...,3,4,2,2,3,1,2,3,4,1
1,17,1,3,1,5,3,4,2,2,2,...,1,3,7,8,6,2,1,7,2,2
2,35,1,4,5,6,5,5,4,6,7,...,8,7,9,2,1,4,6,7,2,0
3,37,1,7,7,7,7,6,7,7,7,...,4,2,3,1,4,5,6,7,5,0
4,46,1,6,8,7,7,7,6,7,7,...,3,2,4,1,4,2,4,2,3,0


In [18]:
# Turn the cleaned frame into a plain array and separate it column-wise:
# everything but the last column is the feature matrix, the last column is
# the target vector.
data = df_clean.to_numpy()
X, y = data[:, :-1], data[:, -1]

In [19]:
# Standardise every feature column: learn the scaling statistics from X and
# apply them in one step. `scaler` stays fitted for later reuse.
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# a later cell, so test-set statistics influence the training features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Show the standardised feature matrix
print(X_scaled)

[[-0.3313679  -0.82684549 -0.89787225 ... -0.85666498 -0.40815856
   0.72012547]
 [-1.72216482 -0.82684549 -0.40677085 ... -1.40524087  1.55576356
  -0.6326719 ]
 [-0.15751829 -0.82684549  0.08433054 ...  1.33763858  1.55576356
  -0.6326719 ]
 ...
 [-1.02676636  1.2094158   0.08433054 ...  1.33763858  1.55576356
  -0.6326719 ]
 [-1.63524001  1.2094158   1.06653334 ...  0.2404868  -0.89913909
   0.04372678]
 [ 0.8855794  -0.82684549  1.06653334 ...  1.33763858  1.55576356
  -0.6326719 ]]


In [20]:
from sklearn.model_selection import train_test_split

# Hold out a test set using the library's default split proportion; the fixed
# seed keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=42,
)
print(len(y_test))

248


In [21]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Classifier voting over the 100 nearest training samples.
# Alternative tried earlier: n_neighbors=3 with weights='distance'.
kNN = KNeighborsClassifier(n_neighbors=100)
kNN.fit(X_train, y_train)

In [22]:
# prediction
y_pred_knn = kNN.predict(X_test)

# Number of misclassified test samples (element-wise comparison instead of a
# manual index loop).
error = int(np.count_nonzero(y_test != y_pred_knn))
print("Error: ", error)

# NOTE(review): MAE is computed on the integer class codes produced by the
# label encoder; those codes are not an ordinal scale, so treat this number
# with care. The stray "$" that used to be printed before the value is removed.
mae = mean_absolute_error(y_test, y_pred_knn)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

accuracy = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred_knn)
print("Classifier Report: \n", report)

Error:  20
Mean Absolute Error (MAE): $0.14
Accuracy: 0.92
Classifier Report: 
               precision    recall  f1-score   support

           0       0.86      1.00      0.92       102
           1       0.99      0.95      0.97        74
           2       0.97      0.78      0.86        72

    accuracy                           0.92       248
   macro avg       0.94      0.91      0.92       248
weighted avg       0.93      0.92      0.92       248



In [23]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Fit a linear model on the training split, treating the encoded class label
# as a continuous target, and predict the held-out samples.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Regression error of the continuous predictions against the test labels
continuous_mae = mean_absolute_error(y_test, y_pred_lr)
continuous_mse = mean_squared_error(y_test, y_pred_lr)

print(f'Continuous MSE: {continuous_mse}')
print(f'Continuous MAE: {continuous_mae}')

Continuous MSE: 0.141303592183156
Continuous MAE: 0.27422564821554335


In [24]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Create a Decision Tree Classifier. The seed matches the one used for the
# train/test split so that the fitted tree is reproducible between runs.
dt = DecisionTreeClassifier(random_state=42)

# Fit the classifier on the training data
dt.fit(X_train, y_train)

# Predict the labels for the test data
y_pred_dt = dt.predict(X_test)

# NOTE(review): MAE is computed on the integer class codes produced by the
# label encoder; those codes are not an ordinal scale. The stray "$" that used
# to be printed before the value is removed.
mae = mean_absolute_error(y_test, y_pred_dt)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred_dt)
print("Classifier Report: \n", report)

Mean Absolute Error (MAE): $0.00
Accuracy: 1.00
Classifier Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00        74
           2       1.00      1.00      1.00        72

    accuracy                           1.00       248
   macro avg       1.00      1.00      1.00       248
weighted avg       1.00      1.00      1.00       248



In [25]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model.
# Note: this rebinds `lr`, which an earlier cell used for the linear
# regression model.
lr = LogisticRegression()

# Fit the model to the training data
lr.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lr.predict(X_test)

# Evaluate the model
# NOTE(review): MAE is computed on the integer class codes produced by the
# label encoder; those codes are not an ordinal scale. The stray "$" that used
# to be printed before the value is removed.
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred)
print("Classifier Report: \n", report)

Mean Absolute Error (MAE): $0.00
Accuracy: 1.00
Classifier Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00        74
           2       1.00      1.00      1.00        72

    accuracy                           1.00       248
   macro avg       1.00      1.00      1.00       248
weighted avg       1.00      1.00      1.00       248



In [26]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier. The seed matches the one used for the
# train/test split so that the bootstrap samples and feature subsets, and
# therefore the reported metrics, are reproducible between runs.
rf = RandomForestClassifier(random_state=42)

# Fit the model to the training data
rf.fit(X_train, y_train)

# Make predictions on the test data
y_pred_rf = rf.predict(X_test)

# Evaluate the model
# NOTE(review): MAE is computed on the integer class codes produced by the
# label encoder; those codes are not an ordinal scale. The stray "$" that used
# to be printed before the value is removed.
mae = mean_absolute_error(y_test, y_pred_rf)
print(f"Mean Absolute Error (MAE): {mae:.2f}")

accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1
report = classification_report(y_test, y_pred_rf)
print("Classifier Report: \n", report)

Mean Absolute Error (MAE): $0.00
Accuracy: 1.00
Classifier Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00        74
           2       1.00      1.00      1.00        72

    accuracy                           1.00       248
   macro avg       1.00      1.00      1.00       248
weighted avg       1.00      1.00      1.00       248

