In [1]:
import pandas as pd
df = pd.read_csv('oc_1.csv')
df

Unnamed: 0,id,patient_id,localization,size,tobacco_use,alcohol_consumption,sun_exposure,gender,age_group,diagnosis
0,0,72,Gingiva,4.5,Not informed,Not informed,Not informed,M,1,Leukoplakia
1,1,63,Gingiva,0.7,Not informed,Not informed,Not informed,F,2,Leukoplakia
2,2,10,Palate,1.5,Former,No,No,M,2,OSCC
3,3,48,Tongue,2.0,Yes,No,No,M,2,OSCC
4,4,39,Tongue,2.5,Yes,Former,Yes,M,1,OSCC
...,...,...,...,...,...,...,...,...,...,...
148,148,82,Buccal mucosa,4.5,Not informed,Not informed,Not informed,M,2,Leukoplakia
149,149,165,Tongue,1.2,No,No,No,M,0,Leukoplakia
150,150,31,Tongue,1.2,Not informed,Not informed,Not informed,F,2,OSCC
151,151,183,Tongue,1.0,No,No,Not informed,M,1,Leukoplakia


In [2]:
leukoplakia_rows = df[df['diagnosis'] == 'Leukoplakia']
len(leukoplakia_rows)

88

In [3]:
oscc_rows = df[df['diagnosis'] == 'OSCC']
len(oscc_rows)

65

In [4]:
############### filtering unique patient's data ####################
df_unique_patient = df.drop_duplicates(subset='patient_id', keep='last')
print(df_unique_patient)

      id  patient_id   localization  size   tobacco_use alcohol_consumption  \
17    17          52         Tongue   0.0  Not informed        Not informed   
18    18          20        Gingiva   2.0        Former                  No   
20    20          55         Tongue   2.0  Not informed        Not informed   
24    24          90  Buccal mucosa   0.2           Yes                 Yes   
27    27          51            Lip   1.5  Not informed        Not informed   
..   ...         ...            ...   ...           ...                 ...   
148  148          82  Buccal mucosa   4.5  Not informed        Not informed   
149  149         165         Tongue   1.2            No                  No   
150  150          31         Tongue   1.2  Not informed        Not informed   
151  151         183         Tongue   1.0            No                  No   
152  152          48         Tongue   2.0           Yes                  No   

     sun_exposure gender  age_group    diagnosis  


In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Drop unnecessary columns
data = df.drop(['id','patient_id'], axis=1)

# Convert categorical data to numerical format
label_encoder = LabelEncoder()
categorical_columns = ['localization', 'gender', 'tobacco_use', 'alcohol_consumption','sun_exposure', 'diagnosis']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

data

Unnamed: 0,localization,size,tobacco_use,alcohol_consumption,sun_exposure,gender,age_group,diagnosis
0,2,4.5,2,2,1,1,1,0
1,2,0.7,2,2,1,0,2,0
2,4,1.5,0,1,0,1,2,1
3,5,2.0,3,1,0,1,2,1
4,5,2.5,3,0,2,1,1,1
...,...,...,...,...,...,...,...,...
148,0,4.5,2,2,1,1,2,0
149,5,1.2,1,1,0,1,0,0
150,5,1.2,2,2,1,0,2,1
151,5,1.0,1,1,1,1,1,0


In [6]:
######################### RANDOM FOREST ###################
# Split the data into features (X) and target variable (y)
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a Random Forest model
rf_model = RandomForestClassifier(n_estimators=5, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)


Accuracy: 0.97
Confusion Matrix:
 [[19  0]
 [ 1 11]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       1.00      0.92      0.96        12

    accuracy                           0.97        31
   macro avg       0.97      0.96      0.97        31
weighted avg       0.97      0.97      0.97        31



In [7]:
######################### GRID SEARCH WITH RANDOM FOREST###################
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best hyperparameters to train the model
best_rf_model = RandomForestClassifier(random_state=42, **best_params)
best_rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Accuracy: 0.81
Confusion Matrix:
 [[14  5]
 [ 1 11]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.74      0.82        19
           1       0.69      0.92      0.79        12

    accuracy                           0.81        31
   macro avg       0.81      0.83      0.80        31
weighted avg       0.84      0.81      0.81        31



In [8]:
######################### LOGISTIC REGRESSION ###################
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a Logistic Regression model
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = logreg_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)


Accuracy: 0.74
Confusion Matrix:
 [[16  3]
 [ 5  7]]
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.84      0.80        19
           1       0.70      0.58      0.64        12

    accuracy                           0.74        31
   macro avg       0.73      0.71      0.72        31
weighted avg       0.74      0.74      0.74        31



In [9]:
######################### SVM ###################
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build a Support Vector Machine model
svm_model = SVC(random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)


Accuracy: 0.84
Confusion Matrix:
 [[15  4]
 [ 1 11]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.79      0.86        19
           1       0.73      0.92      0.81        12

    accuracy                           0.84        31
   macro avg       0.84      0.85      0.84        31
weighted avg       0.86      0.84      0.84        31



In [10]:
!pip install xgboost



DEPRECATION: Loading egg at d:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [11]:
######################### XG BOOST###################
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

X = data.drop('diagnosis', axis=1)
y = data['diagnosis']
print(X)
print(y)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build an XGBoost model
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)


     localization  size  tobacco_use  alcohol_consumption  sun_exposure  \
0               2   4.5            2                    2             1   
1               2   0.7            2                    2             1   
2               4   1.5            0                    1             0   
3               5   2.0            3                    1             0   
4               5   2.5            3                    0             2   
..            ...   ...          ...                  ...           ...   
148             0   4.5            2                    2             1   
149             5   1.2            1                    1             0   
150             5   1.2            2                    2             1   
151             5   1.0            1                    1             1   
152             5   2.0            3                    1             0   

     gender  age_group  
0         1          1  
1         0          2  
2         1          2  

In [12]:
######################### NAIVE BAYES T###################
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Create a Gaussian Naive Bayes model
nb_model = GaussianNB()

# Train the model
nb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_nb = nb_model.predict(X_test)

# Evaluate the model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)
classification_rep_nb = classification_report(y_test, y_pred_nb)

# Display results
print(f"Naive Bayes Accuracy: {accuracy_nb:.2f}")
print("Naive Bayes Confusion Matrix:\n", conf_matrix_nb)
print("Naive Bayes Classification Report:\n", classification_rep_nb)


Naive Bayes Accuracy: 0.71
Naive Bayes Confusion Matrix:
 [[15  4]
 [ 5  7]]
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.79      0.77        19
           1       0.64      0.58      0.61        12

    accuracy                           0.71        31
   macro avg       0.69      0.69      0.69        31
weighted avg       0.71      0.71      0.71        31



In [13]:
################################ KNN T###################
from sklearn.neighbors import KNeighborsClassifier

# Create a KNN model (you can experiment with different values of 'n_neighbors')
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
classification_rep_knn = classification_report(y_test, y_pred_knn)

# Display results
print(f"KNN Accuracy: {accuracy_knn:.2f}")
print("KNN Confusion Matrix:\n", conf_matrix_knn)
print("KNN Classification Report:\n", classification_rep_knn)


KNN Accuracy: 0.77
KNN Confusion Matrix:
 [[17  2]
 [ 5  7]]
KNN Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.89      0.83        19
           1       0.78      0.58      0.67        12

    accuracy                           0.77        31
   macro avg       0.78      0.74      0.75        31
weighted avg       0.77      0.77      0.77        31



In [14]:
import pickle
pickle.dump(rf_model,open('model.pkl','wb'))
model = pickle.load(open('model.pkl','rb'))

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Drop unnecessary columns
data = df.drop(['id','patient_id'], axis=1)

# Convert categorical data to numerical format
label_encoder = LabelEncoder()
categorical_columns = ['localization', 'gender', 'tobacco_use', 'alcohol_consumption','sun_exposure', 'diagnosis']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

data