# import required libraries

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# dataset

## loading dataset

In [45]:
dataset = pd.read_csv('/content/drive/MyDrive/Machine_Learning/symptoms_based_disease_prediction/data/Training.csv')
dataset

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
4916,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,Acne
4917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Urinary tract infection
4918,0,1,0,0,0,0,1,0,0,0,...,0,0,1,1,1,1,0,0,0,Psoriasis


In [46]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 133 entries, itching to prognosis
dtypes: int64(132), object(1)
memory usage: 5.0+ MB


## data pre-processing

### preparing labels

In [47]:
X = dataset.drop('prognosis', axis=1)
y = dataset['prognosis']

# encode labels
le = LabelEncoder()
y = le.fit_transform(y)

### dataset splitting

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20, stratify=y)

### data augmentation

In [49]:
def augment_data(X, noise_level=0.25):
    X_noisy = X.copy()
    for i in range(len(X_noisy)):
        mask = np.random.rand(X_noisy.shape[1]) < noise_level
        X_noisy.iloc[i, mask] = 1 - X_noisy.iloc[i, mask]
    return X_noisy

X_train_aug = augment_data(X_train)

# model

In [50]:
models = {
    'SVC': SVC(kernel='linear'),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'KNeighbors': KNeighborsClassifier(n_neighbors=5),
    'MultinomialNB': MultinomialNB()
}

## train models

In [51]:
for model_name, model in models.items():
    model.fit(X_train_aug, y_train)
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')
    cm = confusion_matrix(y_test, predictions)

    # results
    print(f"{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(np.array2string(cm, separator=', '))
    print("\n" + "-"*50 + "\n")

SVC Results:
Accuracy: 0.9885
Precision: 0.9903
Recall: 0.9885
F1-Score: 0.9882
Confusion Matrix:
[[36,  0,  0, ...,  0,  0,  0],
 [ 0, 36,  0, ...,  0,  0,  0],
 [ 0,  0, 36, ...,  0,  0,  0],
 ...,
 [ 0,  0,  0, ..., 36,  0,  0],
 [ 0,  0,  0, ...,  0, 36,  0],
 [ 0,  0,  0, ...,  0,  0, 36]]

--------------------------------------------------

RandomForest Results:
Accuracy: 0.9973
Precision: 0.9975
Recall: 0.9973
F1-Score: 0.9973
Confusion Matrix:
[[36,  0,  0, ...,  0,  0,  0],
 [ 0, 36,  0, ...,  0,  0,  0],
 [ 0,  0, 36, ...,  0,  0,  0],
 ...,
 [ 0,  0,  0, ..., 36,  0,  0],
 [ 0,  0,  0, ...,  0, 36,  0],
 [ 0,  0,  0, ...,  0,  0, 36]]

--------------------------------------------------

GradientBoosting Results:
Accuracy: 0.9993
Precision: 0.9993
Recall: 0.9993
F1-Score: 0.9993
Confusion Matrix:
[[36,  0,  0, ...,  0,  0,  0],
 [ 0, 36,  0, ...,  0,  0,  0],
 [ 0,  0, 36, ...,  0,  0,  0],
 ...,
 [ 0,  0,  0, ..., 36,  0,  0],
 [ 0,  0,  0, ...,  0, 36,  0],
 [ 0,  0,  0, ..

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## neural network

In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(len(le.classes_), activation='softmax')
])

nn_model.compile(optimizer=Adam(learning_rate=0.01), loss='sparse_categorical_crossentropy')
nn_model.fit(X_train_aug, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# predictions
nn_predictions = np.argmax(nn_model.predict(X_test), axis=1)

nn_accuracy = accuracy_score(y_test, nn_predictions)
nn_precision = precision_score(y_test, nn_predictions, average='weighted', zero_division=0)
nn_recall = recall_score(y_test, nn_predictions, average='weighted', zero_division=0)
nn_f1 = f1_score(y_test, nn_predictions, average='weighted', zero_division=0)

# results
print(f"Neural Network Results:")
print(f"Accuracy: {nn_accuracy:.4f}")
print(f"Precision: {nn_precision:.4f}")
print(f"Recall: {nn_recall:.4f}")
print(f"F1-Score: {nn_f1:.4f}")

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 3.6969 - val_loss: 2.7039
Epoch 2/30
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 3.2170 - val_loss: 1.8962
Epoch 3/30
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2.9028 - val_loss: 1.3082
Epoch 4/30
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2.7035 - val_loss: 0.9590
Epoch 5/30
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2.5017 - val_loss: 0.7669
Epoch 6/30
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2.4143 - val_loss: 0.7611
Epoch 7/30
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 2.2711 - val_loss: 0.5906
Epoch 8/30
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 2.1843 - val_loss: 0.4519
Epoch 9/30
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━

In [57]:
import pickle
with open('/content/drive/MyDrive/Machine_Learning/symptoms_based_disease_prediction/model.pkl', 'wb') as f:
    pickle.dump(models['GradientBoosting'], f)