In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory (in os.walk order), then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/playground-series-s3e13/sample_submission.csv
/kaggle/input/playground-series-s3e13/train.csv
/kaggle/input/playground-series-s3e13/test.csv


In [2]:
# Read the competition's training and test tables from the Kaggle input mount.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e13/train.csv',
        '/kaggle/input/playground-series-s3e13/test.csv',
    )
)

In [3]:
# Summarise the training frame: index range, column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707 entries, 0 to 706
Data columns (total 66 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     707 non-null    int64  
 1   sudden_fever           707 non-null    float64
 2   headache               707 non-null    float64
 3   mouth_bleed            707 non-null    float64
 4   nose_bleed             707 non-null    float64
 5   muscle_pain            707 non-null    float64
 6   joint_pain             707 non-null    float64
 7   vomiting               707 non-null    float64
 8   rash                   707 non-null    float64
 9   diarrhea               707 non-null    float64
 10  hypotension            707 non-null    float64
 11  pleural_effusion       707 non-null    float64
 12  ascites                707 non-null    float64
 13  gastro_bleeding        707 non-null    float64
 14  swelling               707 non-null    float64
 15  nausea

In [4]:
# Show the test frame as the cell's rich output (pandas elides the middle rows and columns).
df_test

Unnamed: 0,id,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,...,lymph_swells,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash
0,707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,708,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,709,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,710,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,711,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,1005,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299,1006,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
300,1007,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
301,1008,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Check for missing values in the dataset
print(df_train.isnull().sum())

# Drop the 'id' column from the train and test datasets.
# errors='ignore' keeps this cell re-runnable: on a second execution 'id' is
# already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns='id', errors='ignore')
df_test = df_test.drop(columns='id', errors='ignore')

# Convert the 'prognosis' column from object to categorical
df_train['prognosis'] = pd.Categorical(df_train['prognosis'])

# Separate the feature columns from the target
X_features = df_train.drop(columns='prognosis')
y_target = df_train['prognosis']

# Hold out 20% of the rows for validation BEFORE fitting the scaler, so the
# validation rows contribute nothing to the scaling statistics (no leakage).
# test_size / random_state are unchanged, so the same rows are held out as before.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_features, y_target, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it everywhere else
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test_scaled = scaler.transform(df_test)

# Kept for backward compatibility: every labelled row, scaled with the
# training-only statistics (previously the scaler was fit on all of these rows).
X_train_scaled = scaler.transform(X_features)

id                0
sudden_fever      0
headache          0
mouth_bleed       0
nose_bleed        0
                 ..
ulcers            0
toenail_loss      0
speech_problem    0
bullseye_rash     0
prognosis         0
Length: 66, dtype: int64


In [6]:
# Shuffled, stratified 5-fold splitter and the seeded random forest it will score
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 combinations
params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, using every available core and per-fit logging
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=skf,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated score
# (no `scoring` is passed, so the estimator's default scorer is used)
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Hyperparameters: ", best_params)
print("Best Accuracy Score: ", best_score)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.7s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s


In [7]:
# Score the grid search's refitted best model on the held-out validation split
y_pred_val = grid_search.predict(X_val)
acc_score_val = accuracy_score(y_true=y_val, y_pred=y_pred_val)
class_report = classification_report(y_true=y_val, y_pred=y_pred_val)

# Overall accuracy first, then the per-class precision / recall / F1 table
print("Validation Accuracy Score:", acc_score_val)
print(class_report)

Validation Accuracy Score: 0.323943661971831
                       precision    recall  f1-score   support

          Chikungunya       0.60      0.75      0.67        12
               Dengue       0.23      0.50      0.32         6
Japanese_encephalitis       0.17      0.12      0.14        17
         Lyme_disease       0.47      0.64      0.54        11
              Malaria       0.23      0.30      0.26        10
               Plague       1.00      0.06      0.12        16
    Rift_Valley_fever       0.22      0.17      0.19        12
            Tungiasis       0.56      0.75      0.64        12
      West_Nile_fever       0.23      0.17      0.19        18
         Yellow_Fever       0.20      0.40      0.27        15
                 Zika       0.20      0.08      0.11        13

             accuracy                           0.32       142
            macro avg       0.37      0.36      0.31       142
         weighted avg       0.38      0.32      0.29       142



In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV

# Load the dataset
df_train = pd.read_csv('/kaggle/input/playground-series-s3e13/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s3e13/test.csv')

# Preprocess the data: encode the string labels as integer class ids
le = LabelEncoder()
df_train['prognosis'] = le.fit_transform(df_train['prognosis'])

# Build the full feature matrix / target once and reuse them below
X_all = df_train.drop(['id', 'prognosis'], axis=1)
y_all = df_train['prognosis']

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

# Define a classification model (default hyperparameters, fixed seed)
rfc = RandomForestClassifier(random_state=42)

# Train the model on the training set
rfc.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_val = rfc.predict(X_val)

# Evaluate the performance of the untuned model on the validation set
acc_score_val = accuracy_score(y_val, y_pred_val)
prec_score_val = precision_score(y_val, y_pred_val, average='weighted')
rec_score_val = recall_score(y_val, y_pred_val, average='weighted')
f1_score_val = f1_score(y_val, y_pred_val, average='weighted')
class_report_val = classification_report(y_val, y_pred_val)

print('Validation Accuracy Score:', acc_score_val)
print('Validation Precision Score:', prec_score_val)
print('Validation Recall Score:', rec_score_val)
print('Validation F1 Score:', f1_score_val)
print('Validation Classification Report:\n', class_report_val)

# Tune the hyperparameters of the model.
# 4 * 4 * 3 * 3 = 144 candidates x 5 folds = 720 fits, so spread them over all
# cores with n_jobs=-1; the selected parameters do not depend on the worker count.
param_grid = {'n_estimators': [10, 50, 100, 200],
              'max_depth': [None, 10, 20, 30],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Train the final model on the entire dataset.
# Build a fresh estimator from the winning parameters instead of refitting
# grid_search.best_estimator_ in place: that would silently change what
# grid_search.predict() returns (it would then have seen the validation rows).
rfc_final = RandomForestClassifier(**grid_search.best_estimator_.get_params())
rfc_final.fit(X_all, y_all)

# Make predictions on the test set
y_pred_test = rfc_final.predict(df_test.drop(['id'], axis=1))

# Assemble the submission table (ids + decoded class names) and display it.
# NOTE: nothing is written to disk in this cell.
submission_df = pd.DataFrame({'id': df_test['id'], 'prognosis': le.inverse_transform(y_pred_test)})
submission_df

Validation Accuracy Score: 0.33098591549295775
Validation Precision Score: 0.3482126303688027
Validation Recall Score: 0.33098591549295775
Validation F1 Score: 0.31240283211741565
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.83      0.87        12
           1       0.21      0.50      0.30         6
           2       0.09      0.06      0.07        17
           3       0.46      0.55      0.50        11
           4       0.18      0.30      0.22        10
           5       0.50      0.06      0.11        16
           6       0.25      0.17      0.20        12
           7       0.53      0.75      0.62        12
           8       0.19      0.17      0.18        18
           9       0.23      0.33      0.27        15
          10       0.36      0.31      0.33        13

    accuracy                           0.33       142
   macro avg       0.36      0.37      0.33       142
weighted avg       0.35    

Unnamed: 0,id,prognosis
0,707,Rift_Valley_fever
1,708,Dengue
2,709,West_Nile_fever
3,710,Japanese_encephalitis
4,711,Malaria
...,...,...
298,1005,West_Nile_fever
299,1006,Malaria
300,1007,Lyme_disease
301,1008,Rift_Valley_fever


In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable across "Restart & Run All" (42 matches the seed used for the split).
tf.keras.utils.set_random_seed(42)

# Load the dataset
df_train = pd.read_csv('/kaggle/input/playground-series-s3e13/train.csv')
df_test = pd.read_csv('/kaggle/input/playground-series-s3e13/test.csv')

# Preprocess the data: encode the string labels as integer class ids
le = LabelEncoder()
df_train['prognosis'] = le.fit_transform(df_train['prognosis'])

# Size the output layer from the labels actually present instead of hard-coding it
num_classes = len(le.classes_)

# Split the dataset into training and validation sets
X_all = df_train.drop(['id', 'prognosis'], axis=1)
X_train, X_val, y_train, y_val = train_test_split(X_all, df_train['prognosis'], test_size=0.2, random_state=42)

# Define the deep learning model: two hidden ReLU layers and a softmax over the classes
model = tf.keras.Sequential([
  tf.keras.layers.Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model; the sparse loss matches the integer-encoded targets
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model on the training set, monitoring the validation split each epoch
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

# Make predictions on the validation set (argmax of the class probabilities)
y_pred_val = model.predict(X_val)
y_pred_val = np.argmax(y_pred_val, axis=1)

# Evaluate the performance of the model on the validation set
acc_score_val = accuracy_score(y_val, y_pred_val)
prec_score_val = precision_score(y_val, y_pred_val, average='weighted')
rec_score_val = recall_score(y_val, y_pred_val, average='weighted')
f1_score_val = f1_score(y_val, y_pred_val, average='weighted')
class_report_val = classification_report(y_val, y_pred_val)

print('Validation Accuracy Score:', acc_score_val)
print('Validation Precision Score:', prec_score_val)
print('Validation Recall Score:', rec_score_val)
print('Validation F1 Score:', f1_score_val)
print('Validation Classification Report:\n', class_report_val)

# Make predictions on the test set and decode the class ids back to label names
X_test = df_test.drop(['id'], axis=1)
y_pred_test = model.predict(X_test)
y_pred_test = le.inverse_transform(np.argmax(y_pred_test, axis=1))

# Write the submission file (id + predicted prognosis) to the working directory
df_submission = pd.DataFrame({'id': df_test['id'], 'prognosis': y_pred_test})
df_submission.to_csv('submission.csv', index=False)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Validation Accuracy Score: 0.323943661971831
Validation Precision Score: 0.3363183604865897
Validation Recall Score: 0.323943661971831
Validation F1 Score: 0.3234775232812047
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.75      0.78        12
           1       0.27      0.50      0.35         6
           2       0.18      0.18   