In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, classification_report, mean_squared_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

In [11]:
doc_visit = pd.read_csv('NPHA-doctor-visits.csv')
# df.info()

In [13]:
# Drop the Age column since it has only one unique value (2)
 # print(doc_visit.columns)
doc_visit.drop(['Age'], axis=1, inplace=True)

In [17]:
#Replace only -1 with None (NaN)
doc_visit.replace(-1, pd.NA, inplace=True)

#Check if the replacement worked
# print(doc_visit.isnull().sum())  # This will give you the count of NaN values per column(values are now empty there)

In [19]:
# Step 2: Optionally, Impute the missing values (NaN) with the mode or most frequent value
# For this example, we'll impute with the most frequent value for each column.
doc_visit.fillna(doc_visit.mode().iloc[0], inplace=True)
doc_visit = doc_visit.infer_objects(copy=False)


In [21]:
# Define features (X) and target (y)
X = doc_visit.drop("Number of Doctors Visited", axis=1)  # Features
y = doc_visit["Number of Doctors Visited"]  # Target

# Check the shape of X and y to ensure they are consistent
print(X.shape)
print(y.shape)

(714, 13)
(714,)


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("Class distribution after SMOTE:", Counter(y_train_resampled))

Class distribution after SMOTE: Counter({3: 298, 2: 298, 1: 298})


In [27]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [31]:
# Evaluate the performance of the model

def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    # Print evaluation results
    print("Confusion Matrix:\n", matrix)
    print("Classification Report:\n", class_report)

In [35]:
X_scaled = np.vstack((X_train_scaled, X_test_scaled))
y = np.concatenate((y_train_resampled, y_test))

# Initializing Models
models = {
    "LogisticRegression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVC": SVC(kernel ='linear'),
    "DecisionTree": DecisionTreeClassifier()
}

cv = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

# Producing cross validation score for the models
for model_name in models:
    model = models[model_name]
    
    # Evaluate the model's accuracy using cross-validation
    accuracies = cross_val_score(model, X_scaled, y, cv=cv, scoring='accuracy')
    
    print("🚀", model_name)
    print("Average accuracy:", np.mean(accuracies))

    model.fit(X_train_scaled, y_train_resampled)
    evaluate(model, X_test_scaled, y_test)

🚀 LogisticRegression
Average accuracy: 0.4358569828230022
Confusion Matrix:
 [[ 5 15  8]
 [19 26 29]
 [ 7 15 19]]
Classification Report:
               precision    recall  f1-score   support

           1       0.16      0.18      0.17        28
           2       0.46      0.35      0.40        74
           3       0.34      0.46      0.39        41

    accuracy                           0.35       143
   macro avg       0.32      0.33      0.32       143
weighted avg       0.37      0.35      0.35       143

🚀 Random Forest
Average accuracy: 0.5939973861090365
Confusion Matrix:
 [[ 5 13 10]
 [10 44 20]
 [ 5 20 16]]
Classification Report:
               precision    recall  f1-score   support

           1       0.25      0.18      0.21        28
           2       0.57      0.59      0.58        74
           3       0.35      0.39      0.37        41

    accuracy                           0.45       143
   macro avg       0.39      0.39      0.39       143
weighted avg       0.4

In [11]:
import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
doc_visit = pd.read_csv("NPHA-doctor-visits.csv")
doc_visit.replace(-1, np.nan, inplace=True)  # Replace invalid values
doc_visit.fillna(doc_visit.mode().iloc[0], inplace=True)  # Impute missing values

# Encode categorical features
encoder = OneHotEncoder(sparse_output=False)
y_encoded = encoder.fit_transform(doc_visit['Number of Doctors Visited'].values.reshape(-1, 1))

# Drop target column
X = doc_visit.drop('Number of Doctors Visited', axis=1)

# Scale the features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# Function to build the neural network model
def create_model():
    model = Sequential([
        Dense(64, input_dim=X_train.shape[1], activation='relu'),  # Hidden layer
        Dropout(0.2),  # Dropout for regularization
        Dense(32, activation='relu'),  # Another hidden layer
        Dense(3, activation='softmax')  # Output layer for 3 classes
    ])
    model.compile(optimizer='adam', 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy'])
    return model

# Wrap the model in KerasClassifier
model = KerasClassifier(build_fn=create_model, epochs=50, batch_size=16, verbose=1)

# Perform cross-validation to evaluate the model
cross_val_scores = cross_val_score(model, X_scaled, y_encoded, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {cross_val_scores}")
print(f"Mean cross-validation score: {cross_val_scores.mean()}")

# Train the model on the training data
history = model.fit(X_train, y_train, validation_data=(X_test, y_test))

# Evaluate the model on the test set
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Classification Report
print("Classification Report:\n", classification_report(y_test_classes, y_pred_classes))
print("Test Accuracy:", accuracy_score(y_test_classes, y_pred_classes))

# Confusion Matrix
cm = confusion_matrix(y_test_classes, y_pred_classes)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=encoder.categories_[0], yticklabels=encoder.categories_[0])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Optionally, visualize the training and validation accuracy over epochs
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title("Training vs Validation Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


ModuleNotFoundError: No module named 'tensorflow.keras.wrappers'