In [60]:
# Load the data
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

from sklearn.svm import SVC
from sklearn.discriminant_analysis import StandardScaler

In [61]:
data = pd.read_csv('../Input/diabetes_clean_with_distribution.csv')

In [62]:
# Data Preprocessing
# Split the Data into features and Target
X = data.drop('Outcome', axis=1)
y = data.Outcome

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

### XGBoost best 82.47%

In [63]:
# Define the hyperparameter grid
xgboost_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'gamma': [0, 0.1, 0.5]
}

In [64]:
# Train the XGBoost classifier
xgboost_model = XGBClassifier()

# Train the model on the training data
xgboost_model.fit(X_train, y_train)

In [65]:
# Tune the hyperparameters using GridSearchCV
xgboost_grid_search = GridSearchCV(xgboost_model, xgboost_param_grid, cv=5)
xgboost_grid_search.fit(X_train, y_train)

# Get the best hyperparameters
xgboost_best_params = xgboost_grid_search.best_params_

# Create a new XGBoost classifier with the best hyperparameters
xgboost_best_model = XGBClassifier(**xgboost_best_params)

# Train the best model on the training data
xgboost_best_model.fit(X_train, y_train)

In [66]:
# Evaluate the best model on the validation data
xgboost_y_pred = xgboost_best_model.predict(X_test)
xgboost_accuracy = accuracy_score(y_test, xgboost_y_pred)

# Print the accuracy
print(f'Accuracy: {xgboost_accuracy * 100: .2f}%')

Accuracy:  82.47%


### Random Forest 81.17%

In [67]:
# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=7)
rf_model.fit(X_train, y_train)

In [68]:
# Make predictions
rf_y_pred = rf_model.predict(X_test)

In [69]:
# Evaluate the model
rf_accuracy = accuracy_score(y_test, rf_y_pred)

# Print the results
print("Accuracy: {:.2f}%".format(rf_accuracy * 100))

Accuracy: 81.17%


### KNN best 80.52%

In [70]:
# Standardize the features (important for KNN)
scaler = StandardScaler()
knn_X_train = scaler.fit_transform(X_train)
knn_X_test = scaler.transform(X_test)

In [71]:
# Define the range of k values to search
knn_param_grid = {'n_neighbors': range(1, 21)}  # You can adjust the range as needed

In [72]:
# Initialize the KNN classifier
k = 3  # You can adjust the number of neighbors (k) as needed
knn_model = KNeighborsClassifier(n_neighbors=k)

# Create a GridSearchCV object
knn_grid_search = GridSearchCV(knn_model, knn_param_grid, cv=5)  # Use cross-validation with 5 folds

# Fit the GridSearchCV to your data
knn_grid_search.fit(knn_X_train, y_train)

# Fit the classifier to the training data
knn_model.fit(knn_X_train, y_train)

In [73]:
# Get the best parameter (k value)
best_k = knn_grid_search.best_params_['n_neighbors']

# Create a new KNN classifier with the best k value
knn_best_model = KNeighborsClassifier(n_neighbors=best_k)

# Fit the best classifier to the training data
knn_best_model.fit(knn_X_train, y_train)

In [74]:
# Evaluate the best model on the validation data
knn_y_pred = knn_best_model.predict(knn_X_test)

In [75]:
# Evaluate the best model with the best k value
knn_accuracy = round(accuracy_score(y_test, knn_y_pred) * 100, 2)

print(f"Best k: {best_k}")
print(f"Accuracy with best k: {knn_accuracy}%")

Best k: 15
Accuracy with best k: 80.52%


### Logistic Regression best 81.82%

In [76]:
# Feature scaling
scaler = StandardScaler()
log_reg_X_train = scaler.fit_transform(X_train)
log_reg_X_test = scaler.transform(X_test)

# Perform SMOTE for class imbalance
smote = SMOTE(sampling_strategy='auto', random_state=7)
log_reg_X_train_resampled, log_reg_y_train_resampled = smote.fit_resample(log_reg_X_train, y_train)

In [77]:
# Hyperparameter tuning using GridSearchCV
log_reg_param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10]
}

In [78]:
grid_search = GridSearchCV(LogisticRegression(), log_reg_param_grid, cv=5)
grid_search.fit(log_reg_X_train_resampled, log_reg_y_train_resampled)
log_reg_best_model = grid_search.best_estimator_

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\Anaconda\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.73697569        nan 0.73821793]


In [79]:
# Make predictions with the best model
log_reg_y_pred = log_reg_best_model.predict(log_reg_X_test)

In [80]:
# Evaluate the model
log_reg_accuracy = round(accuracy_score(y_test, log_reg_y_pred) * 100, 2)
print("Accuracy:", log_reg_accuracy, "%")

Accuracy: 81.82 %


### SVM 78.57%

In [81]:
# Standardize the feature data
scaler = StandardScaler()
svm_X_train = scaler.fit_transform(X_train)
svm_X_test = scaler.transform(X_test)

In [82]:
# Create an SVM model
svm_model = SVC(kernel='linear', C=1.0)

In [83]:
# Train the SVM model on the training data
svm_model.fit(svm_X_train, y_train)

# Make predictions on the test data
svm_y_pred = svm_model.predict(svm_X_test)

In [84]:
# Evaluate the model
svm_accuracy = accuracy_score(y_test, svm_y_pred)

# Print the results
print("Accuracy: {:.2f}%".format(svm_accuracy * 100))

Accuracy: 78.57%


### Poll Final Algorithm
#### Assumption: Out of total opinions made, we know, when to listen and to whom
1. On a report, each of the algorithms make a decision
2. It is found that, at least one of them makes a right choice 93.4 % of the time
3. If we could decide, when to listen to a particular decision, we can make about 93% right decisions
4. This idea came from overlap between decisions

In [85]:
# At least one algorithm guesses it right
def at_least_one(common_test, models):
    correct = 0
    value_not_set = True
    right_guesses = []

    for i in range(len(common_test)):
        guess = []

        for j in range(len(models)):
            if models[j][i] == common_test[i]:
                guess.append(j)
                if value_not_set:
                    correct += 1
                    value_not_set = False
        
        if value_not_set:
            guess = np.nan
        else:
            value_not_set = True
        
        right_guesses.append(guess)

    accuracy = (correct / len(common_test)) * 100
    return accuracy

In [86]:
# Models' predictions in descending order of their accuracy
models_pred = [list(xgboost_y_pred), list(log_reg_y_pred), list(rf_y_pred), list(knn_y_pred), list(svm_y_pred)]
common_test_set = y_test.values

In [87]:
at_least_one(common_test_set, models_pred)

93.5064935064935

In [94]:
# only if 80+ does
def skilled_3(common_test, models):
    correct = 0
    value_not_set = True
    right_guesses = []

    for i in range(len(common_test)):
        guess = []

        for j in range(3):
            if models[j][i] == common_test[i]:
                guess.append(j)
                if value_not_set:
                    correct += 1
                    value_not_set = False
        
        if value_not_set:
            guess = np.nan
        else:
            value_not_set = True
        
        right_guesses.append(guess)

    accuracy = (correct / len(common_test)) * 100
    return accuracy

In [95]:
skilled_3(common_test_set, models_pred)

90.9090909090909

In [90]:
# Fresher contributors but all
def all_freshers_advice(common_test, models):
    right_guesses = []
    correct = 0

    for i in range(len(common_test)):
        guess = []

        for j in range(3, len(models)):
            if models[j][i] == common_test[i]:
                guess.append(j)
        
        if len(guess) < 2:
            guess = np.nan
        else:
            right_guesses.append(guess)
            correct += 1

    accuracy = (correct / len(common_test)) * 100
    return accuracy

In [91]:
all_freshers_advice(common_test_set, models_pred)

73.37662337662337

In [96]:
# Last 2 selects and at least one topper supports
def fresher_advice_with_skilled(common_test, models):
    correct = 0
    value_not_set = True
    right_guesses = []
    contribution = False

    for i in range(len(common_test)):
        guess = []

        for j in range(len(models)):
            if models[j][i] == common_test[i]:
                if j < 3 or contribution:
                    guess.append(j)
                    if not contribution: contribution = True

                    if value_not_set:
                        correct += 1
                        value_not_set = False
        
        if value_not_set:
            guess = np.nan
        else:
            value_not_set = True
        
        right_guesses.append(guess)

    accuracy = (correct / len(common_test)) * 100
    return accuracy

In [97]:
fresher_advice_with_skilled(common_test_set, models_pred)

93.5064935064935