In [1]:
# GAUSSIAN NAIVE BAYES BEFORE FEATURE SELECTION

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

import pandas as pd
data = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

X = data[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = data['CLASS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% testing

gnb = GaussianNB()

gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Gaussian NB: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Gaussian NB: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Gaussian NB:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Gaussian NB:", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Gaussian NB: ", recall)


Accuracy for Gaussian NB:  0.9333333333333333
F1 Score for Gaussian NB:  0.9327318776235065
Kappa Score for Gaussian NB: 0.7896434456403605
Precision for Gaussian NB: 0.9326480829748804
Recall for Gaussian NB:  0.9333333333333333


  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
# GAUSSIAN NAIVE BAYES AFTER FEATURE SELECTION

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

import pandas as pd
data = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

X = data[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = data['CLASS']

#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(5,'Score'))  #print 10 best features

# Now you can select only those features from X which are in the top 10
top_features = featureScores.nlargest(5,'Score')['Specs'].values
X = X[top_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% testing

gnb = GaussianNB()

gnb.fit(X_train, y_train)

y_pred = gnb.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Gaussian NB: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Gaussian NB: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Gaussian NB:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Gaussian NB:", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Gaussian NB: ", recall)


   Specs       Score
0    AGE  323.975398
9    BMI  283.742879
3  HbA1c  242.873059
8   VLDL  111.041655
2     Cr   80.716030
Accuracy for Gaussian NB:  0.9366666666666666
F1 Score for Gaussian NB:  0.9362141471343924
Kappa Score for Gaussian NB: 0.7998525229116191
Precision for Gaussian NB: 0.9363384522734117
Recall for Gaussian NB:  0.9366666666666666


  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
#Random Forests BEFORE FEATURE SELECTION

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

data = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

X = data[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = data['CLASS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Random Forests: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Random Forests: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Random Forests:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Random Forests:", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Random Forests: ", recall)


Accuracy for Random Forests:  0.9766666666666667
F1 Score for Random Forests:  0.9748732886984196
Kappa Score for Random Forests: 0.9239102866045872
Precision for Random Forests: 0.9737291947818263
Recall for Random Forests:  0.9766666666666667


  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
#Random Forests AFTER FEATURE SELECTION
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.metrics import cohen_kappa_score

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

import pandas as pd
data = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

X = data[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = data['CLASS']

#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=6)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(6,'Score'))  #print 10 best features

# Now you can select only those features from X which are in the top 10
top_features = featureScores.nlargest(6,'Score')['Specs'].values
X = X[top_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Random Forests: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Random Forests: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Random Forests:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Random Forests:", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Random Forests: ", recall)


   Specs       Score
0    AGE  323.975398
9    BMI  283.742879
3  HbA1c  242.873059
8   VLDL  111.041655
2     Cr   80.716030
5     TG   28.204749
Accuracy for Random Forests:  0.97
F1 Score for Random Forests:  0.9681861471861472
Kappa Score for Random Forests: 0.9006659063316287
Precision for Random Forests: 0.9669076305220885
Recall for Random Forests:  0.97


  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
#Gradient Boosting BEFORE FEATURE SELECTION

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = data[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = data['CLASS']

# Assuming X is your features and y is your target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a GradientBoostingClassifier object
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)

# Train the model
gb_clf.fit(X_train, y_train)

# Make predictions
y_pred = gb_clf.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Gradient Boosting: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Gradient Boosting: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Gradient Boosting:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Gradient Boosting: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Gradient Boosting: ", recall)


Accuracy for Gradient Boosting:  0.915
F1 Score for Gradient Boosting:  0.9190275429331581
Kappa Score for Gradient Boosting: 0.7177017602125539
Precision for Gradient Boosting:  0.9281018518518519
Recall for Gradient Boosting:  0.915


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
#Gradient Boosting AFTER FEATURE SELECTION

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = data[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = data['CLASS']

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=4)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(4,'Score'))  #print 10 best features

# Now you can select only those features from X which are in the top 10
top_features = featureScores.nlargest(4,'Score')['Specs'].values
X = X[top_features]

# Assuming X is your features and y is your target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a GradientBoostingClassifier object
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=42)

# Train the model
gb_clf.fit(X_train, y_train)

# Make predictions
y_pred = gb_clf.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Gradient Boosting: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Gradient Boosting: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Gradient Boosting:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Gradient Boosting: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Gradient Boosting: ", recall)


   Specs       Score
0    AGE  323.975398
9    BMI  283.742879
3  HbA1c  242.873059
8   VLDL  111.041655
Accuracy for Gradient Boosting:  0.91
F1 Score for Gradient Boosting:  0.9173768148127586
Kappa Score for Gradient Boosting: 0.7194950911640954
Precision for Gradient Boosting:  0.9371721822209915
Recall for Gradient Boosting:  0.91


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Logistic Regression BEFORE FEATURE SELECTION

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd

# Load your dataset
df = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

# Assume that you are trying to predict a multi-class outcome variable 'y' based on some features 'X1', 'X2', 'X3'
X = df[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = df['CLASS']

# Split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create an instance of the Logistic Regression model
logistic_regression= LogisticRegression(multi_class='multinomial',max_iter=35000)

# Fit the model using the training data
logistic_regression.fit(X_train,y_train)

# Use the model to make predictions on the test data
y_pred=logistic_regression.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Logistic Regression: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Logistic Regression: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Logistic Regression:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Logistic Regression: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Logistic Regression: ", recall)

Accuracy for Logistic Regression:  0.9566666666666667
F1 Score for Logistic Regression:  0.9543703703703704
Kappa Score for Logistic Regression: 0.8134506840141587
Precision for Logistic Regression:  0.9549404761904761
Recall for Logistic Regression:  0.9566666666666667


In [8]:
# Logistic Regression AFTER FEATURE SELECTION

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd

# Load your dataset
df = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

# Assume that you are trying to predict a multi-class outcome variable 'y' based on some features 'X1', 'X2', 'X3'
X = df[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = df['CLASS']

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=6)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(6,'Score'))  #print 10 best features

# Now you can select only those features from X which are in the top 10
top_features = featureScores.nlargest(6,'Score')['Specs'].values
X = X[top_features]

# Split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create an instance of the Logistic Regression model
logistic_regression= LogisticRegression(multi_class='multinomial',max_iter=35000)

# Fit the model using the training data
logistic_regression.fit(X_train,y_train)

# Use the model to make predictions on the test data
y_pred=logistic_regression.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Logistic Regression: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Logistic Regression: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Logistic Regression:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Logistic Regression: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Logistic Regression: ", recall)

   Specs       Score
0    AGE  323.975398
9    BMI  283.742879
3  HbA1c  242.873059
8   VLDL  111.041655
2     Cr   80.716030
5     TG   28.204749
Accuracy for Logistic Regression:  0.9466666666666667
F1 Score for Logistic Regression:  0.9346499994150083
Kappa Score for Logistic Regression: 0.7498697238144867
Precision for Logistic Regression:  0.9335132669983417
Recall for Logistic Regression:  0.9466666666666667


In [9]:
#Extra Trees BEFORE FEATURE SELECTION

from sklearn.ensemble import ExtraTreesClassifier

extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=0)

extra_trees.fit(X_train, y_train)

y_pred = extra_trees.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Extra Trees: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Extra Trees: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Extra Trees:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Extra Trees: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Extra Trees: ", recall)

Accuracy for Extra Trees:  0.9666666666666667
F1 Score for Extra Trees:  0.9695156188616135
Kappa Score for Extra Trees: 0.8586838758302322
Precision for Extra Trees:  0.9730139011690736
Recall for Extra Trees:  0.9666666666666667


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
#Extra Trees AFTER FEATURE SELECTION

# Import necessary libraries
from sklearn.model_selection import train_test_split

from sklearn import metrics
import pandas as pd

# Load your dataset
df = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

# Assume that you are trying to predict a multi-class outcome variable 'y' based on some features 'X1', 'X2', 'X3'
X = df[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = df['CLASS']

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=4)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(4,'Score'))  #print 10 best features

# Now you can select only those features from X which are in the top 10
top_features = featureScores.nlargest(4,'Score')['Specs'].values
X = X[top_features]

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.ensemble import ExtraTreesClassifier

extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
extra_trees.fit(X_train, y_train)

y_pred = extra_trees.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Extra Trees: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Extra Trees: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Extra Trees:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Extra Trees: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Extra Trees: ", recall)

   Specs       Score
0    AGE  323.975398
9    BMI  283.742879
3  HbA1c  242.873059
8   VLDL  111.041655
Accuracy for Extra Trees:  0.9766666666666667
F1 Score for Extra Trees:  0.9801526649802514
Kappa Score for Extra Trees: 0.9043802932337675
Precision for Extra Trees:  0.9839481193255512
Recall for Extra Trees:  0.9766666666666667


  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Bagging Trees BEFORE FEATURE SELECTION

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier
tree = DecisionTreeClassifier()

# Create a bagging classifier with the decision tree
bagging = BaggingClassifier(base_estimator=tree, n_estimators=100, random_state=0)

# Fit the model to your training data
bagging.fit(X_train, y_train)

# Make predictions on the test data
y_pred = bagging.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Bagging: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Bagging: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Bagging:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Bagging: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Bagging: ", recall)



Accuracy for Bagging:  0.9866666666666667
F1 Score for Bagging:  0.9900634624772556
Kappa Score for Bagging: 0.9453601675621528
Precision for Bagging:  0.9937777777777778


  _warn_prf(average, modifier, msg_start, len(result))


Recall for Bagging:  0.9866666666666667


In [12]:
#Bagging AFTER FEATURE SELECTION

# Import necessary libraries
from sklearn.model_selection import train_test_split

from sklearn import metrics
import pandas as pd

# Load your dataset
df = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

# Assume that you are trying to predict a multi-class outcome variable 'y' based on some features 'X1', 'X2', 'X3'
X = df[['AGE','Urea', 'Cr', 'HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y = df['CLASS']

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=4)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(4,'Score'))  #print 10 best features

# Now you can select only those features from X which are in the top 10
top_features = featureScores.nlargest(4,'Score')['Specs'].values
X = X[top_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Bagging Trees BEFORE FEATURE SELECTION

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier
tree = DecisionTreeClassifier()

# Create a bagging classifier with the decision tree
bagging = BaggingClassifier(base_estimator=tree, n_estimators=100, random_state=0)

# Fit the model to your training data
bagging.fit(X_train, y_train)

# Make predictions on the test data
y_pred = bagging.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Bagging: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Bagging: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Bagging:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Bagging: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Bagging: ", recall)

  _warn_prf(average, modifier, msg_start, len(result))


   Specs       Score
0    AGE  323.975398
9    BMI  283.742879
3  HbA1c  242.873059
8   VLDL  111.041655
Accuracy for Bagging:  0.9866666666666667
F1 Score for Bagging:  0.9900634624772556
Kappa Score for Bagging: 0.9453601675621528
Precision for Bagging:  0.9937777777777778
Recall for Bagging:  0.9866666666666667


In [16]:
# XGBoost BEFORE FEATURE SELECTION

import xgboost as xgb
from sklearn.metrics import accuracy_score

import pandas as pd
data = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])

data = pd.get_dummies(data, columns=['Gender'])

X = data.drop('CLASS', axis=1)
y = data['CLASS']
# Remove leading/trailing spaces
y = y.str.strip()

# Convert categorical variable to numerical
y = y.map({'N': 0, 'P': 1, 'Y': 2})

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Assuming y_test are your true classes and y_pred are your predicted classes

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for XGBoost: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for XGBoost: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for XGBoost:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for XGBoost: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for XGBoost: ", recall)

Accuracy for XGBoost:  0.995
F1 Score for XGBoost:  0.995056727932834
Kappa Score for XGBoost: 0.9814281734608599
Precision for XGBoost:  0.9952500000000001
Recall for XGBoost:  0.995


In [17]:
from sklearn.metrics import confusion_matrix

# Assuming y_test are your true labels and y_pred are the predicted labels
cm = confusion_matrix(y_test, y_pred)

print(cm)

[[ 19   0   0]
 [  0  11   0]
 [  1   0 169]]


In [14]:
# XGBoost AFTER FEATURE SELECTION
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

import pandas as pd
data = pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])

data = pd.get_dummies(data, columns=['Gender'])

X = data.drop('CLASS', axis=1)
y = data['CLASS']
# Remove leading/trailing spaces
y = y.str.strip()

# Convert categorical variable to numerical
y = y.map({'N': 0, 'P': 1, 'Y': 2})

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Get feature importances
importances = model.feature_importances_

# Create a DataFrame for visualization
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the feature importances
print(importance_df)

# Select the top 4 features
top_4_features = importance_df['Feature'].iloc[:4]

# Select only the top 4 features from your train and test data
X_train_selected = X_train[top_4_features]
X_test_selected = X_test[top_4_features]

# Train and evaluate a new model on the selected features
model_selected = xgb.XGBClassifier()
model_selected.fit(X_train_selected, y_train)

y_pred = model_selected.predict(X_test_selected)

from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

# Accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for XGBoost: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for XGBoost: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for XGBoost:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for XGBoost: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for XGBoost: ", recall)

      Feature  Importance
5       HbA1c    0.299015
11        BMI    0.247056
2         AGE    0.124813
6        Chol    0.096337
7          TG    0.054843
10       VLDL    0.048931
4          Cr    0.037668
8         HDL    0.034904
1   No_Pation    0.021386
9         LDL    0.019407
0          ID    0.011032
12   Gender_0    0.002851
3        Urea    0.001758
13   Gender_1    0.000000
Accuracy for XGBoost:  0.99
F1 Score for XGBoost:  0.99
Kappa Score for XGBoost: 0.9623281220568846
Precision for XGBoost:  0.99
Recall for XGBoost:  0.99


In [15]:
#Decision Trees 

#loading the required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

df=pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

x=df[['AGE','Urea','Cr','HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y=df[['CLASS']]

x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.7,random_state=100)

clf=DecisionTreeClassifier() 
clf=clf.fit(x_train, y_train) 
y_pred=clf.predict(x_test) 

print("Accuracy of Decision Tree:",round(metrics.accuracy_score(y_test, y_pred),2))
print("Precision of Decision Tree:",round(metrics.precision_score(y_test,y_pred,average='weighted'),2))
print("F1-Score of Decision Tree:",round(metrics.f1_score(y_test,y_pred,average='weighted'),2))
print("Kappa Index of Decision Tree:",round(metrics.cohen_kappa_score(y_test,y_pred),2))
print("Recall of Decision Tree:",round(metrics.recall_score(y_pred,y_test,average='weighted'),2))
print("Log loss of Decision Tree:",round(metrics.log_loss(y_pred,y_test),2))


Accuracy of Decision Tree: 0.97
Precision of Decision Tree: 0.97
F1-Score of Decision Tree: 0.97
Kappa Index of Decision Tree: 0.88
Recall of Decision Tree: 0.97


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: could not convert string to float: 'Y'

In [None]:
# Support Vector Machines

from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics

df=pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

x=df[['AGE','Urea','Cr','HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y=df[['CLASS']]

x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.6,random_state=100)

#code to create a support vector machine
clf=svm.SVC(kernel='linear') #creating an svm classifier (a linear kernel)
clf.fit(x_train,y_train.values.ravel()) #training the classifier
y_pred=clf.predict(x_test) #predicting the response for the chosen dataset

print("Accuracy of Support Vector Machine:",round(metrics.accuracy_score(y_test, y_pred),2))
print("Precision of Support Vector Machine:",round(metrics.precision_score(y_test,y_pred,average='weighted'),2))
print("F1-Score of Support Vector Machine:",round(metrics.f1_score(y_test,y_pred,average='weighted'),2))
print("Kappa Index of Support Vector Machine:",round(metrics.cohen_kappa_score(y_test,y_pred),2))
print("Recall of Support Vector Machine:",round(metrics.recall_score(y_pred,y_test,average='weighted'),2))


In [None]:
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score
import pandas as pd
per = Perceptron()

df=pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

x=df[['AGE','Urea','Cr','HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y=df[['CLASS']]

x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.6,random_state=100)

# Fit the model to your training data
per.fit(x_train, y_train)

# Make predictions on the test data
y_pred = per.predict(x_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Perceptron: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Perceptron: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Perceptron:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Perceptron: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred, average='weighted') # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Perceptron: ", recall)

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score
import pandas as pd

df=pd.read_csv('C:\DiabetesData\Diabetes_Dataset.csv')

X=df[['AGE','Urea','Cr','HbA1c','Chol','TG','HDL','LDL','VLDL','BMI']]
y=df[['CLASS']]

# Split your dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a perceptron model
model = Perceptron()

# Train the model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy for Perceptron: ", accuracy)

# F1 Score
f1_score = metrics.f1_score(y_test, y_pred) # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("F1 Score for Perceptron: ", f1_score)

# Kappa Score
kappa_score = cohen_kappa_score(y_test, y_pred)
print("Kappa Score for Perceptron:", kappa_score)

# Precision
precision = metrics.precision_score(y_test, y_pred) # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Precision for Perceptron: ", precision)

# Recall
recall = metrics.recall_score(y_test, y_pred) # you can change the average parameter to 'micro', 'macro', 'weighted', depending on your problem
print("Recall for Perceptron: ", recall)