In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

class Ensemble:
    """Soft-voting ensemble of six scikit-learn classifiers.

    Workflow: call ``load_data()`` to read the CSV and build the train/test
    split, then ``__VotingClassifier__()`` to fit the ensemble and print its
    train and test accuracy.
    """

    def __init__(self):
        # Filled in by load_data(); None means no data has been loaded yet.
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

    def load_data(self, path='amyloid.csv', test_size=0.25, random_state=23):
        """Read the dataset, drop incomplete rows and store a train/test split.

        Args:
            path: CSV file to read; the literal ``'--'`` is parsed as missing.
            test_size: Fraction of rows held out for testing.
            random_state: Seed for the shuffle performed by the split.

        Raises:
            KeyError: If the file has no ``'Amyloid'`` column.
        """
        data = pd.read_csv(path, na_values='--')
        # Non-mutating dropna keeps the step idempotent when the cell is re-run.
        data = data.dropna()

        y = data['Amyloid']  # Target variable
        # Features are every column after the first.
        # NOTE(review): the positional slice presumably relies on 'Amyloid'
        # being the first column - confirm. The drop() is a no-op in that case
        # and keeps the label out of the features if the column sits elsewhere.
        X = data.iloc[:, 1:].drop(columns=['Amyloid'], errors='ignore')

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    @staticmethod
    def __Classifiers__(name=None):
        """Return a fresh, unfitted classifier for ``name``.

        Supported names: ``'decision_tree'``, ``'kneighbors'``,
        ``'logistic_regression'``, ``'svm_linear'``, ``'svm_poly'``,
        ``'svm_rbf'``.

        Raises:
            ValueError: If ``name`` is not one of the supported names
                (including the default ``None``).
        """
        # Seed for reproducibility
        random_state = 23

        # Factories are lambdas so that only the requested estimator is built.
        # The SVCs take the seed too: probability=True makes fit() run an
        # internal shuffled cross-validation, which is otherwise unseeded.
        factories = {
            'decision_tree': lambda: DecisionTreeClassifier(random_state=random_state),
            'kneighbors': lambda: KNeighborsClassifier(),
            'logistic_regression': lambda: LogisticRegression(
                random_state=random_state, solver='liblinear'),
            'svm_linear': lambda: SVC(
                kernel='linear', probability=True, random_state=random_state),
            'svm_poly': lambda: SVC(
                kernel='poly', degree=3, probability=True, random_state=random_state),
            'svm_rbf': lambda: SVC(
                kernel='rbf', probability=True, random_state=random_state),
        }
        if name not in factories:
            # Fail loudly instead of returning None, which would only surface
            # later as a confusing error inside VotingClassifier.
            raise ValueError(
                f"Unknown classifier name {name!r}; expected one of: "
                f"{', '.join(factories)}"
            )
        return factories[name]()

    def __VotingClassifier__(self):
        """Fit the soft-voting ensemble and print train and test accuracy.

        Returns:
            The fitted ``VotingClassifier``.

        Raises:
            RuntimeError: If ``load_data()`` has not been called yet.
        """
        splits = (self.x_train, self.x_test, self.y_train, self.y_test)
        if any(part is None for part in splits):
            raise RuntimeError(
                "No data loaded: call load_data() before __VotingClassifier__()."
            )

        # (label used inside the ensemble, key understood by __Classifiers__)
        members = [
            ('decision_tree', 'decision_tree'),
            ('knn', 'kneighbors'),
            ('logistic_regression', 'logistic_regression'),
            ('svm_linear', 'svm_linear'),
            ('svm_poly', 'svm_poly'),
            ('svm_rbf', 'svm_rbf'),
        ]

        # Soft voting averages the members' predicted class probabilities.
        vc = VotingClassifier(
            estimators=[(label, self.__Classifiers__(key)) for label, key in members],
            voting='soft',
        )

        # Fitting the vc model
        vc.fit(self.x_train, self.y_train)

        # Accuracy of the fitted ensemble on both splits
        y_pred_train = vc.predict(self.x_train)
        y_pred = vc.predict(self.x_test)

        print(f"Train accuracy: {accuracy_score(self.y_train, y_pred_train)}")
        print(f"Test accuracy: {accuracy_score(self.y_test, y_pred)}")

        return vc

if __name__ == "__main__":
    # Entry point when this cell/script is executed directly (not imported):
    # build the helper, load and split the data, then fit the voting ensemble,
    # which prints its train and test accuracy.
    ensemble = Ensemble()
    ensemble.load_data()
    ensemble.__VotingClassifier__()


Train accuracy: 0.9329021827000809
Test accuracy: 0.6828087167070218


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  # Import SVC for Support Vector Machine classifiers

# ----- Data -----
# '--' is parsed as a missing value; only complete rows are kept.
data = pd.read_csv('amyloid.csv', na_values='--').dropna()
y = data['Amyloid']     # Target variable
X = data.iloc[:, 1:]    # Features: every column after the first

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# ----- Base estimators -----
# Every SVC needs probability=True because soft voting calls predict_proba.
svm_options = dict(probability=True, random_state=1)

clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)
clf3 = RandomForestClassifier(random_state=1)
clf4 = SVC(kernel='linear', **svm_options)
clf5 = SVC(kernel='rbf', **svm_options)
clf6 = SVC(kernel='poly', degree=3, **svm_options)

# ----- Ensemble -----
# 'soft' voting combines the members' probability estimates.
named_estimators = [
    ('lr', clf1),
    ('dt', clf2),
    ('rf', clf3),
    ('svm_linear', clf4),
    ('svm_rbf', clf5),
    ('svm_poly', clf6),
]
ensemble = VotingClassifier(estimators=named_estimators, voting='soft')
ensemble.fit(X_train, y_train)

# ----- Evaluation on the held-out rows -----
voting_pred = ensemble.predict(X_test)

accuracy = accuracy_score(y_test, voting_pred)
precision, recall, f1 = (
    scorer(y_test, voting_pred, average='binary')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report each metric as a percentage with two decimals
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Accuracy: 72.12%
Precision: 71.01%
Recall: 73.62%
F1 Score: 72.29%


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  # Import SVC for Support Vector Machine classifiers
from sklearn.preprocessing import StandardScaler  # Import the scaler
from sklearn.pipeline import Pipeline  # Import Pipeline to handle scaling and model fitting
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier


# ----- Data -----
# '--' is parsed as a missing value; rows with any missing value are discarded.
data = pd.read_csv('amyloid.csv', na_values='--')
data = data.dropna()
y = data['Amyloid']     # Target variable
X = data.iloc[:, 1:]    # Features: every column after the first

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# ----- Base estimators -----
# Logistic regression is the only member wrapped in a scaling pipeline:
# features are standardised (mean 0, variance 1) before the model sees them.
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter=1000, random_state=1, solver='lbfgs', C=10)),
])

clf2 = DecisionTreeClassifier(random_state=1, min_samples_split=2, max_depth=None)
clf3 = RandomForestClassifier(
    n_estimators=200,
    max_features='log2',
    max_depth=None,
    criterion='entropy',
    random_state=1,
)
# probability=True is required on the SVCs so soft voting can call predict_proba.
clf4 = SVC(kernel='linear', random_state=1, probability=True)
clf5 = SVC(kernel='rbf', random_state=1, probability=True)
clf6 = SVC(kernel='poly', random_state=1, probability=True, gamma='scale', C=1)
clf7 = GradientBoostingClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, random_state=1)
clf8 = KNeighborsClassifier(n_neighbors=5)

# ----- Ensemble: soft voting over all eight members -----
base_estimators = [
    ('lr', lr_pipeline),
    ('dt', clf2),
    ('rf', clf3),
    ('svm_linear', clf4),
    ('svm_rbf', clf5),
    ('svm_poly', clf6),
    ('gb', clf7),
    ('knn', clf8),
]
ensemble = VotingClassifier(estimators=base_estimators, voting='soft')
ensemble.fit(X_train, y_train)

# ----- Evaluation on the held-out rows -----
voting_pred = ensemble.predict(X_test)

accuracy = accuracy_score(y_test, voting_pred)
binary_scores = [
    metric(y_test, voting_pred, average='binary')
    for metric in (precision_score, recall_score, f1_score)
]
precision, recall, f1 = binary_scores

# Report each metric as a percentage with two decimals
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Accuracy: 73.33%
Precision: 73.01%
Recall: 73.01%
F1 Score: 73.01%


In [1]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for each member of the voting ensemble.
#
# Prerequisite: this cell reuses lr_pipeline, clf2..clf8, X_train and y_train,
# which are created by the ensemble-definition cell. Run that cell first;
# on a fresh kernel this cell stops with
# "NameError: name 'lr_pipeline' is not defined".

# Logistic Regression Grid ('logistic__' addresses the pipeline step of that name)
lr_params = {
    'logistic__C': [0.1, 1, 10],
    'logistic__solver': ['lbfgs', 'liblinear']
}

# Decision Tree Grid
dt_params = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Random Forest Grid
# 'auto' is intentionally not searched: for RandomForestClassifier it was an
# alias of 'sqrt' and was removed in scikit-learn 1.3, where every fit using
# it fails.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2']
}

# SVC Linear Grid
svm_linear_params = {
    'C': [0.1, 1, 10]
}

# SVC RBF Grid
svm_rbf_params = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 1]
}

# SVC Poly Grid
svm_poly_params = {
    'C': [0.1, 1, 10],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto']
}

# Gradient Boosting Grid
gb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# KNN Grid
knn_params = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}

# The polynomial-SVC search was commented out in the original notebook
# (reason not recorded - presumably runtime; TODO confirm). Set to True to run it.
RUN_SVM_POLY_SEARCH = False


def run_grid_search(label, estimator, param_grid, X, y, cv=5, scoring='accuracy'):
    """Run an exhaustive grid search and print the winning parameters.

    Args:
        label: Human-readable model name used in the printed report.
        estimator: Unfitted estimator or pipeline to tune.
        param_grid: Mapping of parameter name to the list of values to try.
        X: Training features.
        y: Training labels.
        cv: Number of cross-validation folds.
        scoring: Metric used to rank parameter combinations.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    grid = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring)
    grid.fit(X, y)
    print(f'Best parameters for {label}: {grid.best_params_}')
    return grid


# One search per ensemble member; each fitted search stays available under
# its own name for later inspection.
lr_grid = run_grid_search('Logistic Regression', lr_pipeline, lr_params, X_train, y_train)
dt_grid = run_grid_search('Decision Tree', clf2, dt_params, X_train, y_train)
rf_grid = run_grid_search('Random Forest', clf3, rf_params, X_train, y_train)
svm_linear_grid = run_grid_search('SVM Linear', clf4, svm_linear_params, X_train, y_train)
svm_rbf_grid = run_grid_search('SVM RBF', clf5, svm_rbf_params, X_train, y_train)

if RUN_SVM_POLY_SEARCH:
    svm_poly_grid = run_grid_search('SVM Poly', clf6, svm_poly_params, X_train, y_train)

gb_grid = run_grid_search('Gradient Boosting', clf7, gb_params, X_train, y_train)
knn_grid = run_grid_search('KNN', clf8, knn_params, X_train, y_train)


NameError: name 'lr_pipeline' is not defined

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC  # Import SVC for Support Vector Machine classifiers
from sklearn.preprocessing import StandardScaler  # Import the scaler
from sklearn.pipeline import Pipeline  # Import Pipeline to handle scaling and model fitting
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier


# Load and preprocess the data ('--' is parsed as a missing value)
data = pd.read_csv('amyloid.csv', na_values='--')
# Non-mutating dropna keeps the cell idempotent when it is re-run.
data = data.dropna()  # Remove rows with missing values
y = data['Amyloid']  # Target variable
# Features are every column after the first.
# NOTE(review): the positional slice presumably relies on 'Amyloid' being the
# first column - confirm. The drop() is a no-op in that case and keeps the
# label out of the features if the column sits elsewhere.
X = data.iloc[:, 1:].drop(columns=['Amyloid'], errors='ignore')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Voting mode for this experiment: 'hard' = majority vote on predicted labels,
# 'soft' = average of predicted class probabilities.
VOTING = 'hard'
# Only soft voting calls predict_proba on the members. With hard voting the
# SVCs skip the extra internal cross-validation that probability=True adds to
# fit(); their predict() output does not depend on this flag.
SVC_PROBABILITY = VOTING == 'soft'

# Create a Pipeline for Logistic Regression with scaling
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # This will scale each feature to mean 0 and variance 1
    ('logistic', LogisticRegression(C=10, solver='lbfgs', random_state=1, max_iter=1000))  # Increased max_iter
])

# Set up the remaining members of the voting classifier
clf2 = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=1)
clf3 = RandomForestClassifier(criterion='entropy', max_depth=None, max_features='log2', n_estimators=200, random_state=1)
clf4 = SVC(kernel='linear', probability=SVC_PROBABILITY, random_state=1)  # Additional parameters could be tuned if needed
clf5 = SVC(kernel='rbf', probability=SVC_PROBABILITY, random_state=1)     # Additional parameters could be tuned if needed
clf6 = SVC(kernel='poly', C=1, gamma='scale', probability=SVC_PROBABILITY, random_state=1)
clf7 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=1)
clf8 = KNeighborsClassifier(n_neighbors=5)

# Build the VotingClassifier over all eight members
ensemble = VotingClassifier(estimators=[
    ('lr', lr_pipeline),
    ('dt', clf2),
    ('rf', clf3),
    ('svm_linear', clf4),
    ('svm_rbf', clf5),
    ('svm_poly', clf6),
    ('gb', clf7),
    ('knn', clf8)
], voting=VOTING)  # Hard (majority) voting in this cell
# Train the ensemble classifier
ensemble.fit(X_train, y_train)

# Predict on the test data
voting_pred = ensemble.predict(X_test)

# Evaluate the metrics
accuracy = accuracy_score(y_test, voting_pred)
precision = precision_score(y_test, voting_pred, average='binary')
recall = recall_score(y_test, voting_pred, average='binary')
f1 = f1_score(y_test, voting_pred, average='binary')

# Display the metrics as percentages
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Accuracy: 70.30%
Precision: 71.81%
Recall: 65.64%
F1 Score: 68.59%
