In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Using cached xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/124.9 MB 3.6 MB/s eta 0:00:35
   ---------------------------------------- 0.4/124.9 MB 5.4 MB/s eta 0:00:24
   ---------------------------------------- 0.8/124.9 MB 6.1 MB/s eta 0:00:21
   ---------------------------------------- 1.1/124.9 MB 6.2 MB/s eta 0:00:20
   ---------------------------------------- 1.3/124.9 MB 5.8 MB/s eta 0:00:22
   ---------------------------------------- 1.4/124.9 MB 5.5 MB/s eta 0:00:23
   ---------------------------------------- 1.4/124.9 MB 5.5 MB/s eta 0:00:23
   ---------------------------------------- 1.4/124.9 MB 5.5 MB/s eta 0:00:23
    --------------------------------------- 1.6/124.9 MB 3.9 MB/s eta 0:00:32


In [None]:
!pip install tensorflow

In [None]:
# Load heart.csv; the path is relative to the notebook's working directory.
data = pd.read_csv("heart.csv")

In [None]:
# Peek at the first five rows to check that the file parsed as expected.
print(data.head())

In [None]:
# Count the rows per value of the `target` column (class-balance check).
class_distribution = data['target'].value_counts()
print(class_distribution)

In [None]:
# Bar chart of the per-class row counts, drawn on an explicit Axes object.
fig, ax = plt.subplots()
class_distribution.plot(kind='bar', ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
import pandas as pd
import numpy as np

def generate_new_data(data, num_samples, drift_factor=0.1, random_state=42, target_col='target'):
    """Simulate a drifted batch by perturbing features and resampling rows.

    Every column except ``target_col`` is multiplied element-wise by
    ``1 + U(-drift_factor, drift_factor)``. The noise comes from NumPy's
    global random state, so call ``np.random.seed`` beforehand if the noise
    must be reproducible. Rows are then drawn with replacement.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows; the frame is copied and never modified.
    num_samples : int
        Number of rows to draw (with replacement) for the returned batch.
    drift_factor : float, default 0.1
        Maximum relative perturbation applied to each feature value.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample``. With the default, every call
        selects the same rows; pass a different value per call (or ``None``)
        to vary which rows are drawn.
    target_col : str, default 'target'
        Name of the label column, which is left unperturbed.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` perturbed rows, keeping their original index labels.
    """
    new_data = data.copy()
    for col in new_data.columns:
        if col == target_col:
            continue
        # Introduce drift by scaling each value with independent uniform noise
        noise = np.random.uniform(-drift_factor, drift_factor, size=new_data[col].shape)
        new_data[col] = new_data[col] * (1 + noise)
    return new_data.sample(n=num_samples, replace=True, random_state=random_state)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import numpy as np
import matplotlib.pyplot as plt

# Function to train an XGBoost model and make predictions
def train_xgb_model(X_train, y_train, X_test, params, num_boost_round=220):
    """Fit a booster on the training data and label the test rows.

    Returns a ``(booster, labels)`` pair, where ``labels`` is a plain list
    holding 1 for every test row whose predicted value exceeds 0.5 and 0
    otherwise.
    """
    booster = xgb.train(
        params,
        xgb.DMatrix(X_train, label=y_train),
        num_boost_round=num_boost_round,
    )
    scores = booster.predict(xgb.DMatrix(X_test))
    labels = [int(score > 0.5) for score in scores]
    return booster, labels

# Function to simulate new data
def generate_new_data(data, num_samples, drift_factor=0.1, random_state=42, target_col='target'):
    """Simulate a drifted batch by perturbing features and resampling rows.

    Every column except ``target_col`` is multiplied element-wise by
    ``1 + U(-drift_factor, drift_factor)``. The noise comes from NumPy's
    global random state, so call ``np.random.seed`` beforehand if the noise
    must be reproducible. Rows are then drawn with replacement.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows; the frame is copied and never modified.
    num_samples : int
        Number of rows to draw (with replacement) for the returned batch.
    drift_factor : float, default 0.1
        Maximum relative perturbation applied to each feature value.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample``. With the default, every call
        selects the same rows; pass a different value per call (or ``None``)
        to vary which rows are drawn.
    target_col : str, default 'target'
        Name of the label column, which is left unperturbed.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` perturbed rows, keeping their original index labels.
    """
    new_data = data.copy()
    for col in new_data.columns:
        if col == target_col:
            continue
        # Introduce drift by scaling each value with independent uniform noise
        noise = np.random.uniform(-drift_factor, drift_factor, size=new_data[col].shape)
        new_data[col] = new_data[col] * (1 + noise)
    return new_data.sample(n=num_samples, replace=True, random_state=random_state)

# Load the dataset
data = pd.read_csv("heart.csv")
# Features are every column except `target`; `target` is the label.
X = data.drop(columns=["target"])
y = data["target"]
# 80/20 split; the fixed seed keeps the held-out rows identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost parameters
params = {
    'objective': 'binary:logistic',  # predictions are thresholded at 0.5 in train_xgb_model
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss'
}

# Initial training
# SMOTE is fit on the training split only; X_test / y_test are never resampled.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
model, y_pred_binary = train_xgb_model(X_train_resampled, y_train_resampled, X_test, params)
# Baseline metrics; `accuracy` seeds the `accuracies` list used by the drift loop.
accuracy = accuracy_score(y_test, y_pred_binary)
print("Initial Accuracy:", accuracy)
print("Initial Classification Report:\n", classification_report(y_test, y_pred_binary))
print("Initial Confusion Matrix:\n", confusion_matrix(y_test, y_pred_binary))

# Monitoring for concept drift
num_iterations = 5
accuracies = [accuracy]

# Drifted batches are simulated from the TRAINING rows only. Sampling from the
# full `data` frame would copy lightly perturbed test rows into the training
# set and inflate every accuracy measured on X_test below.
train_data = X_train.assign(target=y_train)

for i in range(num_iterations):
    # Simulate new data (same batch size as before: one row per row of `data`)
    new_data = generate_new_data(train_data, num_samples=len(data), drift_factor=0.1)
    X_new = new_data.drop(columns=["target"])
    y_new = new_data["target"]

    # Retrain the model on the original training rows plus the drifted batch
    X_combined = pd.concat([X_train, X_new])
    y_combined = pd.concat([y_train, y_new])
    X_train_resampled, y_train_resampled = smote.fit_resample(X_combined, y_combined)
    model, y_pred_binary = train_xgb_model(X_train_resampled, y_train_resampled, X_test, params)

    # Evaluating the model on the untouched test split
    new_accuracy = accuracy_score(y_test, y_pred_binary)
    accuracies.append(new_accuracy)
    print(f"Iteration {i+1} - Accuracy:", new_accuracy)
    print(f"Iteration {i+1} - Classification Report:\n", classification_report(y_test, y_pred_binary))
    print(f"Iteration {i+1} - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_binary))

# Plot accuracy per iteration (position 0 is the initial model)
fig, ax = plt.subplots()
ax.plot(range(num_iterations + 1), accuracies, marker='o')
ax.set_title('Model Accuracy Over Time')
ax.set_xlabel('Iteration')
ax.set_ylabel('Accuracy')
plt.show()


## XGBoost with decision tree as base model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Reload the dataset so this section starts from the raw file.
data = pd.read_csv("heart.csv")

In [None]:
# Features are every column except `target`; `target` is the label.
X = data.drop(columns=["target"])
y = data["target"]

In [None]:
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Booster settings shared by every xgb.train call in this section.
params = {
    'objective': 'binary:logistic',
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss'
}

In [None]:
# Wrap both splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Candidate tree counts 150..249 (100 values); accuracy_list is filled by the search cell.
num_trees_list = list(range(150, 250))
accuracy_list = []

In [None]:
# Score every candidate tree count with 5-fold cross-validation on the
# TRAINING data only. Choosing the tree count by test-set accuracy would tune
# the model on the same rows later used to report its performance.
# A single xgb.cv run covers all candidates: it records the validation error
# after every boosting round, so there is no need to retrain one booster per
# candidate.
accuracy_list.clear()  # keeps the cell idempotent when it is re-run

cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=max(num_trees_list),
    nfold=5,
    stratified=True,
    metrics='error',  # misclassification rate at the same 0.5 threshold used below
    seed=42,
)

for num_trees in num_trees_list:
    # Row (num_trees - 1) holds the mean validation error after num_trees rounds
    accuracy = 1 - cv_results['test-error-mean'].iloc[num_trees - 1]
    accuracy_list.append(accuracy)
    print("Number of Trees:", num_trees, "- Accuracy:", accuracy)

In [None]:
# Position of the first maximum in accuracy_list (ties go to the smaller tree count)
best_position = max(range(len(accuracy_list)), key=accuracy_list.__getitem__)
best_num_trees = num_trees_list[best_position]
print("Best Number of Trees:", best_num_trees)

In [None]:
# Train the final model with the tree count selected by the search above
# rather than a hard-coded constant, so the two can never drift apart.
num_boost_round = best_num_trees
model = xgb.train(params, dtrain, num_boost_round)

In [None]:
# Predicted values for the test rows, then hard 0/1 labels at the 0.5 threshold
y_pred = model.predict(dtest)
y_pred_binary = [int(prob > 0.5) for prob in y_pred]

In [None]:
# Show the first five test rows: true label, predicted value, thresholded label
for sample_no in range(1, 6):
    pos = sample_no - 1
    print("Sample", sample_no,
          "- True:", y_test.iloc[pos],
          "- Predicted (Prob):", y_pred[pos],
          "- Predicted (Binary):", y_pred_binary[pos])


In [None]:
# Fraction of test rows whose thresholded prediction matches the true label.
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)

In [None]:
# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, y_pred_binary))

# 85% training and 15% testing data

In [None]:
# Re-split the same X / y, this time holding out 15% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# Same booster settings as the 80/20 experiment.
params = {
    'objective': 'binary:logistic',
    'eta': 0.1,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss'
}

In [None]:
# Rebuild the DMatrix objects from the new split.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Candidate tree counts 150..249 (100 values); accuracy_list is reset for this experiment.
num_trees_list = list(range(150, 250))
accuracy_list = []

In [None]:
# Score every candidate tree count with 5-fold cross-validation on the
# TRAINING data only. Choosing the tree count by test-set accuracy would tune
# the model on the same rows later used to report its performance.
# A single xgb.cv run covers all candidates: it records the validation error
# after every boosting round, so there is no need to retrain one booster per
# candidate.
accuracy_list.clear()  # keeps the cell idempotent when it is re-run

cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=max(num_trees_list),
    nfold=5,
    stratified=True,
    metrics='error',  # misclassification rate at the same 0.5 threshold used below
    seed=42,
)

for num_trees in num_trees_list:
    # Row (num_trees - 1) holds the mean validation error after num_trees rounds
    accuracy = 1 - cv_results['test-error-mean'].iloc[num_trees - 1]
    accuracy_list.append(accuracy)
    print("Number of Trees:", num_trees, "- Accuracy:", accuracy)

In [None]:
# Pick the tree count whose accuracy is highest; the first maximum wins on ties
best_idx = max(enumerate(accuracy_list), key=lambda pair: pair[1])[0]
best_num_trees = num_trees_list[best_idx]
print("Best Number of Trees:", best_num_trees)

In [None]:
# Train the final booster with the tree count selected by the search above.
num_boost_round = best_num_trees
model = xgb.train(params, dtrain, num_boost_round)

In [None]:
# Score the test rows and convert each value to a 0/1 label (1 when above 0.5)
y_pred = model.predict(dtest)
y_pred_binary = (y_pred > 0.5).astype(int).tolist()

In [None]:
# Print the first five test rows side by side with their predictions
for i in range(5):
    fields = (
        "Sample", i + 1,
        "- True:", y_test.iloc[i],
        "- Predicted (Prob):", y_pred[i],
        "- Predicted (Binary):", y_pred_binary[i],
    )
    print(*fields)


In [None]:
# Fraction of test rows whose thresholded prediction matches the true label.
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)

In [None]:
# Per-class precision / recall / F1 for the same predictions.
print(classification_report(y_test, y_pred_binary))

## Random forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Reload the dataset so this section starts from the raw file.
data = pd.read_csv("heart.csv")

In [None]:
# Features are every column except `target`; `target` is the label.
X = data.drop(columns=["target"])
y = data["target"]

In [None]:
# 80/20 split with a fixed seed. The SVM and Naive Bayes sections below reuse
# these X_train / X_test / y_train / y_test variables.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 100-tree forest; the seed makes the fitted trees reproducible.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Fit on the training split.
rf_classifier.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = rf_classifier.predict(X_test)

In [None]:
# Share of test rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))


## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# A linear SVM is sensitive to feature scale, so standardise the features
# inside a pipeline. The scaler is fit when `.fit` is called on training data,
# and the same transform is re-applied automatically inside `.predict`.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svm_classifier = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))

In [None]:
# Fit on the training split defined in the random-forest section above.
svm_classifier.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred_svm = svm_classifier.predict(X_test)

In [None]:

# Share of test rows predicted correctly.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

In [None]:
# Per-class precision / recall / F1.
print("SVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB


In [None]:
# Gaussian Naive Bayes with default settings.
nb_classifier = GaussianNB()

In [None]:
# Fit on the training split defined in the random-forest section above.
nb_classifier.fit(X_train, y_train)


In [None]:
# Hard class predictions for the held-out rows.
y_pred_nb = nb_classifier.predict(X_test)


In [None]:
# Share of test rows predicted correctly.
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Accuracy:", accuracy_nb)

In [None]:
# Per-class precision / recall / F1.
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))

## KMeans clustering


In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score
import pandas as pd

In [None]:
# Load the dataset under a new name; the KNN section below reuses `heart_data`.
heart_data = pd.read_csv("heart.csv")

In [None]:
# NOTE(review): this assignment is repeated verbatim in the next cell.
X = heart_data.drop('target', axis=1)

In [None]:
# Features are every column except `target`; `target` is the label.
X = heart_data.drop('target', axis=1)
y = heart_data['target']

In [None]:
# 80/20 split with a fixed seed; the labels are not used by KMeans itself.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Two clusters; the seed fixes the centroid initialisation.
kmeans = KMeans(n_clusters=2, random_state=42)


In [None]:
# Learn the centroids from the training features only.
kmeans.fit(X_train)

In [None]:
# Cluster ids of the training rows (stored by fit) and of the unseen test rows.
train_cluster_labels = kmeans.labels_
test_cluster_labels = kmeans.predict(X_test)


In [None]:
# Silhouette score of the training rows under their assigned clusters.
silhouette_score_train = silhouette_score(X_train, train_cluster_labels)
print("Silhouette Score on Training Data:", silhouette_score_train)


In [None]:
# Same measure for the test rows, using the clusters predicted above.
silhouette_score_test = silhouette_score(X_test, test_cluster_labels)
print("Silhouette Score on Test Data:", silhouette_score_test)

## K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# `heart_data` is the frame loaded in the KMeans section above.
X = heart_data.drop('target', axis=1)
y = heart_data['target']

In [None]:
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Candidate neighbour counts 1..20.
neighbors = np.arange(1, 21)

In [None]:
# Running best for the search cell below; 0 means "nothing evaluated yet".
best_accuracy = 0
best_k = 0


In [None]:
# Try every candidate neighbour count and remember the most accurate one.
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    # Strict '>' keeps the smallest k when several values tie
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

# After the loop `knn` / `y_pred` belong to the LAST candidate, not the best
# one. Refit with best_k so that the report and confusion-matrix cells that
# read `y_pred` describe the model that was actually selected.
if best_k:
    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

In [None]:
# Summary of the neighbour-count search.
print(f"Best Number of Neighbors: {best_k}, Best Accuracy: {best_accuracy}")

In [None]:
# Per-class precision / recall / F1 for the predictions currently held in `y_pred`.
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Confusion matrix for the same predictions (true labels as the first argument).
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

In [None]:
# Collects one accuracy per neighbour count in the loop below.
accuracy_scores = []


In [None]:
# Record and print the test accuracy for every candidate neighbour count
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    print(f"Number of Neighbors: {k}, Accuracy: {accuracy}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Load the heart.csv dataset
heart_data = pd.read_csv('heart.csv')

# Split the dataset into features and target variable
X = heart_data.drop("target", axis=1)
y = heart_data["target"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
# The scaler is fit on the training split only and then re-applied to the test
# split; from here on X_train / X_test hold the scaled values.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a deeper neural network model
# Four hidden ReLU layers (256 -> 128 -> 64 -> 32), each followed by 50%
# dropout, and a single sigmoid output unit.
deep_model = Sequential([
    Dense(256, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
deep_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Early stopping (and the restored "best" weights) must not be driven by the
# test set, or the reported test accuracy is optimistically biased. Carve the
# validation rows out of the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Train the model
history = deep_model.fit(X_fit, y_fit, epochs=100, batch_size=32,
                         validation_data=(X_val, y_val), verbose=1, callbacks=[early_stopping])

# Evaluate the model (evaluate returns [loss, accuracy]; only accuracy is kept)
accuracy = deep_model.evaluate(X_test, y_test)[1]
print(f'Deep Neural Network Accuracy: {accuracy * 100:.2f}%')

# Make predictions: sigmoid outputs first, then 0/1 labels at the 0.5 threshold
y_pred_probs = deep_model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

# Confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

# Plot the confusion matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Load the heart.csv dataset
heart_data = pd.read_csv('heart.csv')

# Split the dataset into features and target variable
X = heart_data.drop("target", axis=1)
y = heart_data["target"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features for Deep Learning
# Only the *_dl copies are scaled; the tree models below use the raw X_train / X_test.
scaler = StandardScaler()
X_train_dl = scaler.fit_transform(X_train)
X_test_dl = scaler.transform(X_test)

# Train Random Forest with optimized hyperparameters
# 3 x 3 x 3 = 27 combinations, each scored with 5-fold CV on the training split.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_

# Train XGBoost with optimized hyperparameters
# 3 x 3 x 3 = 27 combinations, each scored with 5-fold CV on the training split.
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001]
}
xgb_grid_search = GridSearchCV(XGBClassifier(random_state=42), xgb_param_grid, cv=5)
xgb_grid_search.fit(X_train, y_train)
best_xgb_model = xgb_grid_search.best_estimator_

# Train Deep Learning model with optimized architecture
from sklearn.model_selection import StratifiedKFold, cross_val_predict


def build_deep_model(input_dim):
    """Return a new compiled 512-256-128 ReLU network with 50% dropout and a sigmoid output."""
    network = Sequential([
        Dense(512, input_dim=input_dim, activation='relu'),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network


def fit_deep_model(features, labels):
    """Train a new network; early stopping watches a split taken from `features` itself.

    `features` and `labels` are NumPy arrays. 20% of the rows are held out
    (stratified) as the validation set that drives early stopping, so no
    test rows are involved in training.
    """
    fit_X, val_X, fit_y, val_y = train_test_split(
        features, labels, test_size=0.2, random_state=42, stratify=labels
    )
    stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    network = build_deep_model(features.shape[1])
    network.fit(fit_X, fit_y, epochs=100, batch_size=32,
                validation_data=(val_X, val_y), verbose=0, callbacks=[stopper])
    return network


y_train_values = y_train.to_numpy()
deep_model = fit_deep_model(X_train_dl, y_train_values)

# Meta-features for TRAINING the meta-model are out-of-fold predictions on the
# training rows: every row is scored by base models that never saw it. Fitting
# the meta-model on test-set predictions and y_test, then scoring it on those
# same rows, would report training accuracy, not generalisation.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_oof = cross_val_predict(best_rf_model, X_train, y_train, cv=folds, method='predict_proba')[:, 1]
xgb_oof = cross_val_predict(best_xgb_model, X_train, y_train, cv=folds, method='predict_proba')[:, 1]
dl_oof = np.zeros(len(y_train_values))
for fit_idx, holdout_idx in folds.split(X_train_dl, y_train_values):
    fold_model = fit_deep_model(X_train_dl[fit_idx], y_train_values[fit_idx])
    dl_oof[holdout_idx] = fold_model.predict(X_train_dl[holdout_idx], verbose=0).flatten()
meta_train_features = np.column_stack((rf_oof, xgb_oof, dl_oof))

# Predictions from individual models (fit on the full training split) on the test rows
rf_pred = best_rf_model.predict_proba(X_test)[:, 1]  # Using probabilities
xgb_pred = best_xgb_model.predict_proba(X_test)[:, 1]  # Using probabilities
dl_pred = deep_model.predict(X_test_dl).flatten()  # Using outputs

# Stack predictions horizontally to create meta-features
stacked_predictions = np.column_stack((rf_pred, xgb_pred, dl_pred))

# Train a meta-model (Logistic Regression in this case) on training-set labels only
meta_model = LogisticRegression()
meta_model.fit(meta_train_features, y_train_values)

# Make predictions using the meta-model
meta_pred = meta_model.predict(stacked_predictions)

# Evaluate the hybrid model
accuracy = accuracy_score(y_test, meta_pred)
print(f'Hybrid Model Accuracy: {accuracy * 100:.2f}%')

# Print confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, meta_pred)
class_report = classification_report(y_test, meta_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Load your dataset
heart_data = pd.read_csv('heart.csv')

# Split dataset into features and target
X = heart_data.drop("target", axis=1)
y = heart_data["target"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (if needed)
# The scaler is fit on the training split only; X_train / X_test are overwritten
# with their scaled versions, so all three base models below see scaled inputs.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 1. Define and train Random Forest model
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# 2. Define and train XGBoost model
xgb_model = XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# 3. Define and train Deep Neural Network model
def create_deep_model(input_dim=None):
    """Build and compile a new 256-128-64 ReLU network with a sigmoid output.

    Each hidden layer is followed by 50% dropout. The model is compiled with
    the Adam optimizer and binary cross-entropy loss, tracking accuracy.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, it is read from the global
        ``X_train`` so existing no-argument calls keep working.

    Returns
    -------
    A compiled, untrained Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = X_train.shape[1]
    model = Sequential([
        Dense(256, input_dim=input_dim, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

dl_model = create_deep_model()
dl_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# 4. Combine models into an ensemble
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# The meta-classifier must be trained on TRAINING rows only. Its inputs are
# out-of-fold probabilities: every training row is scored by base models that
# were fit without it. Fitting on test-set predictions and y_test would make
# the accuracy reported below a training score.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_train_values = y_train.to_numpy()

rf_oof = cross_val_predict(rf_model, X_train, y_train_values, cv=folds, method='predict_proba')[:, 1]
xgb_oof = cross_val_predict(xgb_model, X_train, y_train_values, cv=folds, method='predict_proba')[:, 1]
dl_oof = np.zeros(len(y_train_values))
for fit_idx, holdout_idx in folds.split(X_train, y_train_values):
    fold_model = create_deep_model()
    fold_model.fit(X_train[fit_idx], y_train_values[fit_idx], epochs=50, batch_size=32, verbose=0)
    dl_oof[holdout_idx] = fold_model.predict(X_train[holdout_idx], verbose=0).reshape(-1)

# Stack out-of-fold predictions together (as new features)
ensemble_X = pd.DataFrame({
    'RandomForest': rf_oof,
    'XGBoost': xgb_oof,
    'DeepLearning': dl_oof
})

# Train a meta-classifier (e.g., Logistic Regression) on the stacked predictions
meta_classifier = LogisticRegression()
meta_classifier.fit(ensemble_X, y_train_values)

# 5. Evaluate the ensemble model
# Test-time meta-features are probabilities, the same kind of input the
# meta-classifier was trained on (not hard 0/1 labels).
rf_preds = rf_model.predict_proba(X_test)[:, 1]
xgb_preds = xgb_model.predict_proba(X_test)[:, 1]
dl_preds = dl_model.predict(X_test)

# Combine test predictions into ensemble input
ensemble_X_test = pd.DataFrame({
    'RandomForest': rf_preds,
    'XGBoost': xgb_preds,
    'DeepLearning': dl_preds.reshape(-1)
})

# Predict with meta-classifier
ensemble_preds = meta_classifier.predict(ensemble_X_test)

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
print(f'Ensemble Model Accuracy: {ensemble_accuracy * 100:.2f}%')

# Print confusion matrix and classification report
conf_matrix = confusion_matrix(y_test, ensemble_preds)
class_report = classification_report(y_test, ensemble_preds)

print("Confusion Matrix (Ensemble):\n", conf_matrix)
print("\nClassification Report (Ensemble):\n", class_report)
