# SVM Classifier on Iris Dataset

Q21. Write a Python program to train an SVM Classifier on the Iris dataset and evaluate accuracy:

In [None]:
# Import libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Fetch the Iris measurements together with their class labels.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit a linear-kernel support vector classifier (fit returns the estimator).
clf = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Compare the held-out predictions with the true labels.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Iris SVM Classifier Accuracy: {accuracy:.2f}')


Q22 Write a Python program to train two SVM classifiers with Linear and RBF kernels on the Wine dataset, then
compare their accuracies:

In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the Wine data as a feature matrix and a label vector.
wine = load_wine()
X = wine.data
y = wine.target

# Hold out a fifth of the samples; both kernels are scored on the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear-kernel model and score it on the held-out set.
linear_clf = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
linear_pred = linear_clf.predict(X_test)
accuracy_linear = accuracy_score(y_test, linear_pred)

# Same procedure with the RBF kernel.
rbf_clf = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)
rbf_pred = rbf_clf.predict(X_test)
accuracy_rbf = accuracy_score(y_test, rbf_pred)

# Report both accuracies side by side.
for label, value in (("Linear", accuracy_linear), ("RBF", accuracy_rbf)):
    print(f"SVM with {label} kernel Accuracy: {value:.2f}")


Q23: Write a Python program to train an SVM Regressor (SVR) on a housing dataset and evaluate it using Mean
Squared Error (MSE):

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Imported here because the cell's import list does not provide them.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the California housing dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVR.
# Support vector machines are not scale invariant (see the scikit-learn SVM
# user guide), so the features are standardised first. Putting the scaler in a
# pipeline means it is fitted on the training split only and then re-applied,
# unchanged, to the test split at prediction time.
svr = make_pipeline(StandardScaler(), SVR())
svr.fit(X_train, y_train)

# Predict and calculate MSE
y_pred = svr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'SVR Mean Squared Error: {mse:.2f}')



Q24 Write a Python program to train an SVM Classifier with a Polynomial Kernel and visualize the decision
boundary:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load Iris and keep only the first two columns so the data can be drawn in 2D.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Drop class 2 so only classes 0 and 1 remain (binary problem for the visual).
keep = y != 2
X = X[keep]
y = y[keep]

# Default train/test proportions, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit a degree-3 polynomial-kernel SVM on the training portion.
clf = SVC(kernel='poly', degree=3, C=1).fit(X_train, y_train)

# Function to plot decision boundary
def plot_decision_boundary(X, y, clf):
    """Draw the regions predicted by ``clf`` with the samples overlaid.

    Args:
        X: 2-column feature array; column 0 is drawn on the x-axis and
            column 1 on the y-axis.
        y: Labels used to colour the scattered samples.
        clf: Fitted classifier whose ``predict`` accepts 2-column input.

    The axis labels are read from the module-level ``iris`` object.
    """
    # Extend the plotting window one unit beyond the data on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    # Classify every point of a dense grid, then reshape the flat predictions
    # back to the grid so they can be drawn as filled contours.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', s=30)
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])
    plt.title('SVM with Polynomial Kernel')
    plt.show()


plot_decision_boundary(X, y, clf)


Q25 Write a Python program to train a Gaussian Naïve Bayes classifier on the Breast Cancer dataset and
evaluate accuracy:

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Pull the Breast Cancer features and their target labels.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# Keep 20% of the samples aside for testing (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit Gaussian Naive Bayes and predict the held-out samples.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report the fraction of correct predictions.
accuracy = accuracy_score(y_test, y_pred)
print(f'Gaussian NB Accuracy on Breast Cancer: {accuracy:.2f}')


Q26: Write a Python program to train a Multinomial Naïve Bayes classifier for text classification using the 20
Newsgroups dataset.

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fetch dataset
data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
X, y = data.data, data.target

# Split the raw documents BEFORE vectorising, so that the vocabulary and IDF
# weights are learned from the training documents only. Fitting the vectoriser
# on the whole corpus would let test-set statistics leak into the features.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Vectorize: fit on the training text, then re-use the fitted vocabulary for the test text.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(f'MultinomialNB Accuracy on 20 Newsgroups: {accuracy_score(y_test, y_pred):.2f}')
print(classification_report(y_test, y_pred, target_names=data.target_names))


27. Write a Python program to train an SVM Classifier with different C values and compare the decision
boundaries visually:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# Load the Iris data, keeping only the first two features so the decision
# regions can be drawn in a 2D plane.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Fit one linear-kernel SVM per value of the regularisation parameter C.
C_values = [0.1, 1, 100]
models = [svm.SVC(kernel='linear', C=C).fit(X, y) for C in C_values]

# Build one dense grid covering the data plus a one-unit margin; every model is
# evaluated on this same grid so the panels are directly comparable.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# One subplot per C value.
plt.figure(figsize=(15, 5))
for i, (model, C) in enumerate(zip(models, C_values)):
    plt.subplot(1, len(C_values), i + 1)
    Z = model.predict(grid_points).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
    plt.title(f'SVM with C={C}')

# Show the figure once, after every panel has been drawn.
plt.show()

28. Write a Python program to train a Bernoulli Naïve Bayes classifier for binary classification on a dataset with
binary features:

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Create a synthetic two-class dataset with continuous features.
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, n_classes=2,
                           random_state=42)

# Convert to binary features: 1 where the value is positive, 0 otherwise.
X = (X > 0).astype(int)

# Split data (seeded so the reported accuracy is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train and evaluate
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

30. Write a Python program to apply feature scaling before training an SVM model and compare results with
unscaled data?

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load data
data = load_breast_cancer()
X, y = data.data, data.target

# Seed the split so the scaled/unscaled comparison gives the same numbers on
# every run; both models are trained and scored on the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Without scaling: the RBF SVM sees the raw feature values.
svm_unscaled = SVC(kernel='rbf')
svm_unscaled.fit(X_train, y_train)
print(f"Unscaled Accuracy: {svm_unscaled.score(X_test, y_test):.4f}")

# With scaling: the pipeline fits the scaler on the training data only and
# applies the same transformation to the test data when scoring.
svm_scaled = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
svm_scaled.fit(X_train, y_train)
print(f"Scaled Accuracy: {svm_scaled.score(X_test, y_test):.4f}")

31.Write a Python program to train an SVM Classifier and use GridSearchCV to tune the hyperparameters (C,
gamma, kernel)?

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# NOTE(review): the prompt for this cell asks for SVM hyperparameter tuning
# with GridSearchCV (C, gamma, kernel), but the cell previously compared
# GaussianNB with and without variance smoothing. It now does what the prompt
# asks. The imports live here because the cell's import list lacks them.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Create dataset
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate values for each hyperparameter; every combination is tried.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.01, 0.1, 1],
    'kernel': ['linear', 'rbf'],
}

# 5-fold cross-validated search on the training split only; the test split is
# kept aside for a final check of the selected model.
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# With the default refit=True, score() uses the best estimator refitted on the
# whole training split.
print(f"Test accuracy: {grid_search.score(X_test, y_test):.4f}")

32. Write a Python program to train an SVM Classifier on an imbalanced dataset, apply class weighting, and
check whether it improves accuracy.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.datasets import make_imbalance
from collections import Counter

# Load the iris dataset and make it imbalanced
iris = datasets.load_iris()
X = iris.data[:, :2]  # Use first two features for visualization
y = iris.target

# Create an imbalanced version (class 0 will be minority).
# The resampling is applied to the 2-feature X, not to the full iris.data:
# the decision-boundary plot later in this cell predicts on a 2D grid, so the
# models must be trained on exactly two features. The seed makes the retained
# samples reproducible.
X, y = make_imbalance(
    X, y, sampling_strategy={0: 10, 1: 50, 2: 50}, random_state=42
)

print("Class distribution:", Counter(y))

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train SVM without class weighting
svm_unweighted = svm.SVC(kernel='linear', random_state=42)
svm_unweighted.fit(X_train, y_train)
y_pred_unweighted = svm_unweighted.predict(X_test)

# Train SVM with class weighting (balanced)
svm_weighted = svm.SVC(kernel='linear', class_weight='balanced', random_state=42)
svm_weighted.fit(X_train, y_train)
y_pred_weighted = svm_weighted.predict(X_test)

# Calculate accuracies
acc_unweighted = accuracy_score(y_test, y_pred_unweighted)
acc_weighted = accuracy_score(y_test, y_pred_weighted)

print("\nUnweighted SVM Accuracy: {:.2f}%".format(acc_unweighted * 100))
print("Weighted SVM Accuracy: {:.2f}%".format(acc_weighted * 100))

# More detailed evaluation (per-class precision/recall shows the minority class)
print("\nUnweighted SVM Classification Report:")
print(classification_report(y_test, y_pred_unweighted))

print("\nWeighted SVM Classification Report:")
print(classification_report(y_test, y_pred_weighted))

# Plot decision boundaries
def plot_decision_boundary(clf, X, y, title):
    """Draw the regions predicted by ``clf`` on the current axes.

    Args:
        clf: Fitted classifier whose ``predict`` accepts 2-column input.
        X: 2-column feature array; column 0 is drawn on the x-axis and
            column 1 on the y-axis.
        y: Labels used to colour the scattered samples.
        title: Title for the current axes.

    The axis labels are read from the module-level ``iris`` object. The
    function does not call ``plt.show()``, so it can be used inside subplots.
    """
    # Extend the plotting window one unit beyond the data on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))

    # Classify every grid point, then restore the grid shape for contouring.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
    plt.title(title)
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])


# Two panels: unweighted model on the left, class-weighted model on the right.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plot_decision_boundary(svm_unweighted, X_train, y_train, "Unweighted SVM")

plt.subplot(1, 2, 2)
plot_decision_boundary(svm_weighted, X_train, y_train, "Weighted SVM (Balanced)")
plt.tight_layout()
plt.show()

33. Write a Python program to implement a Naïve Bayes classifier for spam detection using email data?

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset (replace with your email dataset)
# Sample dataset format: two columns - 'text' and 'label' (0=ham, 1=spam)
data = pd.read_csv('emails.csv')  # Replace with your dataset path

# If using the built-in dataset from sklearn (alternative)
# from sklearn.datasets import fetch_20newsgroups
# categories = ['alt.atheism', 'soc.religion.christian']
# data = fetch_20newsgroups(subset='train', categories=categories)
# df = pd.DataFrame({'text': data.data, 'label': data.target})

# Preprocessing: lowercase, then strip every character that is neither a word
# character nor whitespace. The pattern is a raw string and regex=True is
# passed explicitly: without it, pandas >= 2.0 treats the pattern as a literal
# substring and no punctuation is removed.
data['text'] = data['text'].str.lower()
data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

# Vectorize the text data (convert to numerical features).
# The vocabulary is fitted on the training text only and re-used for the test text.
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naïve Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vec, y_train)

# Make predictions
y_pred = nb_classifier.predict(X_test_vec)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix visualization
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix for Spam Detection')
plt.show()

# Example of predicting new emails
new_emails = [
    "Congratulations! You've won a $1000 prize! Click here to claim!",
    "Hi John, just checking in about our meeting tomorrow",
    "Your account has been compromised. Verify your details now!",
]

new_emails_vec = vectorizer.transform(new_emails)
predictions = nb_classifier.predict(new_emails_vec)

print("\nSample Predictions:")
for email, pred in zip(new_emails, predictions):
    # Only the first 50 characters of each email are echoed.
    print(f"\nEmail: {email[:50]}...")
    print("Prediction:", "Spam" if pred == 1 else "Ham")

34. Write a Python program to train an SVM Classifier and a Naïve Bayes Classifier on the same dataset and
compare their accuracy?

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt

# Load dataset (using Breast Cancer Wisconsin dataset as example)
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize classifiers
svm_classifier = SVC(kernel='linear', random_state=42)
nb_classifier = GaussianNB()

# Train classifiers
print("Training SVM Classifier...")
svm_classifier.fit(X_train, y_train)

print("Training Naïve Bayes Classifier...")
nb_classifier.fit(X_train, y_train)

# Make predictions
svm_pred = svm_classifier.predict(X_test)
nb_pred = nb_classifier.predict(X_test)

# Evaluate accuracy
svm_accuracy = accuracy_score(y_test, svm_pred)
nb_accuracy = accuracy_score(y_test, nb_pred)

# Print results
print("\nClassifier Comparison:")
print(f"SVM Accuracy: {svm_accuracy:.4f}")
print(f"Naïve Bayes Accuracy: {nb_accuracy:.4f}")

# Detailed classification reports
print("\nSVM Classification Report:")
print(classification_report(y_test, svm_pred, target_names=target_names))

print("\nNaïve Bayes Classification Report:")
print(classification_report(y_test, nb_pred, target_names=target_names))

# Visual comparison
plt.figure(figsize=(10, 5))
plt.bar(['SVM', 'Naïve Bayes'], [svm_accuracy, nb_accuracy], color=['blue', 'orange'])
plt.ylim(0.8, 1.0)
plt.ylabel('Accuracy')
plt.title('Classifier Accuracy Comparison')
# Write each accuracy just above its bar.
for i, v in enumerate([svm_accuracy, nb_accuracy]):
    plt.text(i, v + 0.01, f"{v:.4f}", ha='center')
# Show once, after all annotations are in place.
plt.show()

# Feature importance analysis (for SVM): list the five coefficients with the
# largest absolute value, largest first. Guarded by hasattr so the section is
# skipped if the classifier exposes no coef_ attribute.
if hasattr(svm_classifier, 'coef_'):
    print("\nTop 5 Important Features (SVM):")
    coef = svm_classifier.coef_[0]
    top_features = np.argsort(np.abs(coef))[-5:][::-1]
    for i in top_features:
        print(f"{feature_names[i]}: {coef[i]:.4f}")

35. Write a Python program to perform feature selection before training a Naïve Bayes classifier and compare
results?

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Function to train and evaluate Naïve Bayes
def evaluate_nb(X_train, X_test, y_train, y_test, title):
    """Fit GaussianNB on the training data and report test performance.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        title: Label printed in the results header.

    Returns:
        Tuple ``(accuracy, nb)``: the test accuracy and the fitted model.
    """
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{title} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    return accuracy, nb


# 1. Train without feature selection
print("=== Naïve Bayes without Feature Selection ===")
acc_full, nb_full = evaluate_nb(X_train, X_test, y_train, y_test, "All Features")

# 2. Perform feature selection using ANOVA F-value.
# The selector is fitted on the training split only and then applied to the
# test split, so the test data does not influence which features are kept.
print("\n=== Performing Feature Selection ===")
selector = SelectKBest(score_func=f_classif, k=5)  # Select top 5 features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Get selected feature names and scores (boolean mask from the selector)
selected_mask = selector.get_support()
selected_features = feature_names[selected_mask]
feature_scores = selector.scores_[selected_mask]

print("\nSelected Features:")
for feature, score in zip(selected_features, feature_scores):
    print(f"{feature}: {score:.2f}")

# 3. Train with selected features
print("\n=== Naïve Bayes with Feature Selection ===")
acc_selected, nb_selected = evaluate_nb(
    X_train_selected, X_test_selected, y_train, y_test, "Selected Features"
)

# Visual comparison
plt.figure(figsize=(10, 5))
plt.bar(['All Features', 'Selected Features'], [acc_full, acc_selected], color=['blue', 'green'])
plt.ylim(0.8, 1.0)
plt.ylabel('Accuracy')
plt.title('Naïve Bayes Performance: With vs Without Feature Selection')
# Write each accuracy just above its bar, then show the figure once.
for i, v in enumerate([acc_full, acc_selected]):
    plt.text(i, v + 0.01, f"{v:.4f}", ha='center')
plt.show()

# Feature importance visualization
plt.figure(figsize=(12, 6))
plt.barh(range(len(selected_features)), feature_scores, color='purple')
plt.yticks(range(len(selected_features)), selected_features)
plt.xlabel('ANOVA F-value Score')
plt.title('Top Selected Features and Their Importance Scores')
plt.tight_layout()
plt.show()

36 Write a Python program to train an SVM Classifier using One-vs-Rest (OvR) and One-vs-One (OvO)
strategies on the Wine dataset and compare their accuracy?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the Wine dataset
wine = datasets.load_wine()
X = wine.data
y = wine.target
feature_names = wine.feature_names
target_names = wine.target_names

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocess the data: the scaler is fitted on the training split only and the
# same transformation is then applied to the test split, so no test-set
# statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize SVM with linear kernel
svm = SVC(kernel='linear', random_state=42)

# Create OvR and OvO classifiers around the same base estimator
ovr_classifier = OneVsRestClassifier(svm)
ovo_classifier = OneVsOneClassifier(svm)

# Train the classifiers
print("Training One-vs-Rest SVM...")
ovr_classifier.fit(X_train, y_train)

print("Training One-vs-One SVM...")
ovo_classifier.fit(X_train, y_train)

# Make predictions
ovr_pred = ovr_classifier.predict(X_test)
ovo_pred = ovo_classifier.predict(X_test)

# Calculate accuracies
ovr_accuracy = accuracy_score(y_test, ovr_pred)
ovo_accuracy = accuracy_score(y_test, ovo_pred)

# Print results
print("\nClassifier Comparison:")
print(f"One-vs-Rest Accuracy: {ovr_accuracy:.4f}")
print(f"One-vs-One Accuracy: {ovo_accuracy:.4f}")

# Detailed classification reports
print("\nOne-vs-Rest Classification Report:")
print(classification_report(y_test, ovr_pred, target_names=target_names))

print("\nOne-vs-One Classification Report:")
print(classification_report(y_test, ovo_pred, target_names=target_names))

# Visual comparison
plt.figure(figsize=(10, 5))
plt.bar(['One-vs-Rest', 'One-vs-One'], [ovr_accuracy, ovo_accuracy], color=['blue', 'orange'])
plt.ylim(0.7, 1.0)
plt.ylabel('Accuracy')
plt.title('SVM Multiclass Strategies Accuracy Comparison')
# Write each accuracy just above its bar.
for i, v in enumerate([ovr_accuracy, ovo_accuracy]):
    plt.text(i, v + 0.01, f"{v:.4f}", ha='center')
# Show once, after the annotation loop (not once per bar).
plt.show()

# Number of classifiers created (printed once, outside the loop)
print(f"\nNumber of classifiers created:")
print(f"OvR: {len(ovr_classifier.estimators_)} (equal to number of classes)")
print(f"OvO: {len(ovo_classifier.estimators_)} (n_classes * (n_classes - 1) / 2)")

37.Write a Python program to train an SVM Classifier using Linear, Polynomial, and RBF kernels on the Breast
Cancer dataset and compare their accuracy?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the Breast Cancer dataset
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target
feature_names = cancer.feature_names
target_names = cancer.target_names

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocess the data: fit the scaler on the training split only, then apply
# the same transformation to the test split (avoids test-set leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize SVM classifiers with different kernels
svm_linear = SVC(kernel='linear', random_state=42)
svm_poly = SVC(kernel='poly', degree=3, random_state=42)  # 3rd degree polynomial
svm_rbf = SVC(kernel='rbf', random_state=42)  # Radial Basis Function

# Train the classifiers
print("Training Linear Kernel SVM...")
svm_linear.fit(X_train, y_train)

print("Training Polynomial Kernel SVM...")
svm_poly.fit(X_train, y_train)

print("Training RBF Kernel SVM...")
svm_rbf.fit(X_train, y_train)

# Make predictions
linear_pred = svm_linear.predict(X_test)
poly_pred = svm_poly.predict(X_test)
rbf_pred = svm_rbf.predict(X_test)

# Calculate accuracies
linear_acc = accuracy_score(y_test, linear_pred)
poly_acc = accuracy_score(y_test, poly_pred)
rbf_acc = accuracy_score(y_test, rbf_pred)

# Print results
print("\nKernel Comparison:")
print(f"Linear Kernel Accuracy: {linear_acc:.4f}")
print(f"Polynomial Kernel Accuracy: {poly_acc:.4f}")
print(f"RBF Kernel Accuracy: {rbf_acc:.4f}")

# Detailed classification reports
print("\nLinear Kernel Classification Report:")
print(classification_report(y_test, linear_pred, target_names=target_names))

print("\nPolynomial Kernel Classification Report:")
print(classification_report(y_test, poly_pred, target_names=target_names))

print("\nRBF Kernel Classification Report:")
print(classification_report(y_test, rbf_pred, target_names=target_names))

# Visual comparison
plt.figure(figsize=(10, 6))
kernels = ['Linear', 'Polynomial', 'RBF']
accuracies = [linear_acc, poly_acc, rbf_acc]
colors = ['blue', 'green', 'red']

bars = plt.bar(kernels, accuracies, color=colors)
plt.ylim(0.85, 1.0)
plt.ylabel('Accuracy')
plt.title('SVM Kernel Performance Comparison on Breast Cancer Dataset')

# Add accuracy values on top of bars (centred horizontally on each bar)
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height,
             f'{height:.4f}',
             ha='center', va='bottom')

plt.show()

# Feature importance for linear kernel (coefficients): the five coefficients
# with the largest absolute value, largest first.
if hasattr(svm_linear, 'coef_'):
    print("\nTop 5 Important Features (Linear Kernel):")
    coef = svm_linear.coef_[0]
    top_features = np.argsort(np.abs(coef))[-5:][::-1]
    for i in top_features:
        print(f"{feature_names[i]}: {coef[i]:.4f}")

38 Write a Python program to train an SVM Classifier using Stratified K-Fold Cross-Validation and compute the
average accuracy?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the Breast Cancer dataset
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

# Initialize SVM classifier (refitted from scratch on every fold)
svm = SVC(kernel='rbf', random_state=42)

# Set up Stratified K-Fold Cross-Validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Store accuracy scores
accuracies = []

print(f"Performing {n_splits}-Fold Stratified Cross-Validation...\n")

# Perform cross-validation
for fold_num, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    y_train, y_test = y[train_index], y[test_index]

    # Standardize inside the fold: the scaler is fitted on this fold's training
    # part only, so the held-out part never influences the scaling statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X[train_index])
    X_test = scaler.transform(X[test_index])

    # Train SVM
    svm.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    y_pred = svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)

    print(f"Fold {fold_num} Accuracy: {acc:.4f}")

# Calculate average accuracy
avg_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)

print("\nCross-Validation Results:")
print(f"Average Accuracy: {avg_accuracy:.4f}")
print(f"Standard Deviation: {std_accuracy:.4f}")

# Visualize fold accuracies
plt.figure(figsize=(10, 5))
plt.bar(range(1, n_splits + 1), accuracies, color='skyblue')
plt.axhline(y=avg_accuracy, color='r', linestyle='--', label=f'Average Accuracy: {avg_accuracy:.4f}')
plt.xlabel('Fold Number')
plt.ylabel('Accuracy')
plt.title('SVM Classifier Accuracy per Fold (Stratified K-Fold CV)')
plt.ylim(0.9, 1.0)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)

# Add accuracy values on top of bars (bars are positioned at 1..n_splits)
for i, acc in enumerate(accuracies):
    plt.text(i + 1, acc + 0.005, f"{acc:.4f}", ha='center')

plt.show()

39 Write a Python program to train a Naïve Bayes classifier using different prior probabilities and compare
performance?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target
class_names = data.target_names
class_counts = np.bincount(y)  # samples per class over the full dataset

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define different priors to test.
# NOTE: the "empirical" priors are the class frequencies of the full dataset,
# while priors=None lets GaussianNB estimate them from the training split
# only, so these two settings can differ slightly.
priors_list = [
    None,  # Let the model estimate from data
    [0.5, 0.5],  # Equal priors
    [0.3, 0.7],  # Favor class 1
    [0.7, 0.3],  # Favor class 0
    [class_counts[0] / len(y), class_counts[1] / len(y)],  # Empirical priors
]

# Human-readable label for each entry of priors_list (same order)
prior_names = [
    "Estimated from data",
    "Equal [0.5, 0.5]",
    "Favor class 1 [0.3, 0.7]",
    "Favor class 0 [0.7, 0.3]",
    "Empirical priors",
]

# Store results (one dict per prior setting)
results = []

# Train and evaluate with different priors
for prior, name in zip(priors_list, prior_names):
    # Create and train model
    nb = GaussianNB(priors=prior)
    nb.fit(X_train, y_train)

    # Make predictions; column 1 of predict_proba is the score for class 1
    y_pred = nb.predict(X_test)
    y_proba = nb.predict_proba(X_test)[:, 1]

    # Calculate metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)

    # Store results
    results.append({
        'prior': name,
        'accuracy': acc,
        'f1_score': f1,
        'roc_auc': roc_auc,
        'actual_priors': nb.class_prior_,  # The actual priors used
    })

    print(f"\nResults for {name}:")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"Actual priors used: {np.round(nb.class_prior_, 4)}")

# Convert results to DataFrame for better visualization
import pandas as pd  # only needed for the summary table and the plot below

results_df = pd.DataFrame(results)
print("\nSummary Table:")
print(results_df[['prior', 'accuracy', 'f1_score', 'roc_auc']])

# Plot comparison: one group of three bars (one per metric) per prior setting
metrics = ['accuracy', 'f1_score', 'roc_auc']
x = np.arange(len(prior_names))
width = 0.25

plt.figure(figsize=(12, 6))
for i, metric in enumerate(metrics):
    plt.bar(x + i * width, results_df[metric], width, label=metric)

# Axis decoration happens once, after all bars are drawn
plt.xlabel('Prior Probabilities')
plt.ylabel('Score')
plt.title('Naïve Bayes Performance with Different Priors')
plt.xticks(x + width, prior_names, rotation=15, ha='right')  # centre tick under the middle bar
plt.ylim(0.8, 1.0)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Show actual class distribution
plt.figure(figsize=(6, 4))
plt.bar(class_names, class_counts, color=['skyblue', 'lightcoral'])
plt.title('Actual Class Distribution in Dataset')
plt.ylabel('Count')
plt.show()

40. Write a Python program to perform Recursive Feature Elimination (RFE) before training an SVM Classifier and
compare accuracy?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Breast Cancer data: feature matrix, labels and the feature names
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Hold out 30% of the samples for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. Baseline: linear SVM trained on every feature
svm_full = SVC(kernel='linear', random_state=42).fit(X_train_scaled, y_train)
y_pred_full = svm_full.predict(X_test_scaled)
acc_full = accuracy_score(y_test, y_pred_full)

# 2. Perform RFE
print("Performing Recursive Feature Elimination...")
n_features_to_select = 10  # Select top 10 features
svm_rfe = SVC(kernel='linear', random_state=42)
rfe = RFE(estimator=svm_rfe, n_features_to_select=n_features_to_select, step=1)
rfe.fit(X_train_scaled, y_train)

# Get selected features (column indices where the RFE support mask is True)
selected_features = np.flatnonzero(rfe.support_)
print("\nSelected Features ({} out of {}):".format(n_features_to_select, X.shape[1]))
for rank, idx in enumerate(selected_features, start=1):
    print(f"{rank}. {feature_names[idx]}")

# Train SVM with selected features (rfe.transform keeps only the selected columns)
X_train_selected = rfe.transform(X_train_scaled)
X_test_selected = rfe.transform(X_test_scaled)

svm_selected = SVC(kernel='linear', random_state=42)
svm_selected.fit(X_train_selected, y_train)
y_pred_selected = svm_selected.predict(X_test_selected)
acc_selected = accuracy_score(y_test, y_pred_selected)

# Compare results
print("\nModel Comparison:")
print(f"Full Feature Set Accuracy ({X.shape[1]} features): {acc_full:.4f}")
print(f"Selected Features Accuracy ({n_features_to_select} features): {acc_selected:.4f}")

print("\nFull Feature Set Classification Report:")
print(classification_report(y_test, y_pred_full))

print("\nSelected Features Classification Report:")
print(classification_report(y_test, y_pred_selected))

# Visual comparison
plt.figure(figsize=(10, 5))
plt.bar(['All Features', f'Selected {n_features_to_select} Features'],
        [acc_full, acc_selected], color=['blue', 'green'])
plt.ylim(0.8, 1.0)
plt.ylabel('Accuracy')
plt.title('SVM Performance: Before vs After RFE Feature Selection')
# Annotate each bar with its accuracy, then show the figure once
for i, v in enumerate([acc_full, acc_selected]):
    plt.text(i, v + 0.01, f"{v:.4f}", ha='center')
plt.show()

# Plot feature rankings
plt.figure(figsize=(12, 6))
ranking = rfe.ranking_
plt.barh(range(len(feature_names)), ranking, tick_label=feature_names)
plt.title('Feature Rankings (1 = selected)')
plt.xlabel('Ranking')
plt.tight_layout()
plt.show()

41. Write a Python program to train an SVM Classifier and evaluate its performance using Precision, Recall, and
F1-Score instead of accuracy?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target
target_names = data.target_names

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Preprocess the data.
# The scaler is fitted on the training split only; fitting it on the full
# dataset before splitting would leak test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train SVM classifier
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)

# Make predictions
y_pred = svm.predict(X_test)

# Calculate evaluation metrics.
# With their default arguments these scores treat label 1 as the positive class.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print metrics
print("SVM Classifier Performance Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Detailed classification report (per-class precision / recall / F1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

# Confusion matrix visualization (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Metrics comparison visualization
metrics = ['Precision', 'Recall', 'F1-Score']
values = [precision, recall, f1]

plt.figure(figsize=(8, 5))
bars = plt.bar(metrics, values, color=['blue', 'green', 'red'])
plt.ylim(0, 1.1)  # headroom above 1.0 so the value labels fit
plt.title('SVM Classifier Performance Metrics')
plt.ylabel('Score')

# Add values on top of bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height,
             f'{height:.4f}',
             ha='center', va='bottom')

# Show the figure once, after every bar has been annotated
plt.show()

42. Write a Python program to train a Naïve Bayes Classifier and evaluate its performance using Log Loss
(Cross-Entropy Loss)?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names
target_names = data.target_names

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features (scaler fitted on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train Naïve Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_scaled, y_train)

# Get predicted probabilities (one column per class) and class predictions
y_pred_proba = nb_classifier.predict_proba(X_test_scaled)
y_pred = nb_classifier.predict(X_test_scaled)

# Calculate evaluation metrics
logloss = log_loss(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)

print("Naïve Bayes Classifier Evaluation:")
print(f"Log Loss (Cross-Entropy): {logloss:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# Plot predicted probabilities distribution.
# Legend labels are taken from target_names so they cannot disagree with the
# dataset's label encoding (the previous hard-coded names were swapped).
plt.figure(figsize=(10, 6))
plt.hist(y_pred_proba[y_test == 0, 1], bins=30, alpha=0.5,
         label=f'Class 0 ({target_names[0]})', color='blue')
plt.hist(y_pred_proba[y_test == 1, 1], bins=30, alpha=0.5,
         label=f'Class 1 ({target_names[1]})', color='red')
plt.xlabel('Predicted Probability for Class 1')
plt.ylabel('Frequency')
plt.title('Distribution of Predicted Probabilities')
plt.legend()
plt.show()

# Confusion matrix (rows = actual class, columns = predicted class)
conf_mat = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Print probability calibration metrics.
# Log loss has no "target range": lower is better, 0 is a perfect score, and
# always predicting probability 0.5 yields ln(2) for a binary problem.
print("\nProbability Calibration:")
print(f"Lower log loss is better: 0 is perfect; always predicting 0.5 gives ln(2) = {np.log(2):.4f}")
print(f"Our Log Loss: {logloss:.4f}")

# Additional metrics
from sklearn.metrics import classification_report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))


# Manual log loss computation (for understanding)
def log_loss_components(y_true, y_pred_proba):
    """Return the mean binary cross-entropy computed by hand.

    Args:
        y_true: array of 0/1 labels.
        y_pred_proba: 2-D array of class probabilities; column 1 is read as
            the probability of class 1.

    Returns:
        The average of -[y*log(p) + (1-y)*log(1-p)] over all samples, as a
        single float.
    """
    eps = 1e-15  # to avoid log(0)
    y_pred_proba = np.clip(y_pred_proba, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_pred_proba[:, 1]) +
                    (1 - y_true) * np.log(1 - y_pred_proba[:, 1]))


ll_components = log_loss_components(y_test, y_pred_proba)
print(f"\nManually computed log loss (binary cross-entropy): {ll_components:.4f}")

43. Write a Python program to train an SVM Classifier and visualize the Confusion Matrix using seaborn?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Breast Cancer data: features, labels, and the class / feature names
data = datasets.load_breast_cancer()
X, y = data.data, data.target
target_names = data.target_names
feature_names = data.feature_names

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize in place, learning mean/variance from the training split only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# RBF-kernel SVM trained on the standardized training data
svm = SVC(kernel='rbf', random_state=42).fit(X_train, y_train)

# Predict the held-out samples and tabulate actual vs. predicted labels
y_pred = svm.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Create a beautiful confusion matrix visualization
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)  # Adjust font size (note: changes seaborn's global style)
heatmap = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                      cbar=False, linewidths=0.5, linecolor='gray',
                      xticklabels=target_names, yticklabels=target_names)

# Customize the plot
plt.title('SVM Classifier Confusion Matrix', pad=20, fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0)  # Keep y-axis labels horizontal

# Add accuracy and other metrics to the plot.
# Accuracy = correctly classified (diagonal of cm) / all samples.
accuracy = np.trace(cm) / np.sum(cm)
plt.text(0.5, -0.25,
         f'Accuracy: {accuracy:.2%}\n\n{classification_report(y_test, y_pred, target_names=target_names)}',
         ha='center', va='center', transform=plt.gca().transAxes,
         family='monospace',  # the report is column-aligned text
         bbox=dict(facecolor='white', alpha=0.8))

plt.tight_layout()
plt.show()

# Print classification report in console
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

# Additional: scatter the test samples over the first two features, colored
# by true class. No decision boundary is drawn here, so the title says so.
if X.shape[1] >= 2:
    plt.figure(figsize=(8, 6))
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap='coolwarm', alpha=0.6)
    plt.title('Test Samples by True Class (First Two Standardized Features)')
    plt.xlabel(feature_names[0])
    plt.ylabel(feature_names[1])
    plt.show()

44. Write a Python program to train an SVM Regressor (SVR) and evaluate its performance using Mean Absolute
Error (MAE) instead of MSE?


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
import seaborn as sns

# California housing data: features, target values, and feature names
data = fetch_california_housing()
X, y = data.data, data.target
feature_names = data.feature_names

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel support vector regressor
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1).fit(X_train_scaled, y_train)

# Predictions for the held-out samples
y_pred = svr.predict(X_test_scaled)

# Error metrics (MAE is the headline metric; the rest are for context)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report every metric with four decimals
print("SVR Performance Metrics:")
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R²) Score", r2),
):
    print(f"{label}: {value:.4f}")

# Actual vs. predicted scatter; the dashed diagonal marks a perfect prediction
value_range = [y.min(), y.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(value_range, value_range, 'k--', lw=2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('SVR: Actual vs Predicted Values')
plt.grid(True)
plt.show()

# Residuals (actual minus predicted) against the predictions
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.grid(True)
plt.show()

# Feature importance (for linear kernel).
# This branch only runs when the SVR above is built with kernel='linear';
# coef_ is not available for other kernels.
if svr.kernel == 'linear':
    plt.figure(figsize=(10, 6))
    importance = svr.coef_[0]
    # Order features by absolute coefficient, largest first
    sorted_idx = np.argsort(np.abs(importance))[::-1]
    plt.barh(range(X.shape[1]), importance[sorted_idx], align='center')
    # feature_names is a plain list here, so convert it before indexing it
    # with an index array (list[ndarray] raises TypeError)
    plt.yticks(range(X.shape[1]), np.asarray(feature_names)[sorted_idx])
    plt.xlabel('Coefficient Value')
    plt.title('Feature Importance (Linear SVR)')
    plt.tight_layout()
    plt.show()

45. Write a Python program to train a Naïve Bayes classifier and evaluate its performance using the ROC-AUC
score?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# classification_report is used below but is missing from this cell's
# sklearn.metrics import line, so import it here.
from sklearn.metrics import classification_report

# Load the Breast Cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target
target_names = data.target_names

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features (scaler fitted on the training split only to avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train Naïve Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_scaled, y_train)

# Get predicted probabilities for the positive class (class 1)
y_pred_proba = nb_classifier.predict_proba(X_test_scaled)[:, 1]

# Calculate ROC-AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Compute ROC curve (the points are also used to pick a threshold later on)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Plot ROC curve
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guessing')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('Receiver Operating Characteristic (ROC) Curve\nNaïve Bayes Classifier')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

# Print evaluation metrics
print(f"ROC-AUC Score: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, nb_classifier.predict(X_test_scaled), target_names=target_names))

# Additional: Plot probability distributions.
# Legend labels are taken from target_names so they cannot disagree with the
# dataset's label encoding (the previous hard-coded names were swapped).
plt.figure(figsize=(10, 6))
plt.hist(y_pred_proba[y_test == 0], bins=30, alpha=0.5,
         label=f'Class 0 ({target_names[0]})', color='blue')
plt.hist(y_pred_proba[y_test == 1], bins=30, alpha=0.5,
         label=f'Class 1 ({target_names[1]})', color='red')
plt.xlabel('Predicted Probability for Class 1')
plt.ylabel('Frequency')
plt.title('Distribution of Predicted Probabilities')
plt.legend()
plt.show()

# confusion_matrix is used below but is missing from this cell's
# sklearn.metrics import line, so import it here.
from sklearn.metrics import confusion_matrix

# Confusion matrix with optimal threshold.
# "Optimal" here means the ROC point that maximizes TPR - FPR. The threshold
# is chosen on the test set itself, so the resulting matrix is optimistic.
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)

plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred_optimal)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Confusion Matrix (Threshold = {optimal_threshold:.2f})')
plt.show()

46. Write a Python program to train an SVM Classifier and visualize the Precision-Recall Curve?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.preprocessing import StandardScaler

# Load the Breast Cancer dataset (binary classification)
data = datasets.load_breast_cancer()
X, y = data.data, data.target
target_names = data.target_names

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize features (important for SVM).
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics enter the transformation.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit an RBF-kernel SVM; probability=True makes predict_proba available.
svm_clf = svm.SVC(kernel='rbf', probability=True, random_state=42).fit(X_train, y_train)

# Keep only column 1 of predict_proba: the probability of the positive class.
y_scores = svm_clf.predict_proba(X_test)[:, 1]

# Summarize ranking quality as a single number, then compute the full curve.
average_precision = average_precision_score(y_test, y_scores)
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Plot the Precision-Recall curve
# Every statement in this section starts at column 0: the original carried the
# continuation indent of the preceding multi-line call onto the following
# statements, which is an IndentationError.
plt.figure(figsize=(10, 8))
plt.step(recall, precision, where='post', color='b', alpha=0.8,
         label=f'SVM (AP = {average_precision:.2f})')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

# Add baseline (random classifier): the fraction of positive samples in the
# test set, drawn as a horizontal reference line.
baseline = len(y_test[y_test == 1]) / len(y_test)
plt.axhline(y=baseline, color='r', linestyle='--',
            label=f'Baseline (AP = {baseline:.2f})')

# Format the plot
plt.xlabel('Recall (Sensitivity)', fontsize=12)
plt.ylabel('Precision (Positive Predictive Value)', fontsize=12)
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve\nSVM Classifier on Breast Cancer Dataset', fontsize=14)
plt.legend(loc='upper right', fontsize=12)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print the optimal threshold (maximizing F1-score)
# precision_recall_curve returns one more precision/recall value than
# thresholds: the final point (precision=1, recall=0) has no threshold.
# Dropping it keeps every F1 index valid for `thresholds`. Its F1 is 0, so the
# selected index is the same as when it is included.
pr_precision = precision[:-1]
pr_recall = recall[:-1]
# The small epsilon avoids division by zero when precision and recall are both 0.
f1_scores = 2 * (pr_precision * pr_recall) / (pr_precision + pr_recall + 1e-8)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold:.4f}")
print(f"Optimal Precision: {precision[optimal_idx]:.4f}")
print(f"Optimal Recall: {recall[optimal_idx]:.4f}")
print(f"Optimal F1-score: {f1_scores[optimal_idx]:.4f}")

# Confusion matrix at optimal threshold
# These imports are kept inside the cell because its import header does not
# provide confusion_matrix or seaborn.
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Label a sample as positive when its predicted probability reaches the threshold.
y_pred = (y_scores >= optimal_threshold).astype(int)
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Confusion Matrix at Threshold = {optimal_threshold:.2f}')
plt.show()