# Prg 1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv(r'./housing.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
df['total_bedrooms'].median()

In [None]:
# Handling missing values: impute missing bedroom counts with the column median.
# The filled column is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place call is deprecated
# and leaves `df` unchanged when pandas copy-on-write is active.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

In [None]:
# Cast the five columns at positions 2-6 to integers, one column at a time.
# NOTE(review): assumes none of these columns still contains NaN — confirm.
for column_name in df.columns[2:7]:
    df[column_name] = df[column_name].astype('int')

In [None]:
df.head()

In [None]:
 df.describe().T

In [None]:
Numerical = df.select_dtypes(include=[np.number]).columns
print(Numerical)

In [None]:
# One histogram per numeric column, each drawn on its own figure.
for column in Numerical:
    fig, ax = plt.subplots(figsize=(10, 6))
    df[column].plot(kind='hist', title=column, bins=60, edgecolor='black', ax=ax)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# One vertical box plot per numeric column for outlier inspection.
# The series is passed as `y=` explicitly: the meaning of a bare positional
# argument differs between seaborn releases (x vs. data), which can flip the
# orientation and leave the value axis carrying the wrong label.
for col in Numerical:
    plt.figure(figsize=(6, 6))
    sns.boxplot(y=df[col], color='blue')
    plt.title(col)
    plt.ylabel(col)
    plt.show()

# Prg 2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
# Load California Housing dataset
data = fetch_california_housing()
# Convert to DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Target'] = data.target # Adding the target variable (median house value)
# df.head()

In [None]:
# Table of Meaning of Each Variable
variable_meaning = {
"MedInc": "Median income in block group",
"HouseAge": "Median house age in block group",
"AveRooms": "Average number of rooms per household",
"AveBedrms": "Average number of bedrooms per household",
"Population": "Population of block group",
"AveOccup": "Average number of household members",
"Latitude": "Latitude of block group",
"Longitude": "Longitude of block group",
"Target": "Median house value (in $100,000s)"
}
variable_df = pd.DataFrame(list(variable_meaning.items()), columns=["Feature", "Description"])
print("\nVariable Meaning Table:")
print(variable_df)

In [None]:
# Basic Data Exploration
print("\nBasic Information about Dataset:")
print(df.info()) # Overview of dataset
print("\nFirst Five Rows of Dataset:")
print(df.head()) # Display first few rows

In [None]:
# Summary Statistics
print("\nSummary Statistics:")
print(df.describe()) # Summary statistics of dataset

In [None]:
summary_explanation = """
The summary statistics table provides key percentiles and other descriptive metrics
- **25% (First Quartile - Q1):** This represents the value below which 25% of the d
- **50% (Median - Q2):** This is the middle value when the data is sorted. It provi
- **75% (Third Quartile - Q3):** This represents the value below which 75% of the d
- These percentiles are useful for detecting skewness, data distribution, and ident
"""
print("\nSummary Statistics Explanation:")
print(summary_explanation)

In [None]:
# Check for missing values
print("\nMissing Values in Each Column:")
print(df.isnull().sum()) # Count of missing values

In [None]:
# Histograms for distribution of features.
# DataFrame.hist creates its own figure (sized by `figsize`), so no separate
# plt.figure() call is made here; one would only leave an extra, empty figure
# in the cell output.
df.hist(figsize=(12, 8), bins=30, edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

In [None]:
# Boxplots for outlier detection: every column of df drawn on one shared axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title("Boxplots of Features to Identify Outliers")
plt.show()

In [None]:
# Pairwise correlations between all columns, shown as an annotated heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Pairplot to analyze feature relationships (only a subset for clarity)
sns.pairplot(df[['MedInc', 'HouseAge', 'AveRooms', 'Target']], diag_kind='kde')
plt.show()
# Insights from Data Exploration
print("\nKey Insights:")
print("1. The dataset has", df.shape[0], "rows and", df.shape[1], "columns.")
print("2. No missing values were found in the dataset.")
print("3. Histograms show skewed distributions in some features like 'MedInc'.")
print("4. Boxplots indicate potential outliers in 'AveRooms' and 'AveOccup'.")
print("5. Correlation heatmap shows 'MedInc' has the highest correlation with house")

# Prg 3

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Step 1: Load the Iris dataset bundled with scikit-learn
iris = datasets.load_iris()
X = iris.data # Feature matrix (4 features per sample)
y = iris.target # Integer class labels (0, 1, 2 representing three iris species)
# Step 2: Standardize the data
# Each feature is rescaled to mean 0 and unit variance so that no feature
# dominates the covariance structure merely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Step 3: Covariance matrix and its eigen-decomposition
# np.cov treats rows as variables by default, hence the transpose of the
# (samples, features) array.
cov_matrix = np.cov(X_scaled.T)
print(cov_matrix)
# np.linalg.eig returns the eigenvectors as the COLUMNS of `eigenvectors`
# (eigenvectors[:, k] belongs to eigenvalues[k]); the eigenvalues are not
# guaranteed to be returned in sorted order.
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
print("Eigenvalues:", eigenvalues)
print("Eigenvectors:\n", eigenvectors)
# Step 4: 3D scatter of the first three standardized features, one colour per species
colors = ['red', 'green', 'blue']
labels = iris.target_names
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
for class_id, (color, species) in enumerate(zip(colors, labels)):
    members = X_scaled[y == class_id]
    ax.scatter(members[:, 0], members[:, 1], members[:, 2], color=color, label=species)
ax.set_xlabel('Sepal Length')
ax.set_ylabel('Sepal Width')
ax.set_zlabel('Petal Length')
ax.set_title('3D Visualization of Iris Data Before PCA')
plt.legend()
plt.show()
# Step 5: Applying PCA using SVD (Singular Value Decomposition)
# PCA internally relies on SVD, which decomposes a matrix into three parts: U, S, and Vt.
# full_matrices=False requests the reduced (economy-size) factors.
U, S, Vt = np.linalg.svd(X_scaled, full_matrices=False)
print("Singular Values:", S)
# Step 6: Applying PCA to Reduce Dimensionality to 2D
# We reduce 4D data to 2D for visualization while retaining maximum variance
pca = PCA(n_components=2) # Two components so the result can be drawn on a 2D plot
X_pca = pca.fit_transform(X_scaled) # Project the standardized data onto the principal components
# Step 7: Understanding Variance Explained
# explained_variance_ratio_ holds the fraction of total variance captured by each component
explained_variance = pca.explained_variance_ratio_
print(f"Explained Variance by PC1: {explained_variance[0]:.2f}")
print(f"Explained Variance by PC2: {explained_variance[1]:.2f}")
# Step 8: 2D scatter of the PCA projection, one colour per species
fig_2d, ax_2d = plt.subplots(figsize=(8, 6))
for class_id, color in enumerate(colors):
    in_class = y == class_id
    ax_2d.scatter(X_pca[in_class, 0], X_pca[in_class, 1], color=color, label=labels[class_id])
ax_2d.set_xlabel('Principal Component 1')
ax_2d.set_ylabel('Principal Component 2')
ax_2d.set_title('PCA on Iris Dataset (Dimensionality Reduction)')
ax_2d.legend()
ax_2d.grid()
plt.show()
            # Step 9: Visualizing Eigenvectors Superimposed on 3D Data
# Draw the standardized data again (first three features) and overlay the
# covariance eigenvectors, restricted to those same three feature axes.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
for i in range(len(colors)):
    ax.scatter(X_scaled[y == i, 0], X_scaled[y == i, 1], X_scaled[y == i, 2], color=colors[i], label=labels[i])
# np.linalg.eig stores eigenvectors as columns: eigenvectors[:, k] is the k-th
# eigenvector, so its components along the three plotted features are
# eigenvectors[0, k], eigenvectors[1, k] and eigenvectors[2, k].
# Indexing eigenvectors[k, 0..2] would draw rows of the matrix, which are not
# eigenvectors.
for k in range(3): # Plot first three eigenvectors
    ax.quiver(0, 0, 0, eigenvectors[0, k], eigenvectors[1, k], eigenvectors[2, k], color='black', length=1)
ax.set_xlabel('Sepal Length')
ax.set_ylabel('Sepal Width')
ax.set_zlabel('Petal Length')
ax.set_title('3D Data with Eigenvectors')
plt.legend()
plt.show()
# Recap:
# - The Iris dataset is historically important for testing classification models.
# - We standardized the data to ensure fair comparison across features.
# - We calculated the covariance matrix, eigenvalues, and eigenvectors.
# - PCA is built on SVD, which decomposes data into important components.
# - We visualized the original 3D data and superimposed eigenvectors.
# - We applied PCA to reduce the dimensionality from 4D to 2D.
# - Finally, we visualized the transformed data in 2D space.

# Prg 4

In [None]:
import pandas as pd
data = pd.read_csv(r"./training_data2.csv")

In [None]:
data

In [None]:
def find_s_algorithm(data):
    """Run the Find-S algorithm and return the most specific consistent hypothesis.

    Parameters
    ----------
    data : pandas.DataFrame
        Training examples. Every column except the last is an attribute; the
        last column is the class label, where the string "Yes" marks a
        positive example. Rows with any other label are ignored.

    Returns
    -------
    numpy.ndarray
        Object array with one entry per attribute: the value shared by all
        positive examples, or '?' where the positive examples disagree.

    Raises
    ------
    ValueError
        If `data` contains no positive ("Yes") example, since there is then
        nothing to initialise the hypothesis from.
    """
    # Boolean mask of the positive rows, taken from the last column.
    positive_mask = (data.iloc[:, -1] == "Yes").to_numpy()
    # Attribute values (all columns except the last) of the positive rows only.
    positive_examples = data.iloc[:, :-1].to_numpy()[positive_mask]
    if len(positive_examples) == 0:
        raise ValueError('find_s_algorithm needs at least one positive ("Yes") example')
    # Step 1: start from the first positive example. astype(object) makes an
    # independent copy that can hold '?' even when the attributes are numeric.
    hypothesis = positive_examples[0].astype(object)
    # Step 2: generalize every attribute on which a later positive example disagrees.
    for example in positive_examples[1:]:
        for j in range(len(hypothesis)):
            if hypothesis[j] != example[j]:
                hypothesis[j] = '?' # Generalize inconsistent attributes
    return hypothesis
# Run Find-S Algorithm
final_hypothesis = find_s_algorithm(data)
# Print the learned hypothesis
print("Most Specific Hypothesis:", final_hypothesis)


# Prg 5

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Step 1: Generate dataset
np.random.seed(42)
values = np.random.rand(100)

In [None]:
# Label each of the first 50 values by thresholding at 0.5.
labels = ['Class1' if value <= 0.5 else 'Class2' for value in values[:50]]

In [None]:
# Pad with 50 unlabeled (None) entries for the remaining points.
# The list is rebuilt from its first 50 labels so that re-running this cell
# cannot keep growing it beyond the 100 values it has to line up with.
labels = labels[:50] + [None] * 50

In [None]:
print(labels)

In [None]:
data = {
    "Point": [f"x{i+1}" for i in range(100)],
    "Value": values,
    "Label": labels
}

In [None]:
df = pd.DataFrame(data)

In [None]:
df.head()

In [None]:
# Table of Meaning of Each Variable
variable_meaning = {
    "Point": "The point number",
    "Value": "The value of the point",
    "Label": "The class of the point"
}
variable_df = pd.DataFrame(list(variable_meaning.items()), columns=["Feature", "Description"])
print("\nVariable Meaning Table:")
print(variable_df)

In [None]:
df.nunique()

In [None]:
df.shape

In [None]:
# Basic Data Exploration
print("\nBasic Information about Dataset:")
df.info()

In [None]:
print("\nSummary Statistics:")
df.describe().T

In [None]:
Summary_Statistics="""
- The 'Value' column has a mean of approximately 0.47, indicating that the values a
- The standard deviation of the 'Value' column is approximately 0.29, showing a mod
- The minimum value in the 'Value' column is approximately 0.0055, and the maximum
- The first quartile (25th percentile) is approximately 0.19, the median (50th perc"""
print(Summary_Statistics)

In [None]:
# Check for missing values
print("\nMissing Values in Each Column:")
df.isnull().sum()

In [None]:
# Get numeric columns
num_col = df.select_dtypes(include=['int', 'float']).columns
# Histograms for distribution of features
df[num_col].hist(figsize=(12, 8), bins=30, edgecolor='black')
# Title and labels
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

In [None]:
# Inference for the above graph
inference = """
- The histograms for the distribution of features show that the values are uniforml
- This is expected as the values were generated using a uniform random distribution
- There are no significant outliers or skewness in the data, indicating that the da
"""
print(inference)

In [None]:
# Split data into labeled and unlabeled
labeled_df = df[df["Label"].notna()]
X_train = labeled_df[["Value"]]
y_train = labeled_df["Label"]

In [None]:
# Rows without a label form the set to classify.
# .copy() makes unlabeled_df an independent frame, so the per-k prediction
# columns assigned to it later do not raise SettingWithCopyWarning or depend on
# whether the selection is a view of `df`.
unlabeled_df = df[df["Label"].isna()].copy()
X_test = unlabeled_df[["Value"]]

In [None]:
# Generate true labels for testing (for accuracy calculation)
true_labels = ["Class1" if x <= 0.5 else "Class2" for x in values[50:]]

In [None]:
# Step 2: Perform KNN classification for different values of k
k_values = [1, 2, 3, 4, 5, 20, 30]
results = {}
accuracies = {}

In [None]:
# Fit and score one KNN classifier per candidate k.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    results[k] = predictions
    # Accuracy (as a percentage) against the labels derived from the 0.5 threshold
    accuracy = accuracy_score(true_labels, predictions) * 100
    accuracies[k] = accuracy
    print(f"Accuracy for k={k}: {accuracy:.2f}%")
    # Store this k's predictions as a new column next to the unlabeled points
    unlabeled_df[f"Label_k{k}"] = predictions

In [None]:
# Inference for the KNN classification results
knn_inference = """
- The KNN classification was performed for different values of k: 1, 2, 3, 4, 5, 20
- The accuracy of the classification varied with the value of k.
- For smaller values of k (1, 2, 3, 4, 5), the accuracy was relatively high, indica
- As the value of k increased to 20 and 30, the accuracy decreased, suggesting that
- This is expected as higher values of k can lead to over-smoothing, where the mode
- Overall, the KNN classifier performed well for smaller values of k, with the high
"""
print(knn_inference)

In [None]:
print(predictions)

In [None]:
# Unlabeled points with their per-k predictions; the original 'Label' column is left out.
df1 = unlabeled_df.drop(columns='Label')
df1

In [None]:
# Display accuracies
print("\nAccuracies for different k values:")
for k, acc in accuracies.items():
    print(f"k={k}: {acc:.2f}%")

# Prg 6

In [None]:
import numpy as np
import matplotlib.pyplot as plt
def gaussian_kernel(x, x_query, tau):
    """Return the Gaussian weights exp(-(x - x_query)^2 / (2 * tau^2)).

    `x` may be a scalar or a NumPy array (the result has the same shape);
    `tau` is the bandwidth: smaller values concentrate the weight on points
    close to `x_query`.
    """
    return np.exp(- (x - x_query) ** 2 / (2 * tau ** 2))
def locally_weighted_regression(X, y, x_query, tau):
    """Predict y at `x_query` with a Gaussian-weighted linear fit.

    Parameters
    ----------
    X : 1-D array of training inputs.
    y : 1-D array of training targets, same length as `X`.
    x_query : scalar point at which to predict.
    tau : kernel bandwidth passed to `gaussian_kernel`.

    Returns
    -------
    The value at `x_query` of the line fitted by weighted least squares.
    """
    X_b = np.c_[np.ones(len(X)), X] # Add bias term (Intercept)
    x_query_b = np.array([1, x_query]) # Query point with bias term
    W = np.diag(gaussian_kernel(X, x_query, tau)) # Diagonal weight matrix
    # theta = (X^T W X)^+ X^T W y. The pseudo-inverse is used instead of a plain
    # inverse because X^T W X becomes singular when a small tau makes all but
    # one weight underflow to zero; np.linalg.inv would raise LinAlgError there.
    theta = np.linalg.pinv(X_b.T @ W @ X_b) @ X_b.T @ W @ y
    return x_query_b @ theta # Return prediction
# Dataset
X = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 1.3, 3.75, 2.25])
# Query point
x_query = 3 # Point at which we perform LWR
# Bandwidth parameter
tau = 1.0
# Compute prediction
y_pred = locally_weighted_regression(X, y, x_query, tau)
# Visualizing
plt.figure(figsize=(8, 6))
plt.scatter(X, y, color='blue', label='Data Points')
plt.scatter(x_query, y_pred, color='red', label=f'Prediction at x={x_query}')
# Plot weights effect
weights = gaussian_kernel(X, x_query, tau)
for i in range(len(X)):
    plt.plot([X[i], X[i]], [y[i], y[i] - weights[i]], 'k-', lw=1)
    plt.scatter(X[i], y[i], s=weights[i] * 200, color='green', alpha=0.5)
plt.title("Locally Weighted Regression (LWR)")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
def gaussian_kernel(x, x_query, tau):
    """Return the Gaussian weights exp(-(x - x_query)^2 / (2 * tau^2)).

    `x` may be a scalar or a NumPy array (the result has the same shape);
    `tau` is the bandwidth: smaller values concentrate the weight on points
    close to `x_query`.
    """
    return np.exp(- (x - x_query) ** 2 / (2 * tau ** 2))
def locally_weighted_regression(X, y, x_query, tau):
    """Predict y at `x_query` with a Gaussian-weighted linear fit.

    Parameters
    ----------
    X : 1-D array of training inputs.
    y : 1-D array of training targets, same length as `X`.
    x_query : scalar point at which to predict.
    tau : kernel bandwidth passed to `gaussian_kernel`.

    Returns
    -------
    The value at `x_query` of the line fitted by weighted least squares.
    """
    X_b = np.c_[np.ones(len(X)), X] # Add bias term (Intercept)
    x_query_b = np.array([1, x_query]) # Query point with bias term
    W = np.diag(gaussian_kernel(X, x_query, tau)) # Diagonal weight matrix
    # theta = (X^T W X)^+ X^T W y. The pseudo-inverse is used instead of a plain
    # inverse because X^T W X becomes singular when a small tau makes all but
    # one weight underflow to zero; np.linalg.inv would raise LinAlgError there.
    theta = np.linalg.pinv(X_b.T @ W @ X_b) @ X_b.T @ W @ y
    return x_query_b @ theta # Return prediction
# Complex Dataset
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 3, 2, 4, 3.5, 5, 6, 7, 6.5, 8])
# Query points for LWR
X_query = np.linspace(1, 10, 100)
tau = 1.0 # Bandwidth parameter
# Compute LWR predictions
y_lwr = np.array([locally_weighted_regression(X, y, x_q, tau) for x_q in X_query])
# Simple Linear Regression
lin_reg = LinearRegression()
X_reshaped = X.reshape(-1, 1)
lin_reg.fit(X_reshaped, y)
y_lin = lin_reg.predict(X_query.reshape(-1, 1))
# Visualizing
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Data Points')
plt.plot(X_query, y_lin, color='black', linestyle='dashed', label='Simple Linear Regression')
plt.plot(X_query, y_lwr, color='red', label='Locally Weighted Regression')
plt.title("Comparison: Simple Linear Regression vs. Locally Weighted Regression")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
def gaussian_kernel(x, x_query, tau):
    """Gaussian weight of each point in `x` relative to `x_query`; `tau` is the bandwidth."""
    squared_distance = (x - x_query) ** 2
    return np.exp(- squared_distance / (2 * tau ** 2))
def locally_weighted_regression(X, y, x_query, tau):
    """Predict y at `x_query` from a Gaussian-weighted least-squares line.

    `X` and `y` are equal-length 1-D arrays of training inputs and targets,
    `x_query` is the scalar prediction point and `tau` the kernel bandwidth.
    The normal equations are solved with the pseudo-inverse, so a singular
    weighted matrix (very small `tau`) does not raise.
    """
    design = np.c_[np.ones(len(X)), X] # Column of ones supplies the intercept
    weights = np.diag(gaussian_kernel(X, x_query, tau)) # Diagonal weight matrix
    weighted_design_t = design.T @ weights
    normal_matrix = weighted_design_t @ design
    # theta = (X^T W X)^+ X^T W y
    theta = np.linalg.pinv(normal_matrix) @ design.T @ weights @ y
    query_with_bias = np.array([1, x_query])
    return query_with_bias @ theta
# Complex Dataset
X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
y = np.array([1, 3, 2, 4, 3.5, 5, 6, 7, 6.5, 8])
# Query points for LWR
X_query = np.linspace(1, 10, 100)
tau_values = [0.1, 0.5, 1.0, 5.0, 10.0] # Different bandwidth values
# Simple Linear Regression
lin_reg = LinearRegression()
X_reshaped = X.reshape(-1, 1)
lin_reg.fit(X_reshaped, y)
y_lin = lin_reg.predict(X_query.reshape(-1, 1))
# Visualizing
plt.figure(figsize=(12, 8))
plt.scatter(X, y, color='blue', label='Data Points')
plt.plot(X_query, y_lin, color='black', linestyle='dashed', label='Simple Linear Regression')
# Plot LWR for different tau values
colors = ['red', 'green', 'purple', 'orange', 'brown']
for tau, color in zip(tau_values, colors):
    y_lwr = np.array([locally_weighted_regression(X, y, x_q, tau) for x_q in X_query])
    plt.plot(X_query, y_lwr, color=color, label=f'LWR (τ={tau})')
plt.title("Effect of Different τ Values in Locally Weighted Regression")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()

# Prg 7

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [None]:
data = pd.read_csv(r'./Bostonhousingdataset.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.nunique()

In [None]:
data.CHAS.unique()

In [None]:
data.ZN.unique()

In [None]:
data.isnull().sum()

In [None]:
data.duplicated().sum()

In [None]:
df = data.copy()

In [None]:
# Impute missing values column by column: mean for CRIM, ZN and INDUS, the most
# frequent value for CHAS, and the median for AGE and LSTAT (the median is less
# sensitive to outliers than the mean).
# A single DataFrame.fillna with a per-column mapping replaces the chained
# `df[col].fillna(..., inplace=True)` calls, which are deprecated and leave
# `df` unchanged when pandas copy-on-write is active.
df = df.fillna({
    'CRIM': df['CRIM'].mean(),
    'ZN': df['ZN'].mean(),
    'CHAS': df['CHAS'].mode()[0],
    'INDUS': df['INDUS'].mean(),
    'AGE': df['AGE'].median(),
    'LSTAT': df['LSTAT'].median(),
})

In [None]:
df.isnull().sum()

In [None]:
df.head()

In [None]:
df['CHAS'] = df['CHAS'].astype('int')

In [None]:
df.describe().T

In [None]:
# For every column: histogram on the left, horizontal box plot on the right.
for column in df.columns:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(6, 3))

    df[column].hist(bins=20, alpha=0.5, color='b', edgecolor='black', ax=hist_ax)
    hist_ax.set_title(f'Histogram of {column}')
    hist_ax.set_xlabel(column)
    hist_ax.set_ylabel('Frequency')

    box_ax.boxplot(df[column], vert=False)
    box_ax.set_title(f'Boxplot of {column}')

    plt.show()

In [None]:
corr = df.corr(method='pearson')

plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.xticks(rotation=90, ha='right')
plt.yticks(rotation=0)
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
X = df.drop('MEDV', axis=1) # All columns except 'MEDV'
y = df['MEDV'] # Target variable

In [None]:
# Scale the features
scale = StandardScaler()
X_scaled = scale.fit_transform(X)

In [None]:
# Split the data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled , y, test_size=0.2, random_state=42)

In [None]:
# Initialize the linear regression model
model = LinearRegression()
# Fit the model on the training data
model.fit(X_train, y_train)

In [None]:
# Predict on the test set
y_pred = model.predict(X_test)
y_pred

In [None]:
# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
# Calculate R-squared value
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')

# Prg 8

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import export_graphviz
from IPython.display import Image

import warnings
warnings.filterwarnings('ignore')

In [None]:
data = pd.read_csv(r'./WisconsinBreastCancerdataset.csv')

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.diagnosis.unique()

In [None]:
data.duplicated().sum()

In [None]:
df = data.drop(['id', 'Unnamed: 32'], axis=1)

In [None]:
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0}) # Malignant:1, Benign:0

In [None]:
df.describe().T

In [None]:
X = df.drop('diagnosis', axis=1) # Drop the 'diagnosis' column (target)
y = df['diagnosis']

In [None]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the decision tree model
# criterion may be 'gini' or 'entropy'. random_state is fixed because the tree
# builder permutes features at each split and breaks ties between equally good
# splits at random; without it the fitted tree and the reported metrics can
# differ from run to run.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X_train, y_train)
model

In [None]:
import math


def entropy(column):
    """Return the Shannon entropy, in bits, of the values in a pandas Series.

    Probabilities are the value counts divided by the length of `column`.
    """
    counts = column.value_counts()
    probabilities = counts / len(column)
    # Zero-probability entries (e.g. unused categories) contribute nothing and
    # are skipped so that log2(0) is never evaluated.
    return -sum(p * math.log2(p) for p in probabilities if p > 0)


def conditional_entropy(data, X, target):
    """Return H(target | X): the entropy of `target` within each value of column `X`,
    averaged with weights equal to each value's share of the rows.

    Parameters
    ----------
    data : pandas.DataFrame holding both columns.
    X : name of the feature column to condition on.
    target : name of the target column.
    """
    feature_values = data[X].unique()
    weighted_entropy = 0
    for value in feature_values:
        # Select on the column named by the parameter `X`; reading a global loop
        # variable here would silently condition on whatever column that global
        # happens to name (or fail if it is undefined).
        subset = data[data[X] == value]
        weighted_entropy += (len(subset) / len(data)) * entropy(subset[target])
    return weighted_entropy


def information_gain(data, X, target):
    """Return the information gain of column `X` about `target`: H(target) - H(target | X)."""
    total_entropy = entropy(data[target])
    feature_conditional_entropy = conditional_entropy(data, X, target)
    return total_entropy - feature_conditional_entropy
# Calculate information gain for each feature
for feature in X:
    ig = information_gain(df,feature,'diagnosis')
    print(f"Information Gain for {feature}: {ig}")


In [None]:
# Visualize the fitted decision tree (optional).
# feature_names is converted to a plain list: some scikit-learn releases
# validate this parameter strictly and reject a pandas Index.
plt.figure(figsize=(12, 8))
plot_tree(model, filled=True, feature_names=list(X.columns), class_names=['Benign', 'Malignant'])
plt.show()

In [None]:
y_pred = model.predict(X_test)
y_pred

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred) * 100
classification_rep = classification_report(y_test, y_pred)
# Print the results
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

In [None]:
df.head(1)

In [None]:
# Classify one hand-written sample of 30 feature values.
# NOTE(review): the values are assumed to be listed in the same order as the
# columns of X — confirm against df.head(1).
new = [[12.5, 19.2, 80.0, 500.0, 0.085, 0.1, 0.05, 0.02, 0.17, 0.06,
    0.4, 1.0, 2.5, 40.0, 0.006, 0.02, 0.03, 0.01, 0.02, 0.003,
    16.0, 25.0, 105.0, 900.0, 0.13, 0.25, 0.28, 0.12, 0.29, 0.08]]
# Wrap the sample in a DataFrame carrying the training column names so the model
# receives the same feature names it was fitted with (a bare list triggers a
# "X does not have valid feature names" warning, and a length mismatch is
# reported here rather than inside predict).
new_sample = pd.DataFrame(new, columns=X.columns)
# A separate name keeps the test-set predictions in y_pred intact.
new_pred = model.predict(new_sample)
# Output the prediction (0 = Benign, 1 = Malignant)
if new_pred[0] == 0:
    print("Prediction: Benign")
else:
    print("Prediction: Malignant")

# Prg 9

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import fetch_olivetti_faces
data = fetch_olivetti_faces()

In [None]:
data.keys()

In [None]:
# Basic facts about the Olivetti faces dataset.
print("Data Shape:", data.data.shape)
print("Target Shape:", data.target.shape)
print("There are {} unique persons in the dataset".format(len(np.unique(data.target))))
# data.images is indexed (sample, row, column): report axis 1 and axis 2 rather
# than axis 1 twice, so a non-square image would be described correctly.
print("Size of each image is {}x{}".format(data.images.shape[1], data.images.shape[2]))

In [None]:
def print_faces(images, target, top_n):
    """Show the first `top_n` images on a square grid, annotated with label and index.

    Parameters
    ----------
    images : sequence/array of 2-D images; images[i] is passed to imshow.
    target : sequence of labels, target[i] being the label of images[i].
    top_n : maximum number of images to show; capped at len(images).
        Nothing is drawn when the resulting count is zero or negative.
    """
    # Ensure the number of images does not exceed available data
    top_n = min(top_n, len(images))
    if top_n <= 0:
        # A 0x0 subplot grid is invalid, so there is nothing to draw.
        return
    # Smallest square grid that holds top_n images
    grid_size = int(np.ceil(np.sqrt(top_n)))
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid; otherwise a
    # single Axes object is returned and axes.ravel() fails.
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(15, 15), squeeze=False)
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.2, wspace=0.2)
    for i, ax in enumerate(axes.ravel()):
        if i < top_n:
            ax.imshow(images[i], cmap='bone')
            ax.text(2, 12, str(target[i]), fontsize=9, color='red')
            ax.text(2, 55, f"face: {i}", fontsize=9, color='blue')
        # Unused grid cells are blanked as well
        ax.axis('off')
    plt.show()


In [None]:
print_faces(data.images,data.target,400)

In [None]:
# Show one sample image for each of the distinct persons in the dataset
def display_unique_faces(pics):
    """Plot every tenth image of `pics` (indices 9, 19, ..., 399) on a 4x10 grid.

    `pics` is indexed as pics[index, :, :]. Grid positions whose image index
    falls outside `pics` are left empty.
    NOTE(review): presumably each person owns 10 consecutive images, so each
    selected image shows a different person — confirm against data.target.
    """
    columns, rows = 10, 4 # Grid dimensions
    fig = plt.figure(figsize=(24, 10))
    for position in range(1, columns * rows + 1):
        img_index = 10 * position - 1 # Last image of each block of ten
        if img_index >= pics.shape[0]:
            continue # No such image; leave this grid position empty
        ax = fig.add_subplot(rows, columns, position)
        ax.imshow(pics[img_index, :, :], cmap='gray')
        ax.set_title(f"Person {position}", fontsize=14)
        ax.axis('off')
    plt.suptitle("There are 40 distinct persons in the dataset", fontsize=24)
    plt.show()

In [None]:
display_unique_faces(data.images)

In [None]:
from sklearn.model_selection import train_test_split
X = data.data
Y = data.target
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)
print("x_train: ",x_train.shape)
print("x_test: ",x_test.shape)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
# Train the model
nb = GaussianNB()
nb.fit(x_train, y_train)
# Predict the test set results
y_pred = nb.predict(x_test)
# Calculate accuracy
nb_accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
# Display accuracy result
print(f"Naive Bayes Accuracy: {nb_accuracy}%")

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
# Initialize and fit Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(x_train, y_train)
# Predict the test set results
y_pred = nb.predict(x_test)
# Calculate accuracy
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"Multinomial Naive Bayes Accuracy: {accuracy}%")

In [None]:
# Calculate the number of misclassified images
misclassified_idx = np.where(y_pred != y_test)[0]
num_misclassified = len(misclassified_idx)
# Print the number of misclassified images and accuracy
print(f"Number of misclassified images: {num_misclassified}")
print(f"Total images in test set: {len(y_test)}")
print(f"Accuracy: {round((1 - num_misclassified / len(y_test)) * 100, 2)}%")
# Visualize some of the misclassified images
n_misclassified_to_show = min(num_misclassified, 5) # Show up to 5 misclassified i
plt.figure(figsize=(10, 5))
for i in range(n_misclassified_to_show):
    idx = misclassified_idx[i]
    plt.subplot(1, n_misclassified_to_show, i + 1)
    plt.imshow(x_test[idx].reshape(64, 64), cmap='gray')
    plt.title(f"True: {y_test[idx]}, Pred: {y_pred[idx]}")
    plt.axis('off')
plt.show()


In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
# Binarize the test labels against the classifier's own class list, so that
# column i of y_test_bin describes the same class as column i of predict_proba
# (whose columns follow nb.classes_). Using only the classes present in y_test
# would misalign the columns whenever a class is missing from the test split.
y_test_bin = label_binarize(y_test, classes=nb.classes_)
# Get predicted probabilities for each class
y_pred_prob = nb.predict_proba(x_test)
# Calculate and print one-vs-rest AUC for each class
for i, class_label in enumerate(nb.classes_):
    # AUC needs both positive and negative samples; roc_auc_score raises otherwise.
    if len(np.unique(y_test_bin[:, i])) < 2:
        print(f"Class {class_label} AUC: undefined (test set lacks positive or negative samples)")
        continue
    roc_auc = roc_auc_score(y_test_bin[:, i], y_pred_prob[:, i])
    print(f"Class {class_label} AUC: {roc_auc:.2f}")

# Prg 10

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
data = pd.read_csv(r"./WisconsinBreastCancerdataset.csv")

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.diagnosis.unique()

In [None]:
data.isnull().sum()

In [None]:
data.duplicated().sum()

In [None]:
df = data.drop(['id', 'Unnamed: 32'], axis=1)

In [None]:
df['diagnosis'] = df['diagnosis'].map({'M':1, 'B':0}) # Malignant:1, Benign:0

In [None]:
df.describe().T

In [None]:
# Drop the diagnosis (target) column since clustering is unsupervised.
# Reassigning with errors="ignore" keeps this cell re-runnable: an in-place
# drop raises KeyError on a second run because the column is already gone.
df = df.drop(columns=["diagnosis"], errors="ignore")

In [None]:
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

In [None]:
# Apply PCA for Dimensionality Reduction
pca = PCA(n_components=2) # Reduce to 2 dimensions for visualization
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Check explained variance ratio
explained_variance = pca.explained_variance_ratio_
total_explained_variance = np.sum(explained_variance)
print(f"Variance explained by PC1: {explained_variance[0]:.4f}")
print(f"Variance explained by PC2: {explained_variance[1]:.4f}")
print(f"Total variance explained by first 2 components: {total_explained_variance:.4f}")

In [None]:
# Elbow method: fit K-Means for k = 1..10 on the PCA projection and record each
# model's inertia (within-cluster sum of squares).
K_range = range(1, 11)
wcss = [
    KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X_pca).inertia_
    for n_clusters in K_range
]

In [None]:
# Plot the Elbow Method Graph
plt.figure(figsize=(8, 5))
plt.plot(K_range, wcss, marker="o", linestyle="-")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("WCSS")
plt.title("Elbow Method to Find Optimal k")
plt.show()

In [None]:
#Apply K-Means Clustering with the optimal k (usually where elbow occurs, k=2)
optimal_k = 2
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_pca)

In [None]:
# Step 7: Visualize the clusters and their centroids in the 2D PCA space
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap="viridis", alpha=0.6)
centroids = kmeans.cluster_centers_
ax.scatter(centroids[:, 0], centroids[:, 1], s=200, c='red', marker='X', label='Centroids')
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("K-Means Clustering after PCA")
ax.legend()
plt.show()