In [None]:
"""
Q1. 	Consider handwritten digits dataset full_mnist_digits.csv file	(5 Marks each)  		(25 marks)

    a. Design SVC, Random Forest Classifier, Logistic Regression classifier to classify digits. 
    b. Find accuracy_score for every model designed in option a.
    c. Display the classification report for every model and state which model is best.
    d. Display confusion matrix for all models.
    e. For Random Forest Classifier use values (5,10,20,25,40,50) for n_estimators and display which value is best suited to the digits data (use train_test_split).
Consider 70% data as training data set.
"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the handwritten-digits data from the CSV that sits next to the notebook
data = pd.read_csv('full_mnist_digits.csv')

# Separate features and target.
# NOTE(review): this assumes the label is the LAST column of the CSV - confirm
# against the file header before trusting the scores below.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split the dataset into training (70%) and testing (30%) sets; the fixed seed
# keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the two standalone classifiers. The Random Forest models are built
# inside the n_estimators sweep further down, so no throw-away default forest
# is instantiated here.
svc = SVC()
lr = LogisticRegression(max_iter=1000)

# Train and evaluate SVC and Logistic Regression through one shared routine so
# both models are reported in exactly the same way.
predictions = {}
for name, model in (("SVC", svc), ("Logistic Regression", lr)):
    model.fit(X_train, y_train)
    predictions[name] = model.predict(X_test)
    print(f"{name} Accuracy:", accuracy_score(y_test, predictions[name]))
    print(f"{name} Classification Report:\n", classification_report(y_test, predictions[name]))
    print(f"{name} Confusion Matrix:\n", confusion_matrix(y_test, predictions[name]))

# Keep the per-model prediction variables available under their usual names
svc_pred = predictions["SVC"]
lr_pred = predictions["Logistic Regression"]

# Train and evaluate Random Forest Classifier with different n_estimators
n_estimators = [5, 10, 20, 25, 40, 50]
best_accuracy = 0
best_n = 0

for n in n_estimators:
    rf = RandomForestClassifier(n_estimators=n, random_state=42)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    acc = accuracy_score(y_test, rf_pred)
    print(f"Random Forest Accuracy with n_estimators={n}: {acc}")
    # Strict '>' means that on a tie the smaller (cheaper) forest stays selected
    if acc > best_accuracy:
        best_accuracy = acc
        best_n = n

    print(f"Random Forest Classification Report (n_estimators={n}):\n", classification_report(y_test, rf_pred))
    print(f"Random Forest Confusion Matrix (n_estimators={n}):\n", confusion_matrix(y_test, rf_pred))

print(f"Best n_estimators for Random Forest: {best_n} with accuracy: {best_accuracy}")

In [None]:
"""
Q2. 	Use iris dataset and do the following			                                                                        (15 marks)
    f. Using elbow method find optimum number of clusters, to design k-Means clustering model.
    g. Build the model using k-means clustering (use the number of clusters found in part f).
    h. Find the silhouette_score to measure the accuracy of the k-means clustering model.

"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Load the Iris dataset (features only - clustering is unsupervised)
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)

# Step 1: Use the elbow method to find the optimal number of clusters.
# n_init is set explicitly: its default differs between scikit-learn releases
# (and warns on some), so pinning it keeps the inertia curve reproducible.
inertia = []
range_clusters = range(1, 11)

for k in range_clusters:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve: inertia (within-cluster sum of squares) against k
plt.figure(figsize=(8, 5))
plt.plot(range_clusters, inertia, marker='o')
plt.title('Elbow Method to Determine Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Step 2: Build the k-Means model using the optimal number of clusters.
# The value is read off the elbow plot above by eye, not computed.
optimal_clusters = 3  # Based on the elbow method
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Step 3: Evaluate the model using silhouette score
labels = kmeans.labels_
sil_score = silhouette_score(X, labels)
print(f"Silhouette Score for k-Means with {optimal_clusters} clusters: {sil_score}")

In [None]:
"""
Q1.  Consider diabetes.csv data set. (5 Marks each)  		                                                                        (25 marks)

    a. Build a classification model using support vector machine. Use standalone model, find score.
    b. Build Bagging model using SVM and check if you see any difference in the performance.
    c. Use decision tree classifier. Use standalone model.
    d. Build Bagging model using decision tree and check if you notice any difference in performance.
    e. For Random Forest Classifier use values (100,120,200,300) for n_estimators and display which value is best suited to the given data (use train_test_split).
Consider 70% data as training data set.

"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
data = pd.read_csv('diabetes.csv')

# Separate features and target.
# NOTE(review): assumes the label is the last column of diabetes.csv - confirm.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Task a: Standalone SVM
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print("Standalone SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("Standalone SVM Classification Report:\n", classification_report(y_test, svm_pred))

# Task b: Bagging with SVM.
# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases (and the old
# name was later removed), whereas the first positional slot works on both.
bagging_svm = BaggingClassifier(SVC(), n_estimators=10, random_state=42)
bagging_svm.fit(X_train, y_train)
bagging_svm_pred = bagging_svm.predict(X_test)
print("Bagging SVM Accuracy:", accuracy_score(y_test, bagging_svm_pred))
print("Bagging SVM Classification Report:\n", classification_report(y_test, bagging_svm_pred))

# Task c: Standalone Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print("Standalone Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("Standalone Decision Tree Classification Report:\n", classification_report(y_test, dt_pred))

# Task d: Bagging with Decision Tree (estimator passed positionally, see Task b)
bagging_dt = BaggingClassifier(DecisionTreeClassifier(), n_estimators=10, random_state=42)
bagging_dt.fit(X_train, y_train)
bagging_dt_pred = bagging_dt.predict(X_test)
print("Bagging Decision Tree Accuracy:", accuracy_score(y_test, bagging_dt_pred))
print("Bagging Decision Tree Classification Report:\n", classification_report(y_test, bagging_dt_pred))

# Task e: Random Forest with different n_estimators
n_estimators = [100, 120, 200, 300]
best_accuracy = 0
best_n = 0

for n in n_estimators:
    rf = RandomForestClassifier(n_estimators=n, random_state=42)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    acc = accuracy_score(y_test, rf_pred)
    print(f"Random Forest Accuracy with n_estimators={n}: {acc}")
    # Strict '>' keeps the smallest forest when several sizes tie
    if acc > best_accuracy:
        best_accuracy = acc
        best_n = n

    print(f"Random Forest Classification Report (n_estimators={n}):\n", classification_report(y_test, rf_pred))

print(f"Best n_estimators for Random Forest: {best_n} with accuracy: {best_accuracy}")

In [None]:
"""  
Q2. Consider handwritten digits dataset available in the scikit-learn library, and do the following	(15 marks)

    a.  Use PCA to reduce the dimensions (use 2 components).
    b.  Display the percentage of the explained variance and the variance (eigenvalues of the covariance matrix)
    c.  Apply Logistic Regression on the PCA component data and find accuracy score.
 

"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fetch scikit-learn's bundled digits data: feature matrix and class labels
digits = load_digits()
X, y = digits.data, digits.target

# Part a: project the features onto the first two principal components.
# NOTE(review): PCA is fitted on the full data set before the train/test split
# below, so the held-out rows also influence the projection.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Part b: share of variance captured per component (in percent) and the
# corresponding variances (eigenvalues of the covariance matrix)
explained_variance = 100 * pca.explained_variance_ratio_
eigenvalues = pca.explained_variance_
print("Percentage of Explained Variance by each component:", explained_variance)
print("Eigenvalues of the covariance matrix:", eigenvalues)

# Part c: hold out 30% of the projected samples, fit logistic regression on the
# remaining 70%, then score the held-out predictions
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.3,
    random_state=42,
)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Logistic Regression on PCA-transformed data:", accuracy)

In [None]:
"""   

Q1.
On Titanic Dataset:                                                                                                                                [10 Marks] 
a. Perform One-hot Encoding Technique on eligible column in dataset. 
b. Identify which column has null/missing data and replace null/missing data with mean.

"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Load the Titanic dataset
data = pd.read_csv('titanic.csv')  # Replace with the correct path to your Titanic dataset

# Task a: Perform One-Hot Encoding on eligible columns
# Identify categorical columns (every object-dtype column is treated as eligible)
categorical_columns = data.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_columns)

# Apply One-Hot Encoding.
# Dense output is requested through ColumnTransformer(sparse_threshold=0)
# instead of OneHotEncoder(sparse=False): the encoder's `sparse` keyword was
# renamed to `sparse_output` and later removed, while sparse_threshold=0
# ("always return dense") behaves the same on old and new scikit-learn.
column_transformer = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first'), categorical_columns)
    ],
    remainder='passthrough',
    sparse_threshold=0
)
data_encoded = column_transformer.fit_transform(data)
data_encoded = pd.DataFrame(data_encoded, columns=column_transformer.get_feature_names_out())
print("Data after One-Hot Encoding:\n", data_encoded.head())

# Task b: Identify columns with missing/null values and replace them with the mean
# First report which encoded columns actually contain nulls...
missing_columns = data_encoded.columns[data_encoded.isnull().any()]
print("Columns with Missing Values:", missing_columns)

# ...then impute. The imputer runs over the whole frame; columns without nulls
# pass through unchanged.
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data_encoded), columns=data_encoded.columns)
print("Data after replacing missing values:\n", data_imputed.head())

In [None]:
"""   
Q2. 
On Titanic Dataset:                                                                                            		              [20 Marks] 
a. Build a prediction model using Random Forest Algorithm to solve problem “what sorts of male people were more likely to survive”. 
b. Create a Confusion Matrix for the above solution. 
c. Tune the parameters for the existing solution and compare the results


"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Titanic dataset
data = pd.read_csv('titanic.csv')  # Replace with the correct path to your Titanic dataset

# Filter data for male passengers.
# .copy() makes the subset an independent frame so the column assignments
# below modify it directly rather than a view of the full data.
data = data[data['Sex'] == 'male'].copy()

# Perform preprocessing
# Handle missing values by assigning the filled column back. The previous
# `data['Age'].fillna(..., inplace=True)` form is chained assignment: it
# triggers SettingWithCopyWarning and leaves the NaNs in place when pandas
# Copy-on-Write is active.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

# One-hot encode categorical columns
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

# Separate features and target
# 'Sex' is constant after the filter, and the free-text columns are not used.
# NOTE(review): any identifier column in the CSV (e.g. a passenger id) is still
# part of X - consider dropping it if present.
X = data.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'Sex'], axis=1)  # Drop irrelevant columns
y = data['Survived']

# Split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Task a: Build a Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate the model
print("Accuracy of Random Forest Classifier:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Task b: Create a Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Task c: Tune the parameters using GridSearchCV
# 3 * 4 * 3 * 3 = 108 parameter combinations, each evaluated with 3-fold CV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and accuracy, evaluated on the same held-out split as the
# untuned model so the two sets of results are directly comparable
print("Best Parameters:", grid_search.best_params_)
best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test)
print("Accuracy of Tuned Random Forest Classifier:", accuracy_score(y_test, y_pred_tuned))
print("Classification Report (Tuned):\n", classification_report(y_test, y_pred_tuned))
print("Confusion Matrix (Tuned):\n", confusion_matrix(y_test, y_pred_tuned))

In [None]:
"""
Q3. 													 [10 Marks]

 On Iris dataset, perform a K-means clustering algorithm. Show the suitable number of centroids    
required.

"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Load the Iris dataset (features only - clustering is unsupervised)
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)

# Step 1: Use the elbow method to find the optimal number of clusters.
# n_init is passed explicitly because its default differs between scikit-learn
# releases; pinning it keeps the curve reproducible and avoids a FutureWarning.
inertia = []
range_clusters = range(1, 11)

for k in range_clusters:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Step 2: Plot the elbow curve (inertia against number of clusters)
plt.figure(figsize=(8, 5))
plt.plot(range_clusters, inertia, marker='o')
plt.title('Elbow Method to Determine Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Step 3: Perform K-means clustering with the optimal number of clusters.
# The value is chosen by reading the elbow plot above; it is not computed.
optimal_clusters = 3  # Based on the elbow method
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Display the cluster centers (one row per centroid, one column per feature)
print("Cluster Centers:\n", kmeans.cluster_centers_)

In [None]:
"""   
Q1.
 On Titanic Dataset:                                                                                                                            [10 Marks]  
a. Perform Binary Encoding Technique on eligible column in dataset. 
b. Identify which column has null/missing data and replace null/missing data with median.

"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load the Titanic dataset
data = pd.read_csv('titanic.csv')  # Replace with the correct path to your Titanic dataset

# Task a: Perform Binary Encoding on eligible columns
# Identify categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_columns)

# Task b (identification): record the columns with nulls BEFORE encoding.
# The encoding step casts values with astype(str), which turns NaN into the
# literal string 'nan'; checking afterwards would miss every categorical
# column that has missing entries.
missing_columns = data.columns[data.isnull().any()]
print("Columns with Missing Values:", missing_columns)

# Apply Binary Encoding using LabelEncoder
# NOTE(review): LabelEncoder maps each category to one integer code; it does not
# expand codes into per-bit columns. Confirm this matches the intended meaning
# of "binary encoding" for the assignment.
label_encoder = LabelEncoder()
for col in categorical_columns:
    # Missing entries become their own 'nan' category here
    data[col] = label_encoder.fit_transform(data[col].astype(str))

print("Data after Binary Encoding:\n", data.head())

# Task b (replacement): replace the remaining missing values with the median.
# After encoding only non-categorical columns can still hold NaN.
imputer = SimpleImputer(strategy='median')
columns_to_impute = data.columns[data.isnull().any()]
if len(columns_to_impute) > 0:  # the imputer rejects an empty column selection
    data[columns_to_impute] = imputer.fit_transform(data[columns_to_impute])

print("Data after replacing missing values:\n", data.head())

In [None]:
""""   
Q2
 On Titanic Dataset:                                                                                                                             [20 Marks] 
a. Build a prediction model using Random Forest Algorithm to solve problem “what sorts of female people were more likely to survive”. 
b. Create a Confusion Matrix for the above solution. 
c. Tune the parameters for the existing solution and compare the results.


"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Titanic dataset
data = pd.read_csv('titanic.csv')  # Replace with the correct path to your Titanic dataset

# Filter data for female passengers.
# .copy() detaches the subset from the full frame so that the column
# assignments below act on it directly.
data = data[data['Sex'] == 'female'].copy()

# Perform preprocessing
# Handle missing values. The filled column is assigned back explicitly:
# calling fillna(..., inplace=True) on `data['Age']` is chained assignment,
# which warns on a filtered frame and does nothing under pandas Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

# One-hot encode categorical columns
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

# Separate features and target
# 'Sex' carries no information after the filter; free-text columns are dropped.
# NOTE(review): an identifier column, if the CSV has one, remains in X - verify.
X = data.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'Sex'], axis=1)  # Drop irrelevant columns
y = data['Survived']

# Split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Task a: Build a Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate the model
print("Accuracy of Random Forest Classifier:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Task b: Create a Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Task c: Tune the parameters using GridSearchCV
# The grid spans 108 combinations; each one is scored with 3-fold CV accuracy
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and accuracy, measured on the same test split as the untuned
# forest so that the comparison is like-for-like
print("Best Parameters:", grid_search.best_params_)
best_rf = grid_search.best_estimator_
y_pred_tuned = best_rf.predict(X_test)
print("Accuracy of Tuned Random Forest Classifier:", accuracy_score(y_test, y_pred_tuned))
print("Classification Report (Tuned):\n", classification_report(y_test, y_pred_tuned))
print("Confusion Matrix (Tuned):\n", confusion_matrix(y_test, y_pred_tuned))

In [None]:
""""
Q3.                                                                                                                                                       [10 Marks]
 On Iris dataset, perform a K-means clustering algorithm. Show the suitable number of centroids  
 required.

"""

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Load the Iris dataset; only the measurements are used, not the species labels
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)

# Step 1: Use the elbow method to find the optimal number of clusters.
# n_init is fixed at 10 so the number of restarts does not depend on which
# scikit-learn release is installed (the default is version-dependent).
inertia = []
range_clusters = range(1, 11)

for k in range_clusters:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Step 2: Plot the elbow curve
plt.figure(figsize=(8, 5))
plt.plot(range_clusters, inertia, marker='o')
plt.title('Elbow Method to Determine Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Step 3: Perform K-means clustering with the optimal number of clusters.
# The count below is a manual reading of the plot, not a computed result.
optimal_clusters = 3  # Based on the elbow method
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Display the cluster centers (the fitted centroids in feature space)
print("Cluster Centers:\n", kmeans.cluster_centers_)