### 1

In [117]:
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# Load datasets
iris = load_iris()
wine = load_wine()
cancer = load_breast_cancer()

datasets = {
    'Iris': iris,
    'Wine': wine,
    'Breast Cancer': cancer
}


### 2

In [119]:
# Models
models = {
    'kNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB(),
    'ID3': DecisionTreeClassifier(criterion='entropy'),
    'CART': DecisionTreeClassifier(criterion='gini')
}

# Train-test split and evaluation
results = {}
for dataset_name, dataset in datasets.items():
    X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.3, random_state=42)
    
    dataset_results = {}
    for model_name, model in models.items():
        # Train and predict
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Accuracy on the test set
        accuracy = accuracy_score(y_test, y_pred)
        
        # 10-fold cross-validation accuracy with random seed
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, dataset.data, dataset.target, cv=skf)
        cv_accuracy = cv_scores.mean()
        
        dataset_results[model_name] = {
            'Test Accuracy': accuracy,
            'Cross-Validation Accuracy': cv_accuracy
        }
    
    results[dataset_name] = dataset_results

# Display results
import pandas as pd
for dataset_name, dataset_results in results.items():
    print(f"\nResults for {dataset_name} dataset:")
    df = pd.DataFrame(dataset_results).T
    print(df)



Results for Iris dataset:
             Test Accuracy  Cross-Validation Accuracy
kNN               1.000000                   0.953333
SVM               1.000000                   0.960000
Naive Bayes       0.977778                   0.953333
ID3               0.977778                   0.940000
CART              1.000000                   0.933333

Results for Wine dataset:
             Test Accuracy  Cross-Validation Accuracy
kNN               0.740741                   0.685621
SVM               0.759259                   0.674510
Naive Bayes       1.000000                   0.977778
ID3               0.833333                   0.927124
CART              0.944444                   0.877451

Results for Breast Cancer dataset:
             Test Accuracy  Cross-Validation Accuracy
kNN               0.959064                   0.935025
SVM               0.935673                   0.915633
Naive Bayes       0.941520                   0.936811
ID3               0.964912                   0

In [121]:
from imblearn.over_sampling import SMOTE
import numpy as np

# Augment data with the correct sampling_strategy for multi-class
def augment_dataset(X, y, factor):
    class_counts = np.bincount(y)  # Count the number of instances per class
    smote = SMOTE(sampling_strategy={i: class_counts[i] * factor for i in range(len(class_counts))})
    X_augmented, y_augmented = smote.fit_resample(X, y)
    return X_augmented, y_augmented

# Create datasets 1 to 5 as described in the task
def build_augmented_datasets(X_original, y_original):
    datasets = [(X_original, y_original)]  # Dataset 1 is the original data
    
    for i in range(1, 5):  # Datasets 2 to 5
        X_augmented, y_augmented = augment_dataset(X_original, y_original, i)
        X_combined = np.vstack([X_original, X_augmented])
        y_combined = np.hstack([y_original, y_augmented])
        datasets.append((X_combined, y_combined))
    
    return datasets

# Augment datasets for the Iris and Wine datasets
iris_datasets = build_augmented_datasets(iris.data, iris.target)
wine_datasets = build_augmented_datasets(wine.data, wine.target)

# Print details of the augmented datasets for verification
for i, (X, y) in enumerate(iris_datasets):
    print(f"Iris Dataset {i+1}: Size = {X.shape[0]} samples")

for i, (X, y) in enumerate(wine_datasets):
    print(f"Wine Dataset {i+1}: Size = {X.shape[0]} samples")




Iris Dataset 1: Size = 150 samples
Iris Dataset 2: Size = 300 samples
Iris Dataset 3: Size = 450 samples
Iris Dataset 4: Size = 600 samples
Iris Dataset 5: Size = 750 samples
Wine Dataset 1: Size = 178 samples
Wine Dataset 2: Size = 356 samples
Wine Dataset 3: Size = 534 samples
Wine Dataset 4: Size = 712 samples
Wine Dataset 5: Size = 890 samples


### 4

In [125]:
import time
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Initialize models
models = {
    'XGBoost': XGBClassifier(eval_metric='mlogloss'),  
    'CATBoost': CatBoostClassifier(verbose=0),
    'LightGBM': LGBMClassifier(min_gain_to_split=0.01, min_data_in_leaf=20, verbosity=-1)  # Adjusted LightGBM parameters
}

# Measure time and accuracy
def measure_time_and_accuracy(model, X_train, y_train, X_test, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return training_time, accuracy

# Function to evaluate models on datasets and return results in a dataframe
def evaluate_models_on_datasets(datasets, dataset_name):
    results = []
    for i, (X, y) in enumerate(datasets):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        
        for model_name, model in models.items():
            training_time, accuracy = measure_time_and_accuracy(model, X_train, y_train, X_test, y_test)
            results.append({
                'Dataset': f'{dataset_name} Dataset {i+1}',
                'Model': model_name,
                'Training Time (s)': training_time,
                'Accuracy': accuracy
            })
    
    return pd.DataFrame(results)

# Evaluate models on Iris datasets
iris_results = evaluate_models_on_datasets(iris_datasets, 'Iris')

# Evaluate models on Wine datasets
wine_results = evaluate_models_on_datasets(wine_datasets, 'Wine')

# Display results for Iris and Wine datasets
print("\nIris Results:")
print(iris_results)

print("\nWine Results:")
print(wine_results)



Iris Results:
           Dataset     Model  Training Time (s)  Accuracy
0   Iris Dataset 1   XGBoost           0.175634  1.000000
1   Iris Dataset 1  CATBoost           0.809847  1.000000
2   Iris Dataset 1  LightGBM           0.023686  1.000000
3   Iris Dataset 2   XGBoost           0.085033  1.000000
4   Iris Dataset 2  CATBoost           0.731146  1.000000
5   Iris Dataset 2  LightGBM           0.031238  1.000000
6   Iris Dataset 3   XGBoost           0.062896  0.962963
7   Iris Dataset 3  CATBoost           1.331803  0.962963
8   Iris Dataset 3  LightGBM           0.023118  0.962963
9   Iris Dataset 4   XGBoost           0.074502  1.000000
10  Iris Dataset 4  CATBoost           1.872653  1.000000
11  Iris Dataset 4  LightGBM           0.032295  1.000000
12  Iris Dataset 5   XGBoost           0.078686  0.964444
13  Iris Dataset 5  CATBoost           2.200534  0.968889
14  Iris Dataset 5  LightGBM           0.033924  0.968889

Wine Results:
           Dataset     Model  Training Tim