In [None]:
# 1

# Data Preprocessing
# Handling Missing Values
# 
# 1
# Objective: Learn how to handle missing values in a dataset.
# Techniques: Mean/median imputation, dropping missing values.
# Encoding Categorical Data


# Mean imputation
df['column_name'].fillna(df['column_name'].mean(), inplace=True)

# Median imputation
df['column_name'].fillna(df['column_name'].median(), inplace=True)

# Mode imputation
df['column_name'].fillna(df['column_name'].mode()[0], inplace=True)

# Drop rows with any missing values
df.dropna(axis=0, inplace=True)

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['encoded_column'] = label_encoder.fit_transform(df['original_column'])

# Encoding categorical data
import pandas as pd
df = pd.get_dummies(df, columns=['categorical_column'])

# categorical data
ordinal_mapping = {'low': 1, 'medium': 2, 'high': 3}
df['ordinal_column'] = df['ordinal_column'].map(ordinal_mapping)



In [1]:
# 2

# Objective: Learn how to encode categorical data.
# Techniques: Label encoding, one-hot encoding.
# Feature Scaling


# Encoding Categorical data
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Example DataFrame
data = {'category': ['A', 'B', 'A', 'C']}
df = pd.DataFrame(data)

# Apply Label Encoding
label_encoder = LabelEncoder()
df['encoded_category'] = label_encoder.fit_transform(df['category'])

print(df)

  category  encoded_category
0        A                 0
1        B                 1
2        A                 0
3        C                 2


In [2]:
# One-Hot encoding
# Example DataFrame
data = {'category': ['A', 'B', 'A', 'C']}
df = pd.DataFrame(data)

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=['category'])

print(df_encoded)


   category_A  category_B  category_C
0        True       False       False
1       False        True       False
2        True       False       False
3       False       False        True


In [3]:
# Feature scaling (standard scaling)
from sklearn.preprocessing import StandardScaler

# Example DataFrame with numerical features
data = {'feature1': [10, 20, 30, 40], 'feature2': [1, 2, 3, 4]}
df = pd.DataFrame(data)

# Apply Standardization
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

print(df_scaled)


   feature1  feature2
0 -1.341641 -1.341641
1 -0.447214 -0.447214
2  0.447214  0.447214
3  1.341641  1.341641


In [4]:
# 3
# Objective: Learn how to scale features in a dataset.
# Techniques: Standardization, normalization.
# Data Normalization


# Feature Scaling Techniques (standard scaler)
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Example DataFrame with numerical features
data = {'feature1': [10, 20, 30, 40], 'feature2': [1, 2, 3, 4]}
df = pd.DataFrame(data)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data
scaled_data = scaler.fit_transform(df)

# Convert the scaled data back to a DataFrame
df_scaled = pd.DataFrame(scaled_data, columns=df.columns)

print("Original Data:")
print(df)
print("\nScaled Data (Standardization):")
print(df_scaled)


Original Data:
   feature1  feature2
0        10         1
1        20         2
2        30         3
3        40         4

Scaled Data (Standardization):
   feature1  feature2
0 -1.341641 -1.341641
1 -0.447214 -0.447214
2  0.447214  0.447214
3  1.341641  1.341641


In [5]:
# Normalization - Min Max

from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the data
normalized_data = scaler.fit_transform(df)

# Convert the normalized data back to a DataFrame
df_normalized = pd.DataFrame(normalized_data, columns=df.columns)

print("Original Data:")
print(df)
print("\nNormalized Data:")
print(df_normalized)


Original Data:
   feature1  feature2
0        10         1
1        20         2
2        30         3
3        40         4

Normalized Data:
   feature1  feature2
0  0.000000  0.000000
1  0.333333  0.333333
2  0.666667  0.666667
3  1.000000  1.000000


In [6]:
# 4

# Objective: Learn how to normalize data.
# Techniques: Min-Max scaling, z-score normalization.
# Handling Outliers

# Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Example DataFrame with numerical features
data = {'feature1': [10, 20, 30, 40, 1000], 'feature2': [1, 2, 3, 4, 500]}
df = pd.DataFrame(data)

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the data
normalized_data = scaler.fit_transform(df)

# Convert the normalized data back to a DataFrame
df_normalized = pd.DataFrame(normalized_data, columns=df.columns)

print("Original Data:")
print(df)
print("\nNormalized Data (Min-Max Scaling):")
print(df_normalized)


Original Data:
   feature1  feature2
0        10         1
1        20         2
2        30         3
3        40         4
4      1000       500

Normalized Data (Min-Max Scaling):
   feature1  feature2
0  0.000000  0.000000
1  0.010101  0.002004
2  0.020202  0.004008
3  0.030303  0.006012
4  1.000000  1.000000


In [7]:
# Standard Scaler

from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the data
normalized_data = scaler.fit_transform(df)

# Convert the normalized data back to a DataFrame
df_normalized = pd.DataFrame(normalized_data, columns=df.columns)

print("Original Data:")
print(df)
print("\nNormalized Data (Z-Score Normalization):")
print(df_normalized)


Original Data:
   feature1  feature2
0        10         1
1        20         2
2        30         3
3        40         4
4      1000       500

Normalized Data (Z-Score Normalization):
   feature1  feature2
0 -0.538285 -0.507531
1 -0.512652 -0.502506
2 -0.487019 -0.497481
3 -0.461387 -0.492456
4  1.999343  1.999975


In [8]:
# 5

# Objective: Learn how to detect and handle outliers in a dataset.
# Techniques: IQR method, z-score method.
# TF-IDF
# Text Vectorization using TF-IDF

# IQR

import pandas as pd

# Example DataFrame
data = {'values': [10, 20, 30, 40, 1000]}
df = pd.DataFrame(data)

# Calculate IQR
Q1 = df['values'].quantile(0.25)
Q3 = df['values'].quantile(0.75)
IQR = Q3 - Q1

# Trim outliers
df['values'] = df['values'].apply(lambda x: Q3 if x > (Q3 + 1.5 * IQR) else Q1 if x < (Q1 - 1.5 * IQR) else x)

print("Original Data:")
print(df)


Original Data:
   values
0    10.0
1    20.0
2    30.0
3    40.0
4    40.0


In [9]:
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Optional: Get feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame (for better visualization)
import pandas as pd
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print("TF-IDF Vectorized Data:")
print(df_tfidf)


TF-IDF Vectorized Data:
        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


In [10]:
# 6

# Objective: Learn how to use TF-IDF for text vectorization.
# Techniques: TF-IDF transformation, feature extraction.
# Cosine Similarity using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Example documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Optional: Get feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame (for better visualization)
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print("TF-IDF Vectorized Data:")
print(df_tfidf)


TF-IDF Vectorized Data:
        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


In [None]:
# Compute cosine similarity matrix
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Convert cosine similarities to a DataFrame (for better visualization)
df_cosine = pd.DataFrame(cosine_similarities, columns=range(1, len(documents) + 1), index=range(1, len(documents) + 1))

print("\nCosine Similarity Matrix:")
print(df_cosine)


In [11]:
# 7

# Objective: Learn how to use TF-IDF to calculate cosine similarity between documents.
# Techniques: TF-IDF vectorization, cosine similarity computation.
# Naive Bayes Algorithm
# Text Classification using Naive Bayes

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Example documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Optional: Get feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame (for better visualization)
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

print("TF-IDF Vectorized Data:")
print(df_tfidf)


TF-IDF Vectorized Data:
        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


In [12]:
# Compute cosine similarity matrix
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Convert cosine similarities to a DataFrame (for better visualization)
df_cosine = pd.DataFrame(cosine_similarities, columns=range(1, len(documents) + 1), index=range(1, len(documents) + 1))

print("\nCosine Similarity Matrix:")
print(df_cosine)



Cosine Similarity Matrix:
          1         2         3         4
1  1.000000  0.646926  0.307772  1.000000
2  0.646926  1.000000  0.225240  0.646926
3  0.307772  0.225240  1.000000  0.307772
4  1.000000  0.646926  0.307772  1.000000


In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Example dataset for text classification
texts = ["good movie", "not good movie", "very bad movie", "good movie indeed", "bad movie"]
labels = [1, 1, 0, 1, 0]  # 1 for positive sentiment, 0 for negative sentiment

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the texts
X = vectorizer.fit_transform(texts)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Initialize Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [14]:
# 8
# Objective: Learn how to classify text using Naive Bayes.
# Techniques: TF-IDF vectorization, Multinomial Naive Bayes, model evaluation.
# Spam Detection using Naive Bayes

# Naive Bayes
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Example data (replace with your actual data)
data = {
    'text': [
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        "Even my brother is not like to speak with me. They treat me like aids patent.",
        "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
        "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.",
        "Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"
    ],
    'label': [1, 0, 0, 1, 1]  # 1 for spam, 0 for not spam
}

# Create DataFrame
df = pd.DataFrame(data)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# TF-IDF Vectorization
# # Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)

# Multinomial Naive Bayes Classifier
# Initialize the Multinomial Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the classifier
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy Score: 0.0

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# 9
# Objective: Learn how to use Naive Bayes for spam detection.
# Techniques: TF-IDF vectorization, Multinomial Naive Bayes, model evaluation.
# ID3 Algorithm
# Decision Tree Classifier using ID3 Algorithm

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Example data (replace with your actual data)
data = {
    'text': [
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        "Even my brother is not like to speak with me. They treat me like aids patent.",
        "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
        "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.",
        "Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"
    ],
    'label': [1, 0, 0, 1, 1]  # 1 for spam, 0 for not spam
}

# Create DataFrame
df = pd.DataFrame(data)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)


# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)



# Initialize the Multinomial Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the classifier
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy Score: 0.0

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load example dataset (replace with your actual data)
iris = load_iris()
X = iris.data
y = iris.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree classifier
classifier = DecisionTreeClassifier(criterion="entropy")  # ID3 uses entropy for information gain

# Train the classifier
classifier.fit(X_train, y_train)

# Predict on the test data
y_pred = classifier.predict(X_test)

# Evaluate the classifier
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))


Accuracy Score: 1.0

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [17]:
# 10
# Objective: Learn how to implement the ID3 algorithm for classification.
# Techniques: Decision tree training, entropy calculation, model evaluation.

import numpy as np

class ID3DecisionTree:
    def __init__(self):
        self.tree = None
    
    def entropy(self, y):
        unique_classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy_value = -np.sum(probabilities * np.log2(probabilities))
        return entropy_value
    
    def information_gain(self, X, y, feature_index):
        entropy_before_split = self.entropy(y)
        unique_values = np.unique(X[:, feature_index])
        weighted_entropy_after_split = 0
        
        for value in unique_values:
            subset_indices = np.where(X[:, feature_index] == value)[0]
            subset_entropy = self.entropy(y[subset_indices])
            weighted_entropy_after_split += (len(subset_indices) / len(y)) * subset_entropy
        
        information_gain_value = entropy_before_split - weighted_entropy_after_split
        return information_gain_value
    
    def fit(self, X, y):
        self.tree = self._grow_tree(X, y)
    
    def _grow_tree(self, X, y):
        if len(np.unique(y)) == 1:
            return np.unique(y)[0]
        
        if X.shape[1] == 0:
            return np.bincount(y).argmax()
        
        best_feature = np.argmax([self.information_gain(X, y, i) for i in range(X.shape[1])])
        tree = {best_feature: {}}
        
        unique_values = np.unique(X[:, best_feature])
        for value in unique_values:
            subset_indices = np.where(X[:, best_feature] == value)[0]
            subtree = self._grow_tree(X[subset_indices], y[subset_indices])
            tree[best_feature][value] = subtree
        
        return tree
    
    def predict(self, X):
        return np.array([self._predict_single(x, self.tree) for x in X])
    
    def _predict_single(self, x, tree):
        feature = list(tree.keys())[0]
        value = x[feature]
        subtree = tree[feature][value]
        
        if isinstance(subtree, dict):
            return self._predict_single(x, subtree)
        else:
            return subtree

# Example usage:
# Assume X_train, y_train are your training features and labels
# Assume X_test is your test features

# Create an instance of ID3DecisionTree
tree_classifier = ID3DecisionTree()

# Fit the model on training data
tree_classifier.fit(X_train, y_train)

# Predict on test data
y_pred = tree_classifier.predict(X_test)

# Evaluate the model
accuracy = np.mean(y_pred == y_test)
print("Accuracy:", accuracy)


IndexError: tuple index out of range

In [20]:
import numpy as np
import pandas as pd
from collections import Counter
from math import log2

def entropy(s):
    counts = Counter(s)
    probabilities = [count / len(s) for count in counts.values()]
    return -sum(p * log2(p) for p in probabilities if p > 0)

def information_gain(y, x):
    total_entropy = entropy(y)
    values, counts = np.unique(x, return_counts=True)
    weighted_entropy = sum((counts[i] / len(y)) * entropy(y[x == values[i]]) for i in range(len(values)))
    return total_entropy - weighted_entropy

def id3(X, y, attribute_names, depth=0, max_depth=None):
    # If all the target values are the same, return that value
    if len(np.unique(y)) == 1:
        return y[0]

    # If no features are left to split on, return the most common target value
    if len(attribute_names) == 0 or (max_depth is not None and depth >= max_depth):
        return Counter(y).most_common(1)[0][0]

    # Calculate information gains for all attributes
    gains = [information_gain(y, X[:, i]) for i in range(X.shape[1])]
    
    # Check if gains is empty (no valid attributes left)
    if not any(gains):
        return Counter(y).most_common(1)[0][0]

    # Choose the attribute with the highest information gain
    best_attr = np.argmax(gains)
    best_attr_name = attribute_names[best_attr]

    # Create the tree root with the best attribute
    tree = {best_attr_name: {}}
    
    # Recursively create the tree for each value of the best attribute
    values = np.unique(X[:, best_attr])
    for value in values:
        X_subset = X[X[:, best_attr] == value]
        y_subset = y[X[:, best_attr] == value]
        
        subtree = id3(X_subset, y_subset,
                      [attr for i, attr in enumerate(attribute_names) if i != best_attr],
                      depth + 1, max_depth)
        
        tree[best_attr_name][value] = subtree

    return tree

def predict(tree, sample):
    if not isinstance(tree, dict):
        return tree
    
    attr = next(iter(tree))
    value = sample.get(attr, None)  # Get the value from the sample or None if not found
    if value is None or value not in tree[attr]:
        # Handle missing attribute value by returning a default prediction
        return Counter(y).most_common(1)[0][0]
    
    subtree = tree[attr][value]
    return predict(subtree, sample)

# Sample dataset
data = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool', 'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

df = pd.DataFrame(data)
X = df.drop(columns='PlayTennis').values
y = df['PlayTennis'].values
attribute_names = df.drop(columns='PlayTennis').columns.tolist()

# Train the ID3 decision tree
tree = id3(X, y, attribute_names)

# Predict a new sample
new_sample = {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}
prediction = predict(tree, new_sample)
print(f"Prediction for {new_sample}: {prediction}")

# Print the tree
import pprint
pprint.pprint(tree)


IndexError: list index out of range