In [None]:
import pandas
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
import matplotlib.pyplot as plt

# Load the dataset
df = pandas.read_csv("TempData.csv")


def encode_labels(column, codes):
    """Replace each categorical label in `column` with its numeric code.

    Parameters
    ----------
    column : pandas.Series
        Column holding the categorical labels.
    codes : dict
        Mapping from label to numeric code.

    Returns
    -------
    pandas.Series
        The encoded column. Entries that were already missing stay missing.

    Raises
    ------
    ValueError
        If `column` contains a label that is not a key of `codes`.
    """
    encoded = column.map(codes)
    # Series.map silently yields NaN for labels absent from the mapping, so
    # compare against the original column to catch anything that was lost.
    unknown = column[column.notna() & encoded.isna()].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Column {column.name!r} has labels without a numeric code: {list(unknown)}"
        )
    return encoded


# Map categorical data in the 'Nationality' column to numerical values
nationality_codes = {'UK': 0, 'USA': 1, 'N': 2}
df['Nationality'] = encode_labels(df['Nationality'], nationality_codes)

# Map categorical data in the 'Go' column to numerical values
go_codes = {'YES': 1, 'NO': 0}
df['Go'] = encode_labels(df['Go'], go_codes)

# Define the features (input variables) and target variable
features = ['Age', 'Experience', 'Rank', 'Nationality']  # Columns to be used as features
X = df[features]  # Input features
y = df['Go']  # Target variable

# Initialize the decision tree classifier.
# random_state is pinned because scikit-learn shuffles the candidate features at
# every split, so equally good splits can otherwise produce a different tree
# (and a different plot) on each run. 42 matches the seed used by the other
# models in this notebook.
dtree = DecisionTreeClassifier(random_state=42)

# Fit the decision tree classifier on the data.
# fit() trains the estimator in place, so its return value need not be reassigned.
dtree.fit(X, y)

# Rendering options for the tree diagram
tree_plot_options = {
    "feature_names": features,     # label each split with its column name
    "class_names": ['No', 'Yes'],  # class labels, in ascending order of the encoded target
    "filled": True,                # colour each node by its majority class
    "rounded": True,               # draw boxes with rounded corners
    "fontsize": 10,                # text size inside the boxes
}

# Draw the fitted tree on a 12 x 8 inch figure
plt.figure(figsize=(12, 8))
plot_tree(dtree, **tree_plot_options)
plt.show()

# Also emit the same tree as indented text rules
tree_rules = export_text(dtree, feature_names=features)
print(tree_rules)




In [None]:
# Read the poker-hand training set
df = pandas.read_csv("poker-hand-training-true.csv")

# Input columns (one S/R pair per line) and the label column
features = [
    "S1", "R1",
    "S2", "R2",
    "S3", "R3",
    "S4", "R4",
    "S5", "R5",
]
X_train, y_train = df[features], df["Class"]

# Grow an unconstrained tree with a fixed seed; fit() returns the fitted estimator
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Report how large the fitted tree became
print(f"Tree Depth: {model.get_depth()}")
print(f"Number of Leaves: {model.get_n_leaves()}")

## Decision Tree Training Results

### Tree Structure
- **Tree Depth**: 30  
  - The depth of the tree is the longest path from the root node to a leaf node.  
  - A depth of 30 is very deep for a dataset with only 10 features: the longest paths must split on the same features several times, creating a highly detailed set of rules for classification.  
  - While this might improve training accuracy, it can lead to **overfitting**, where the model memorizes the training data instead of generalizing patterns.

- **Number of Leaves**: 9196  
  - Leaves are the terminal nodes in the tree where predictions are made.  
  - The tree has 9196 leaves, which suggests it has created a large number of distinct rules to classify the data.  
  - While this level of complexity is expected for a dataset like Poker Hands (due to the many combinations of suits and ranks), it could also indicate **overfitting**.

### Addressing Overfitting
To prevent overfitting and improve the model's generalization ability:
1. **Limit the Depth of the Tree**:
   - Restrict the maximum depth of the tree (`max_depth`) to control its complexity.

2. **Require a Minimum Number of Samples per Leaf**:
   - Set a minimum number of samples required for a leaf node (`min_samples_leaf`), so that no rule is built on just a handful of examples.

3. **Prune the Tree**:
   - Use cost-complexity pruning (`ccp_alpha`) to remove overly specific splits after the tree has been grown.


In [None]:
# Regularized tree: growth is capped by depth and by minimum leaf size
model = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,         # at most 10 levels of splits
    min_samples_leaf=10,  # every leaf must hold at least 10 training samples
).fit(X_train, y_train)

# Report the size of the constrained tree
print(f"Tree Depth: {model.get_depth()}")
print(f"Number of Leaves: {model.get_n_leaves()}")