In [69]:
import numpy as np
import pandas as pd
import os
import sys
import time

In [70]:
import sklearn.tree
import sklearn.linear_model
import sklearn.metrics
import sklearn.ensemble
from sklearn.model_selection import PredefinedSplit, GridSearchCV


In [71]:

from pretty_print_sklearn_tree import pretty_print_sklearn_tree

In [72]:
# Plotting utils
import matplotlib
import matplotlib.pyplot as plt

plt.style.use('seaborn-v0_8') # pretty matplotlib plots

import seaborn as sns
sns.set('notebook', font_scale=1.25, style='whitegrid')

# Load all data from train/valid/test

In [73]:
folder = "data_product_reviews"
files = os.listdir(folder)
for file in files:
    print(file)

x_valid.csv
y_test.csv
x_train.csv
y_train.csv
y_valid.csv
x_test.csv


### Load training

In [74]:
###loading the dfs
df_trainX = pd.read_csv(os.path.join(folder,"x_train.csv"))
df_trainY = pd.read_csv(os.path.join(folder,"y_train.csv"))

### Load validation set

In [75]:
df_valX = pd.read_csv(os.path.join(folder,"x_valid.csv"))
df_valY = pd.read_csv(os.path.join(folder,"y_valid.csv"))

### Load test set 

In [76]:
df_testX = pd.read_csv(os.path.join(folder,"x_test.csv"))
df_testY = pd.read_csv(os.path.join(folder,"y_test.csv"))

In [77]:
print(df_trainX.columns)

Index(['good', 'great', 'time', 'book', 'don't', 'work', 'i_have', 'read',
       'make', 'if_you',
       ...
       'everything_i', 'first_two', 'never_get', 'i'd_like', 'loves_it',
       'an_author', 'nomin', 'could_give', 'bad_but', 'gap'],
      dtype='object', length=7729)


### Load vocabulary as a list of strings

In [78]:
vocab_list = df_trainX.columns.tolist()

### Pack training and validation sets into big arrays (so we can use sklearn's hyperparameter search tools)

In [79]:
# Convert DataFrames to numpy arrays for sklearn
x_tr_NF = df_trainX.values
x_val_NF = df_valX.values
x_te_NF = df_testX.values

y_tr_N = df_trainY.values.ravel()  # Flatten to 1D array
y_val_N = df_valY.values.ravel()
y_te_N = df_testY.values.ravel()

print(f"Train X shape: {x_tr_NF.shape}, y: {y_tr_N.shape}")
print(f"Valid X shape: {x_val_NF.shape}, y: {y_val_N.shape}")
print(f"Test X shape: {x_te_NF.shape}, y: {y_te_N.shape}")

Train X shape: (6346, 7729), y: (6346,)
Valid X shape: (792, 7729), y: (792,)
Test X shape: (793, 7729), y: (793,)


# Problem 1: Decision Trees

## 1A: Train a simple tree with depth 3

In [None]:
criterion = 'gini'
max_depth = 3
min_samples_leaf = 1
min_samples_split = 2
random_state = 101
###lets use sklearn first then may be tthis regres
tree = sklearn.tree.DecisionTreeClassifier(criterion=criterion,
                                            max_depth=max_depth,
                                            min_samples_leaf=min_samples_leaf,
                                            min_samples_split=min_samples_split,
                                            random_state=random_state)

### **Fit the tree** 

**TODO Train on the training set** in the next coding cell

In [None]:
tree.fit(x_tr_NF, y_tr_N)

###print the value on each bacc


### **Figure 1: Print Tree** 

Use a helper function from the starter code

In [82]:
pretty_print_sklearn_tree(tree, feature_names=vocab_list)

The binary tree structure has 15 nodes.
- depth   0 has    1 nodes, of which    0 are leaves
- depth   1 has    2 nodes, of which    0 are leaves
- depth   2 has    4 nodes, of which    0 are leaves
- depth   3 has    8 nodes, of which    8 are leaves
The decision tree:  (Note: Y = 'yes' to above question; N = 'no')
Decision: X['great'] <= 0.50?
  Y Decision: X['excel'] <= 0.50?
    Y Decision: X['disappoint'] <= 0.50?
      Y Leaf: p(y=1 | this leaf) = 0.430 (1 total training examples)
      N Leaf: p(y=1 | this leaf) = 0.114 (1 total training examples)
    N Decision: X['disappoint'] <= 0.50?
      Y Leaf: p(y=1 | this leaf) = 0.903 (1 total training examples)
      N Leaf: p(y=1 | this leaf) = 0.429 (1 total training examples)
  N Decision: X['return'] <= 0.50?
    Y Decision: X['bad'] <= 0.50?
      Y Leaf: p(y=1 | this leaf) = 0.745 (1 total training examples)
      N Leaf: p(y=1 | this leaf) = 0.415 (1 total training examples)
    N Decision: X['movie'] <= 0.50?
      Y Leaf: p(y

Yes there is a node having two children with same sentiment class that is `Decision: X['disappoint'] <= 0.50` which is under branch `Y Decision: X['excel'] <= 0.50` both its Y and N leaves are having the sentiment class 0. 

Now the reason that it makes sense is because decision trees split the data based on the feature that provides the highest information gain at each node. In this case, the feature 'disappoint' might not provide significant additional information to differentiate between positive and negative sentiments beyond what has already been captured by the previous splits in the tree.

## 1B : Find best Decision Tree with grid search

In [83]:
max_depth = [2,8,32,128]
min_samples_leaf = [1,3,9]
random_state = [101]
p_grid = {
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'random_state': random_state
}
### to do grid seaarch
base_tree = sklearn.tree.DecisionTreeClassifier(criterion='gini',random_state=101)

In [84]:
X_trainval = np.concatenate([x_tr_NF, x_val_NF], axis=0)
y_trainval = np.concatenate([y_tr_N, y_val_N], axis=0)
test_fold = np.concatenate([
    -1 * np.ones_like(y_tr_N),
    np.zeros_like(y_val_N)
])
my_splitter = PredefinedSplit(test_fold)


### Build the best decision tree

**TODO Build the Best Tree on the training set** in the next coding cell



In [85]:
grid = GridSearchCV(
    estimator=base_tree,
    param_grid=p_grid,
    scoring='balanced_accuracy',
    cv=my_splitter,
    return_train_score=True,
    refit=False
)

grid.fit(X_trainval, y_trainval)
best_params = grid.best_params_
print("Best parameters found:", best_params)

# --- Create best tree and retrain on full training data (train + val if needed) ---
best_tree = base_tree.set_params(**best_params)
best_tree.fit(np.concatenate([x_tr_NF, x_val_NF]),
              np.concatenate([y_tr_N, y_val_N]))

# --- (Optional) Evaluate on test set ---


Best parameters found: {'max_depth': 32, 'min_samples_leaf': 3, 'random_state': 101}


In [86]:
max_depth = best_tree.get_params()['max_depth']
min_samples_leaf = best_tree.get_params()['min_samples_leaf']
print(f"Best Tree - max_depth: {max_depth}, min_samples_leaf: {min_samples_leaf}")

Best Tree - max_depth: 32, min_samples_leaf: 3


### Interpret the best decision tree

In [87]:
pretty_print_sklearn_tree(best_tree, feature_names=vocab_list)

The binary tree structure has 853 nodes.
- depth   0 has    1 nodes, of which    0 are leaves
- depth   1 has    2 nodes, of which    0 are leaves
- depth   2 has    4 nodes, of which    0 are leaves
- depth   3 has    8 nodes, of which    0 are leaves
- depth   4 has   16 nodes, of which    5 are leaves
- depth   5 has   22 nodes, of which    8 are leaves
- depth   6 has   28 nodes, of which   14 are leaves
- depth   7 has   28 nodes, of which   12 are leaves
- depth   8 has   32 nodes, of which   13 are leaves
- depth   9 has   38 nodes, of which   17 are leaves
- depth  10 has   42 nodes, of which   21 are leaves
- depth  11 has   42 nodes, of which   19 are leaves
- depth  12 has   46 nodes, of which   24 are leaves
- depth  13 has   44 nodes, of which   22 are leaves
- depth  14 has   44 nodes, of which   22 are leaves
- depth  15 has   44 nodes, of which   25 are leaves
- depth  16 has   38 nodes, of which   20 are leaves
- depth  17 has   36 nodes, of which   17 are leaves
- dep

# Problem 2: Random forest

## 2A: Train a random forest with default settings

In [88]:
simple_forest = sklearn.ensemble.RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_features='sqrt',
    max_depth=3,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=101)

### Fit the forest

**TODO Train on the training set** in the next coding cell

In [89]:
simple_forest.fit(x_tr_NF, y_tr_N)

## 2B & Table 2: Feature Importances

### Table 2
**Sample Output** (Feel free to print all words and organize them in any software)

|**Important Words**|**Unimportant Words**|
|:-:|:-:|
|return |  gear  |
|excel |  a_variety  |
|great |  film_to  |
|worst |  the_premise  |
|poor |  this_might  |
|disappoint |  wanted_a  |
|your_money |  us_and  |
|i_love |  love_of  |
|the_best |  that_all  |
|don't |  out_with  |

In [90]:
importances = simple_forest.feature_importances_
indices = np.argsort(importances)[::-1]
top_10_words = [vocab_list[i] for i in indices[:10]]
top_10_importances = importances[indices[:10]]  
unimp_words = importances < 1e-6
###choosing now from them randomly
random_unimp_words= np.random.choice(np.array(vocab_list)[unimp_words], size=10, replace=False)
print("Top 10 important words in Random Forest:")
for word, imp in zip(top_10_words, top_10_importances):
    print(f"{word}: {imp:.6f}")
print("\n10 Randomly selected unimportant words (importance < 1e-6):")
for word in random_unimp_words:
    print(f"word: {word} , importance : {importances[vocab_list.index(word)]:.9f}") 

Top 10 important words in Random Forest:
return: 0.032990
excel: 0.029485
great: 0.028984
worst: 0.028407
poor: 0.026748
disappoint: 0.024952
your_money: 0.018002
i_love: 0.017955
the_best: 0.017662
don't: 0.017654

10 Randomly selected unimportant words (importance < 1e-6):
word: hatr , importance : 0.000000000
word: <num>_feet , importance : 0.000000000
word: not_know , importance : 0.000000000
word: prevent , importance : 0.000000000
word: over_time , importance : 0.000000000
word: the_script , importance : 0.000000000
word: against_the , importance : 0.000000000
word: get_that , importance : 0.000000000
word: ultim , importance : 0.000000000
word: watch_it , importance : 0.000000000


## 2C: Best Random Forest via grid search



This block might take 2-10 minutes. 

If yours runs significantly longer, try this out on Google Colab instead.

In [91]:
max_features = [3,10,33,100,333]
max_depth = [16,32]
min_samples_leaf = [1]
n_estimators = [100]
random_state = [101]
random_forest_basic = sklearn.ensemble.RandomForestClassifier(
    criterion='gini',
)

In [92]:
rf_search = sklearn.model_selection.GridSearchCV(
    estimator=random_forest_basic,
    param_grid={
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'n_estimators': n_estimators,
        'random_state': random_state
    },
    scoring='balanced_accuracy',
    cv=my_splitter,
    return_train_score=True,
    refit=False
)


### Do the search!

In [93]:
rf_search.fit(X_trainval, y_trainval)
best_rf_params = rf_search.best_params_
print("Best Random Forest parameters found:", best_rf_params)

Best Random Forest parameters found: {'max_depth': 32, 'max_features': 33, 'min_samples_leaf': 1, 'n_estimators': 100, 'random_state': 101}


### Display search results

In [94]:
print("\nBest Random Forest parameters:")
for param, value in best_rf_params.items():
    print(f"{param}: {value}")



Best Random Forest parameters:
max_depth: 32
max_features: 33
min_samples_leaf: 1
n_estimators: 100
random_state: 101


### value of Max_features
The max features for this dataset is 7729 which is the different number of words in the vocabulary.

It is neccessary to have a hypertuning of this as it relates to bias variance tradeoff.

In case of large max_features, the trees will be more correlated as they will be using similar features to split on. This can lead to overfitting and high variance in the model.

In case of small max_features, the trees will be less correlated as they will be using different features to split on. This can lead to underfitting and high bias in the model.

So there will be some optimal value which one can find via tuning like we found for 33 

##### n_estimators use

The main tradeoff when choosing the number of estimators is between computational cost and model performance.

More estimators -> better performance as prediction become stable as it is being given by avging over the values of trees which smoothens out the noise however it also increases the computational cost as more trees need to be trained and evaluated.

Also we cannot generally overfit the model even after using more number of trees the reason is that each tree is trained on a different bootstrap sample of the data and uses a random subset of features at each split. This keeps the bias almost same each tree has same bias and variance decreases as randomness is reduced by avging.

### Build the best random forest using the best hyperparameters found in 2B 

This is necessary so you have the specific best performing forest in your workspace.

Train *only* on training set (do not merge train and valid)


In [None]:
### now building the best random forest 
best_random_forest = random_forest_basic.set_params(**best_rf_params)
best_random_forest.fit(np.concatenate([x_tr_NF, x_val_NF]),
                          np.concatenate([y_tr_N, y_val_N]))  
test_predictions = best_random_forest.predict(x_te_NF)
test_balanced_accuracy = sklearn.metrics.balanced_accuracy_score(y_te_N, test_predictions)
print(f"\nTest Balanced Accuracy of Best Random Forest: {test_balanced_accuracy:.4f}")



Test Balanced Accuracy of Best Random Forest: 0.8370


### Table 3: Comparison of methods on the bag-of-words to sentiment classification task.

### Table 3: Comparison of methods on the bag-of-words to sentiment classification task.

Please report **balanced accuracy** on the train, valid, and test sets, to 3 digits of precision

|**method**|**max depth**|**num trees**|**train BAcc**|**valid BAcc**|**test BAcc**|
|:-|:-:|:-:|:-:|:-:|:-:|
|simple Tree|3|1|0.6458|0.6446|0.6458|
|best Tree|32|1| 0.8772|0.7324| 0.7489|
|simple RandomForest|3|100|0.8191| 0.7972|0.7782|
|best RandomForest|32|100|0.964|0.851|0.837|

We cann see that the best performance is given by best Random forest followed by simple random forest followed by best decision tree and simple decision tree.

The reason for this trend is that random forests are ensemble methods that combine multiple decision trees to make predictions. This ensemble approach helps to reduce overfitting and improve generalization compared to a single decision tree.


In [99]:
### printing for each bhai or kya
criterion = 'gini'
max_depth = 3
min_samples_leaf = 1
min_samples_split = 2
random_state = 101
###lets use sklearn first then may be tthis regres
tree = sklearn.tree.DecisionTreeClassifier(criterion=criterion,
                                            max_depth=max_depth,
                                            min_samples_leaf=min_samples_leaf,
                                            min_samples_split=min_samples_split,
                                            random_state=random_state)
tree.fit(x_tr_NF, y_tr_N)
# pretty_print_sklearn_tree(tree, feature_names=vocab_l
###then check the vals for val test and train balanced accuracy
train_preds = tree.predict(x_tr_NF)
val_preds = tree.predict(x_val_NF)
test_preds = tree.predict(x_te_NF)
train_bal_acc = sklearn.metrics.balanced_accuracy_score(y_tr_N, train_preds)
val_bal_acc = sklearn.metrics.balanced_accuracy_score(y_val_N, val_preds)
test_bal_acc = sklearn.metrics.balanced_accuracy_score(y_te_N, test_preds)
print(f"Basic Decision Tree Balanced Accuracies - Train: {train_bal_acc:.4f}, Val: {val_bal_acc:.4f}, Test: {test_bal_acc:.4f}")


Basic Decision Tree Balanced Accuracies - Train: 0.6458, Val: 0.6446, Test: 0.6458


In [100]:

best_tree = sklearn.tree.DecisionTreeClassifier(criterion='gini',
                                                max_depth = 32,
                                                min_samples_leaf = 3,
random_state = 101)
best_tree.fit(x_tr_NF,y_tr_N)  
train_preds = best_tree.predict(x_tr_NF)
val_preds = best_tree.predict(x_val_NF)
test_preds = best_tree.predict(x_te_NF)
train_bal_acc = sklearn.metrics.balanced_accuracy_score(y_tr_N, train_preds)
val_bal_acc = sklearn.metrics.balanced_accuracy_score(y_val_N, val_preds)
test_bal_acc = sklearn.metrics.balanced_accuracy_score(y_te_N, test_preds)
print(f"Best Decision Tree Balanced Accuracies - Train: {train_bal_acc:.4f}, Val: {val_bal_acc:.4f}, Test: {test_bal_acc:.4f}")

Best Decision Tree Balanced Accuracies - Train: 0.8772, Val: 0.7324, Test: 0.7489


In [None]:

# {'max_depth': 32, 'min_samples_leaf': 3, 'random_state': 101}



##basic forest 
simple_forest = sklearn.ensemble.RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_features='sqrt',
    max_depth=3,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=101)
simple_forest.fit(x_tr_NF, y_tr_N)
train_preds = simple_forest.predict(x_tr_NF)
val_preds = simple_forest.predict(x_val_NF)
test_preds = simple_forest.predict(x_te_NF)
train_bal_acc = sklearn.metrics.balanced_accuracy_score(y_tr_N, train_preds)
val_bal_acc = sklearn.metrics.balanced_accuracy_score(y_val_N, val_preds)
test_bal_acc = sklearn.metrics.balanced_accuracy_score(y_te_N, test_preds)
print(f"Random Forest Balanced Accuracies - Train: {train_bal_acc:.4f}, Val: {val_bal_acc:.4f}, Test: {test_bal_acc:.4f}")




Random Forest Balanced Accuracies - Train: 0.8191, Val: 0.7972, Test: 0.7782


In [103]:

best_forest = sklearn.ensemble.RandomForestClassifier(
    criterion='gini',
    max_depth = 32,
    max_features = 33,
    min_samples_leaf = 1,
    n_estimators = 100,
    random_state = 101
)
best_forest.fit(x_tr_NF,y_tr_N)
train_preds = best_forest.predict(x_tr_NF)
val_preds = best_forest.predict(x_val_NF)
test_preds = best_forest.predict(x_te_NF)
train_bal_acc = sklearn.metrics.balanced_accuracy_score(y_tr_N, train_preds)
val_bal_acc = sklearn.metrics.balanced_accuracy_score(y_val_N, val_preds)
test_bal_acc = sklearn.metrics.balanced_accuracy_score(y_te_N, test_preds)
print(f"Best Random Forest Balanced Accuracies - Train: {train_bal_acc:.4f}, Val: {val_bal_acc:.4f}, Test: {test_bal_acc:.4f}")

Best Random Forest Balanced Accuracies - Train: 0.9644, Val: 0.8512, Test: 0.8369
