In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os

In [None]:
def load_data(data_dir):
    """Read labelled text files from the ``pos`` and ``neg`` subdirectories.

    Every regular file in ``data_dir/pos`` is read as UTF-8 text and given
    label 1, followed by every regular file in ``data_dir/neg`` with label 0.
    Within each subdirectory files are visited in sorted filename order so
    that the returned sample order is the same on every run and platform.

    Args:
        data_dir: Path of a directory containing ``pos`` and ``neg``
            subdirectories.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists: the file
        contents and the matching integer labels (1 for ``pos``, 0 for
        ``neg``), with all ``pos`` entries first.

    Raises:
        OSError: If a subdirectory is missing or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts, labels = [], []

    for label in ['pos', 'neg']:
        label_dir = os.path.join(data_dir, label)
        target = 1 if label == 'pos' else 0
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order (and anything order-sensitive downstream) reproducible.
        for filename in sorted(os.listdir(label_dir)):
            path = os.path.join(label_dir, filename)
            # Skip nested directories and other non-file entries, which
            # cannot be opened as text.
            if not os.path.isfile(path):
                continue
            with open(path, encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(target)

    return texts, labels

In [None]:
# Directories holding the training and test splits (relative paths, resolved
# against the notebook's working directory).
train_dir, test_dir = "datasets/aclImdb/train/", "datasets/aclImdb/test/"

In [None]:
# Load texts and labels for both splits: the training directory first, then
# the test directory.
(X_train, y_train), (X_test, y_test) = (
    load_data(split_dir) for split_dir in (train_dir, test_dir)
)

In [None]:
# Bag-of-words features with English stop words removed. The vocabulary is
# fitted on the training texts only; the test texts are then encoded with that
# same vocabulary (tuple elements evaluate left to right, so the fit runs first).
vectorizer = CountVectorizer(stop_words="english")
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# Train a decision tree on the vectorized training texts with a fixed
# random_state. fit() returns the estimator itself, so the call can be chained.
model = DecisionTreeClassifier(random_state=42).fit(X_train_vectorized, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
model

In [None]:
# Predict a label for every vectorized test text with the fitted tree.
y_pred = model.predict(X_test_vectorized)

In [None]:
# Fraction of test predictions that match the true labels, shown to 4 decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Per-class text report for the test predictions, printed under a heading line.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:", report_text, sep="\n")

In [None]:
# Confusion matrix of true vs. predicted test labels, printed under a heading line.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", matrix, sep="\n")