In [17]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources needed by preprocess_texts: the Punkt tokenizer
# models (used by word_tokenize) and the stop-word lists.
# 'punkt_tab' is requested as well because recent NLTK releases load the
# tokenizer from that package instead of 'punkt' -- TODO confirm against the
# installed NLTK version.
# quiet=True keeps the per-package progress log (which includes the absolute
# local nltk_data path) out of the saved notebook output.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

# Load documents and labels
def load_documents_and_labels(positive_dir='possitive', negative_dir='negative'):
    """Read every file in the two class directories and label it.

    Files from ``positive_dir`` are labelled 1 and files from
    ``negative_dir`` are labelled 0. Positive documents come first,
    followed by negative documents; within each directory the files are
    read in sorted name order.

    Parameters
    ----------
    positive_dir : str
        Directory holding the positive examples. The default keeps the
        on-disk spelling ``'possitive'`` that the original code read from.
    negative_dir : str
        Directory holding the negative examples.

    Returns
    -------
    tuple[list[str], list[int]]
        ``(texts, labels)`` of equal length, where ``labels[i]`` is the
        class of ``texts[i]``.

    Raises
    ------
    OSError
        If a directory cannot be listed or a file cannot be opened.
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    texts, labels = [], []
    for directory, label in ((positive_dir, 1), (negative_dir, 0)):
        # os.listdir returns entries in arbitrary order; sorting makes the
        # document order (and therefore any seeded split of it) reproducible.
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
            # would otherwise make open() fail.
            if not os.path.isfile(path):
                continue
            with open(path, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

# Tokenization and removing stop words
def preprocess_texts(texts):
    """Strip English stop words from each document.

    Every document is tokenized with ``word_tokenize``; tokens whose
    lower-cased form appears in NLTK's English stop-word list are dropped,
    and the surviving tokens (original casing kept) are re-joined with
    single spaces.

    Parameters
    ----------
    texts : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order.
    """
    english_stops = set(stopwords.words('english'))

    def _strip_stop_words(document):
        # Keep a token only when its lower-cased form is not a stop word.
        kept = []
        for token in word_tokenize(document):
            if token.lower() in english_stops:
                continue
            kept.append(token)
        return ' '.join(kept)

    return [_strip_stop_words(document) for document in texts]

# Tunable constants, kept in one place so the split and the stochastic
# models share the same seed.
SEED = 42
TEST_FRACTION = 0.1  # fraction of documents held out for evaluation

texts, labels = load_documents_and_labels()
processed_texts = preprocess_texts(texts)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    processed_texts, labels, test_size=TEST_FRACTION, random_state=SEED
)

# Initialize the vectorizer (bag-of-words token counts)
vectorizer = CountVectorizer()

# Classifiers to evaluate; order must match classifier_names below.
classifiers = [
    LogisticRegression(max_iter=1000),
    MultinomialNB(),
    SVC(),
    # Seeded so the Random Forest score is the same on every re-run.
    RandomForestClassifier(random_state=SEED),
]

classifier_names = ['Logistic Regression', 'Naive Bayes', 'SVM', 'Random Forest']

# Fit every classifier inside its own pipeline
# (token counts -> variance scaling -> model) on the training split, then
# report accuracy plus class-weighted precision and recall on the test split.
for name, classifier in zip(classifier_names, classifiers):
    pipeline = make_pipeline(
        vectorizer,
        # with_mean=False: scale by variance only, without centering the
        # sparse count matrix produced by the vectorizer.
        StandardScaler(with_mean=False),
        classifier,
    )
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print(
        f"\n{name}:",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        sep="\n",
    )


[nltk_data] Downloading package punkt to C:\Users\Oron's
[nltk_data]     computer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Oron's
[nltk_data]     computer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Logistic Regression:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000

Naive Bayes:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000

SVM:
Accuracy: 0.5455
Precision: 0.7980
Recall: 0.5455

Random Forest:
Accuracy: 0.9091
Precision: 0.9273
Recall: 0.9091


In [None]:
# Logistic Regression:

# Logistic Regression achieved perfect accuracy, precision, and recall (1.0000), but this result should be interpreted cautiously because it was measured on a single held-out split containing only 10% of the data. While its simplicity and high performance are apparent, it is important to scrutinize how well it generalizes to unseen data and whether it can handle more complex patterns.
# Naive Bayes:

# Similar to Logistic Regression, Naive Bayes also achieved perfect accuracy, precision, and recall of 1.0000, indicating consistent and reliable performance across all metrics. This makes Naive Bayes an attractive choice for tasks where robustness and simplicity are valued, and where the assumptions of independence among features hold reasonably well.
# SVM:

# SVM falls behind Logistic Regression and Naive Bayes in accuracy and recall, both 0.5455, which is only slightly better than chance for a two-class problem. Its weighted precision of 0.7980 is noticeably higher than its recall, which suggests that it is relatively better at minimizing false positives at the expense of false negatives; however, because these scores are weighted averages over both classes, per-class figures would be needed to confirm this. SVM might therefore be an option when false positives need to be minimized, provided its recall can be brought to a reasonable level.
# Random Forest:

# Random Forest achieved an accuracy of 0.9091 and precision of 0.9273, with recall matching its accuracy. Although not perfect, its performance is commendable, especially considering its robustness and ability to handle complex relationships in the data. Random Forest could be a preferable choice when high accuracy and reliability in classifying positive cases are prioritized without the strict requirement of perfect performance on all metrics.
# In summary, the choice of the best classifier depends on the specific needs of the task. Logistic Regression and Naive Bayes offer simplicity and perfect performance but may require further evaluation for generalization. SVM, with its focus on minimizing false positives, could be valuable in scenarios where this is crucial. Random Forest, with its robustness and strong performance, may be a suitable option for tasks requiring high accuracy and reliability.