In [1]:
import nltk
from nltk.corpus import reuters
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC



In [2]:
# Fetch the NLTK data packages 'reuters', 'stopwords' and 'punkt'.
for resource in ('reuters', 'stopwords'):
    nltk.download(resource)
# Left as the cell's final expression so its return value is still displayed.
nltk.download('punkt')

[nltk_data] Downloading package reuters to
[nltk_data]     /Users/tetianabas/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tetianabas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tetianabas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Baseline: TF-IDF features + Multinomial Naive Bayes on the NLTK Reuters corpus.
documents = reuters.fileids()
all_categories = reuters.categories()

# reuters.categories(doc_id) returns a list; only its first entry is used as the
# label, which turns the task into single-label classification.
X = [reuters.raw(doc_id) for doc_id in documents]
y = [reuters.categories(doc_id)[0] for doc_id in documents]

# Hold out 20% of the documents for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
stop_words = 'english'
vectorizer = TfidfVectorizer(stop_words=stop_words)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

y_pred = classifier.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, y_pred)
# zero_division=0: classes that are never predicted are reported with a score
# of 0 without raising an UndefinedMetricWarning for each of them.
classification_rep = classification_report(y_test, y_pred, zero_division=0)
confusion_mat = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", confusion_mat)



Accuracy: 0.6575532900834106

Classification Report:
                  precision    recall  f1-score   support

            acq       0.59      0.96      0.73       469
           alum       0.00      0.00      0.00         7
         barley       0.00      0.00      0.00         6
            bop       0.00      0.00      0.00        20
        carcass       0.00      0.00      0.00        15
     castor-oil       0.00      0.00      0.00         2
          cocoa       0.00      0.00      0.00        17
         coffee       1.00      0.12      0.21        25
         copper       0.00      0.00      0.00         9
           corn       1.00      0.06      0.12        48
         cotton       0.00      0.00      0.00         8
            cpi       0.00      0.00      0.00        19
            cpu       0.00      0.00      0.00         1
          crude       0.86      0.70      0.77        96
            dlr       1.00      0.15      0.26        33
           earn       0.68      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Compare three popular classifiers on TF-IDF features and keep the best one.
# Each candidate is tuned with a 5-fold cross-validated grid search on the
# training split and then scored once on the held-out test split.

# Vectorizer settings that are searched for every classifier.
vectorizer_grid = {
    'vectorizer__max_features': [1000, 5000, 10000],
    'vectorizer__min_df': [1, 2, 5],
}


def build_text_pipeline(classifier):
    """Return a TF-IDF -> `classifier` pipeline that drops English stop words."""
    return Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words='english')),
        ('classifier', classifier),
    ])


def tune_and_evaluate(name, pipeline, param_grid):
    """Grid-search `pipeline`, score the best estimator on the test split and print a report.

    Reads X_train, y_train, X_test and y_test from the notebook namespace.

    Parameters
    ----------
    name : str
        Label used in the printed report header.
    pipeline : sklearn.pipeline.Pipeline
        Pipeline with 'vectorizer' and 'classifier' steps.
    param_grid : dict
        Parameter grid passed to GridSearchCV.

    Returns
    -------
    tuple
        (fitted GridSearchCV, test predictions, test accuracy,
        classification report text, confusion matrix).
    """
    # n_jobs=-1 runs the independent fits in parallel; the search result is
    # the same as with a serial run.
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is the pipeline refit on the whole training split with
    # the best parameter combination.
    y_pred = grid_search.best_estimator_.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: classes that are never predicted are reported with a
    # score of 0 without raising an UndefinedMetricWarning for each of them.
    classification_rep = classification_report(y_test, y_pred, zero_division=0)
    confusion_mat = confusion_matrix(y_test, y_pred)

    print(f"{name} Results:")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", confusion_mat)
    print("\n")

    return grid_search, y_pred, accuracy, classification_rep, confusion_mat


# ---------------- RandomForestClassifier ----------------
pipeline_rf = build_text_pipeline(RandomForestClassifier(random_state=42))
param_grid_rf = {
    **vectorizer_grid,
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
}
(grid_search_rf, y_pred_rf, accuracy_rf,
 classification_rep_rf, confusion_mat_rf) = tune_and_evaluate(
    "RandomForestClassifier", pipeline_rf, param_grid_rf
)

# ---------------- SVC ----------------
pipeline_svm = build_text_pipeline(SVC(random_state=42))
param_grid_svm = {
    **vectorizer_grid,
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
}
(grid_search_svm, y_pred_svm, accuracy_svm,
 classification_rep_svm, confusion_mat_svm) = tune_and_evaluate(
    "Support Vector Machine (SVM)", pipeline_svm, param_grid_svm
)

# ---------------- Gradient Boosting ----------------
pipeline_gb = build_text_pipeline(GradientBoostingClassifier(random_state=42))
param_grid_gb = {
    **vectorizer_grid,
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
}
(grid_search_gb, y_pred_gb, accuracy_gb,
 classification_rep_gb, confusion_mat_gb) = tune_and_evaluate(
    "Gradient Boosting", pipeline_gb, param_grid_gb
)

# Compare and select the best-performing model.
# The winner is chosen by mean cross-validated accuracy (best_score_), which is
# computed on the training split only. Picking the winner by test accuracy
# would make the reported test accuracy optimistically biased.
candidates = {
    "RandomForestClassifier": (grid_search_rf, accuracy_rf),
    "Support Vector Machine (SVM)": (grid_search_svm, accuracy_svm),
    "Gradient Boosting": (grid_search_gb, accuracy_gb),
}
# max() returns the first maximal key, so ties resolve in the order listed above.
best_name = max(candidates, key=lambda model_name: candidates[model_name][0].best_score_)
best_search, best_accuracy = candidates[best_name]
best_cv_score = best_search.best_score_
best_model = best_search.best_estimator_

print(f"{best_name} is the best-performing model.")
print(f"Cross-validated accuracy: {best_cv_score}; held-out test accuracy: {best_accuracy}")
