<a href="https://colab.research.google.com/github/TKing151/TextClassification_Survey/blob/main/Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Text classification on scikit-learn's 20 Newsgroups text dataset, starting with a Multinomial Naive Bayes model.

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
#from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, confusion_matrix

# Five newsgroups from the 20 Newsgroups corpus; headers, footers and quoted
# replies are removed from every post
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'talk.politics.misc', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features capped at 1000 terms; the vectorizer is fitted on the training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit a default Multinomial Naive Bayes model and label the held-out documents
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)

# Overall accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# The weighted F1 is reported under two labels below, so it is computed once
f1 = weighted_f1 = f1_score(y_test, predictions, average='weighted')
macro_f1 = f1_score(y_test, predictions, average='macro')
micro_f1 = f1_score(y_test, predictions, average='micro')

for label, value in (
    ("F1-score:", f1),
    ("Macro F1-score:", macro_f1),
    ("Micro F1-score:", micro_f1),
    ("Weighted F1-score:", weighted_f1),
):
    print(label, value)

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.7805953693495039
F1-score: 0.7787313881379052
Macro F1-score: 0.7699958618698883
Micro F1-score: 0.780595369349504
Weighted F1-score: 0.7787313881379052
Confusion Matrix:
 [[109   5  13  10   9]
 [  5 180   5   7   1]
 [ 12  10 174  16   4]
 [  8  16  10 153   5]
 [ 32   3   8  20  92]]


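Tuned hyper-parameters: grid search over the Multinomial Naive Bayes smoothing parameter `alpha`. Note that this run uses four categories (`talk.politics.misc` is dropped), so its scores are not directly comparable with the five-category run above.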

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, confusion_matrix


# Four newsgroups from the 20 Newsgroups corpus; headers, footers and quoted
# replies are removed from every post
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features capped at 1000 terms; the vectorizer is fitted on the training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 5-fold grid search over the smoothing parameter, ranked by weighted F1
model = MultinomialNB()
param_grid = {'alpha': [0.1, 0.4, 0.9, 2.0]}
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Score the refitted best estimator on the held-out split
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_tfidf)

macro_f1 = f1_score(y_test, predictions, average='macro')
print("Best Model Macro F1-score:", macro_f1)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# The weighted F1 is reported under two labels below, so it is computed once
f1 = weighted_f1 = f1_score(y_test, predictions, average='weighted')
micro_f1 = f1_score(y_test, predictions, average='micro')

for label, value in (
    ("F1-score:", f1),
    ("Macro F1-score:", macro_f1),
    ("Micro F1-score:", micro_f1),
    ("Weighted F1-score:", weighted_f1),
):
    print(label, value)

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:\n", conf_matrix)



Best Hyperparameters: {'alpha': 0.1}
Best Model Macro F1-score: 0.8444349931608249
Accuracy: 0.8457446808510638
F1-score: 0.8453748141646517
Macro F1-score: 0.8444349931608249
Micro F1-score: 0.8457446808510638
Weighted F1-score: 0.8453748141646517
Confusion Matrix:
 [[129   5  14  14]
 [  6 165  12   9]
 [  6   7 179   7]
 [ 11  13  12 163]]


Multinomial Naive Bayes (MNB) with lemmatization

In [None]:
import nltk

# Resources needed by word_tokenize and WordNetLemmatizer in the cells below.
# 'punkt_tab' is included because newer NLTK releases load the tokenizer
# tables from it instead of the pickled 'punkt' models; 'punkt' is kept so
# older releases keep working.
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Four newsgroups, with headers, footers and quoted replies removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)


def lemmatize_text(text):
    """Tokenize `text` with NLTK and return its WordNet-lemmatized tokens joined by single spaces."""
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in word_tokenize(text))


# Lemmatize every document in both splits
X_train_lemmatized = list(map(lemmatize_text, X_train))
X_test_lemmatized = list(map(lemmatize_text, X_test))

# TF-IDF features (at most 1000 terms) fitted on the lemmatized training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_lemmatized)
X_test_tfidf = tfidf_vectorizer.transform(X_test_lemmatized)

# Candidate values for the smoothing parameter and for the fit_prior switch
param_grid = dict(
    alpha=[0.1, 1.0, 10.0],
    fit_prior=[True, False],
)

# 5-fold grid search over a Multinomial Naive Bayes model, ranked by weighted F1
model = MultinomialNB()
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Score the refitted best estimator on the held-out split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)
test_f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print("Test Weighted F1-score:", test_f1)


Best Hyperparameters: {'alpha': 0.1, 'fit_prior': True}
Test Weighted F1-score: 0.8547694327523531


switch models to logistic regression

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the 20 Newsgroups dataset (news articles classified into categories)
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features (vectorizer fitted on the training texts only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize a Logistic Regression classifier for multiclass classification.
# `multi_class` is deliberately not passed: it is deprecated since
# scikit-learn 1.5, and with the lbfgs solver the default already fits a
# multinomial model when there are more than two classes.
model = LogisticRegression(solver='lbfgs')

# Train the model
model.fit(X_train_tfidf, y_train)

# Make predictions
predictions = model.predict(X_test_tfidf)

# Calculate weighted F1-score of the model
weighted_f1 = f1_score(y_test, predictions, average='weighted')
print("Model Weighted F1-score:", weighted_f1)


Model Weighted F1-score: 0.8166253018639785


tune hyper-parameters for logistic regression

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the 20 Newsgroups dataset (news articles classified into categories)
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features (vectorizer fitted on the training texts only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize a Logistic Regression classifier for multiclass classification.
# max_iter is raised above the default because lbfgs otherwise stops at its
# iteration limit on some grid points (ConvergenceWarning), which means those
# candidates are compared before they have finished fitting.
# `multi_class` is not passed: it is deprecated since scikit-learn 1.5, and
# with lbfgs the default already fits a multinomial model for >2 classes.
model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Define the hyperparameter grid to search (inverse regularization strength)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10]
}

# Initialize GridSearchCV: 5-fold cross-validation ranked by weighted F1
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted')

# Perform hyperparameter tuning
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best model (refitted on the full training set) to make predictions
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_tfidf)

# Calculate weighted F1-score of the best model
weighted_f1 = f1_score(y_test, predictions, average='weighted')
print("Best Model Weighted F1-score:", weighted_f1)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters: {'C': 1}
Best Model Weighted F1-score: 0.8166253018639785


switch to KNN

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Four newsgroups, with headers, footers and quoted replies removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (at most 1000 terms) fitted on the training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# k-nearest-neighbours classifier voting over the 5 closest training documents
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train_tfidf, y_train)

# Label the held-out documents and report the weighted F1
predictions = model.predict(X_test_tfidf)
weighted_f1 = f1_score(y_true=y_test, y_pred=predictions, average='weighted')
print("Model Weighted F1-score:", weighted_f1)


Model Weighted F1-score: 0.4197603328155643


KNN with grid search

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Four newsgroups, with headers, footers and quoted replies removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (at most 1000 terms) fitted on the training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 5-fold grid search over the neighbourhood size, ranked by weighted F1
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': [1, 3, 5, 7, 9]}
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)
grid_search.fit(X_train_tfidf, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Score the refitted best estimator on the held-out split
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_tfidf)
weighted_f1 = f1_score(y_true=y_test, y_pred=predictions, average='weighted')
print("Best Model Weighted F1-score:", weighted_f1)


Best Hyperparameters: {'n_neighbors': 1}
Best Model Weighted F1-score: 0.44363394816344753


XGBoost without tuning

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Four newsgroups, with headers, footers and quoted replies removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (at most 1000 terms) fitted on the training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Gradient-boosted trees with the library's default hyperparameters
xgb = XGBClassifier()
xgb.fit(X_train_tfidf, y_train)

# Label the held-out documents and report the weighted F1
predictions = xgb.predict(X_test_tfidf)
weighted_f1 = f1_score(y_true=y_test, y_pred=predictions, average='weighted')
print("Model Weighted F1-score:", weighted_f1)


Model Weighted F1-score: 0.7929220659716383


XGBoost with some tuning and lemmatization

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Four newsgroups, with headers, footers and quoted replies removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)


def lemmatize_text(text):
    """Tokenize `text` with NLTK and return its WordNet-lemmatized tokens joined by single spaces."""
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in word_tokenize(text))


# Lemmatize every document in both splits
X_train_lemmatized = list(map(lemmatize_text, X_train))
X_test_lemmatized = list(map(lemmatize_text, X_test))

# TF-IDF features (at most 1000 terms) fitted on the lemmatized training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_lemmatized)
X_test_tfidf = tfidf_vectorizer.transform(X_test_lemmatized)

# Define hyperparameter ranges for random search
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 2, 3, 4],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Hold out part of the training data for model selection. Picking the
# hyperparameters by their test-set score would make the reported test score
# optimistically biased, so the test set is only used once, at the very end.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Perform random search
n_iter_search = 10 # 50
rng = np.random.default_rng(42)  # seeded so the sampled configurations are repeatable
num_class = len(np.unique(y_train))
best_f1 = -1.0  # below any attainable F1, so the first candidate is always recorded
best_params = None

for _ in range(n_iter_search):
    # Index into each list (instead of np.random.choice) so the sampled values
    # stay plain Python objects
    params = {key: values[rng.integers(len(values))] for key, values in param_dist.items()}
    model = xgb.XGBClassifier(objective='multi:softmax', num_class=num_class, **params)
    model.fit(X_fit, y_fit)
    f1 = f1_score(y_val, model.predict(X_val), average='weighted')

    if f1 > best_f1:
        best_f1 = f1
        best_params = params

print("Best validation F1-score:", best_f1)
print("Best Hyperparameters:", best_params)

# Refit the winning configuration on the full training set and score it once
# on the untouched test set
model = xgb.XGBClassifier(objective='multi:softmax', num_class=num_class, **best_params)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print("Test Weighted F1-score:", test_f1)


Best F1-score: 0.8132935151768679
Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 4, 'gamma': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.8}


Random forest

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the 20 Newsgroups dataset (news articles classified into categories)
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

# Convert text data to TF-IDF features (vectorizer fitted on the training texts only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize a Random Forest classifier with default hyperparameters.
# The forest is randomized, so it is seeded (like the train/test split) to
# make the reported score repeatable from run to run.
rf = RandomForestClassifier(random_state=42)

# Train the model
rf.fit(X_train_tfidf, y_train)

# Make predictions
predictions = rf.predict(X_test_tfidf)

# Calculate weighted F1-score of the model
weighted_f1 = f1_score(y_test, predictions, average='weighted')
print("Model Weighted F1-score:", weighted_f1)


Model Weighted F1-score: 0.7582243111520194


Random forest with some tuning and lemmatization

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Four newsgroups, with headers, footers and quoted replies removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)


def lemmatize_text(text):
    """Tokenize `text` with NLTK and return its WordNet-lemmatized tokens joined by single spaces."""
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in word_tokenize(text))


# Lemmatize every document in both splits
X_train_lemmatized = list(map(lemmatize_text, X_train))
X_test_lemmatized = list(map(lemmatize_text, X_test))

# TF-IDF features (at most 1000 terms) fitted on the lemmatized training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_lemmatized)
X_test_tfidf = tfidf_vectorizer.transform(X_test_lemmatized)

# Define hyperparameter ranges for random search
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is no longer listed: for classifiers it was an alias of 'sqrt',
    # it was deprecated in scikit-learn 1.1 (the source of the repeated
    # warnings) and is rejected from 1.3 onwards.
    'max_features': ['sqrt', 'log2']
}

# Hold out part of the training data for model selection. Picking the
# hyperparameters by their test-set score would make the reported test score
# optimistically biased, so the test set is only used once, at the very end.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Perform random search
n_iter_search = 50
rng = np.random.default_rng(42)  # seeded so the sampled configurations are repeatable
best_f1 = -1.0  # below any attainable F1, so the first candidate is always recorded
best_params = None

for _ in range(n_iter_search):
    # Index into each list (instead of np.random.choice) so the sampled values
    # stay plain Python objects (None, int, str)
    params = {key: values[rng.integers(len(values))] for key, values in param_dist.items()}
    model = RandomForestClassifier(random_state=42, **params)
    model.fit(X_fit, y_fit)
    f1 = f1_score(y_val, model.predict(X_val), average='weighted')

    if f1 > best_f1:
        best_f1 = f1
        best_params = params

print("Best validation F1-score:", best_f1)
print("Best Hyperparameters:", best_params)

# Refit the winning configuration on the full training set and score it once
# on the untouched test set
model = RandomForestClassifier(random_state=42, **best_params)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print("Test Weighted F1-score:", test_f1)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best F1-score: 0.8090265170144113
Best Hyperparameters: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}


PyTorch NN without optimization

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# Four newsgroups, with headers, footers and quoted replies removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (at most 1000 terms) fitted on the training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Encode the labels; the encoder is fitted on the training labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Densify the sparse TF-IDF matrices into float32 tensors (the names are
# rebound from sparse matrices to tensors here) and make the training labels
# an int64 tensor; the test labels stay a NumPy array for scoring
X_train_tfidf, X_test_tfidf = (
    torch.tensor(matrix.toarray(), dtype=torch.float32)
    for matrix in (X_train_tfidf, X_test_tfidf)
)
y_train_encoded = torch.tensor(y_train_encoded, dtype=torch.long)

# Minimal network: a single fully connected layer
class TextClassifier(nn.Module):
    """Linear classifier that maps an input feature vector to one raw score per class."""

    def __init__(self, input_dim, output_dim):
        """Create the single linear layer of size `input_dim` -> `output_dim`."""
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the unnormalized class scores for the batch `x`."""
        scores = self.fc(x)
        return scores

# Seed PyTorch so the random weight initialisation, and therefore the
# reported score, is repeatable from run to run
torch.manual_seed(42)

# Initialize the model: one input per TF-IDF feature, one output per category
input_dim = X_train_tfidf.shape[1]
output_dim = len(categories)
model = TextClassifier(input_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: each epoch is a single full-batch gradient step
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tfidf)
    loss = criterion(outputs, y_train_encoded)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

# Evaluation on the held-out split (no gradients needed)
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tfidf)
    # The predicted class is the index of the largest output score
    predicted_labels = torch.argmax(test_outputs, dim=1).numpy()
    test_f1 = f1_score(y_test_encoded, predicted_labels, average='weighted')
    print("Test Weighted F1-score:", test_f1)


Epoch [1/10], Loss: 1.3873318433761597
Epoch [2/10], Loss: 1.3848925828933716
Epoch [3/10], Loss: 1.3824602365493774
Epoch [4/10], Loss: 1.380035161972046
Epoch [5/10], Loss: 1.3776167631149292
Epoch [6/10], Loss: 1.375205636024475
Epoch [7/10], Loss: 1.372801423072815
Epoch [8/10], Loss: 1.3704041242599487
Epoch [9/10], Loss: 1.3680142164230347
Epoch [10/10], Loss: 1.3656309843063354
Test Weighted F1-score: 0.3906516277983329


optimized NN

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.3-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.4/225.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.3 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import optuna

# Four newsgroups, with headers, footers and quoted replies removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (at most 1000 terms) fitted on the training texts only
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Encode the labels; the encoder is fitted on the training labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Densify the sparse TF-IDF matrices into float32 tensors (the names are
# rebound from sparse matrices to tensors here) and make the training labels
# an int64 tensor; the test labels stay a NumPy array for scoring
X_train_tfidf, X_test_tfidf = (
    torch.tensor(matrix.toarray(), dtype=torch.float32)
    for matrix in (X_train_tfidf, X_test_tfidf)
)
y_train_encoded = torch.tensor(y_train_encoded, dtype=torch.long)

# One-hidden-layer network with dropout
class TextClassifier(nn.Module):
    """Feed-forward classifier: linear -> ReLU -> dropout -> linear, returning raw class scores."""

    def __init__(self, input_dim, output_dim, n_hidden, dropout_rate):
        """Build the two linear layers (`input_dim` -> `n_hidden` -> `output_dim`) and the dropout layer."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=n_hidden)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(in_features=n_hidden, out_features=output_dim)

    def forward(self, x):
        """Return the unnormalized class scores for the batch `x`."""
        hidden = self.dropout(self.fc1(x).relu())
        return self.fc2(hidden)

# Layer sizes are derived from this cell's own data instead of relying on
# variables left behind by an earlier cell
input_dim = X_train_tfidf.shape[1]
output_dim = len(categories)

# Hold out 20% of the training rows for model selection. Scoring trials on
# the test set would let the search fit the test data and bias the final
# test score, so the test set is not touched during tuning.
split_generator = torch.Generator().manual_seed(42)
shuffled_rows = torch.randperm(X_train_tfidf.shape[0], generator=split_generator)
n_val = int(0.2 * X_train_tfidf.shape[0])
val_rows, fit_rows = shuffled_rows[:n_val], shuffled_rows[n_val:]
X_fit, y_fit = X_train_tfidf[fit_rows], y_train_encoded[fit_rows]
X_val, y_val = X_train_tfidf[val_rows], y_train_encoded[val_rows].numpy()

# Define objective function for Optuna
def objective(trial):
    """Train a TextClassifier with the trial's hyperparameters and return 1 - weighted F1 on the validation rows.

    Args:
        trial: Optuna trial used to sample `learning_rate`, `n_hidden` and
            `dropout_rate`.

    Returns:
        1.0 minus the weighted F1-score on the validation split, so that a
        lower value is better.
    """
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    n_hidden = trial.suggest_int('n_hidden', 16, 256, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)

    # Fixed seed so differences between trials come from the hyperparameters
    # rather than from the random initialisation and dropout masks
    torch.manual_seed(42)
    model = TextClassifier(input_dim, output_dim, n_hidden, dropout_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    # Each epoch is a single full-batch gradient step
    num_epochs = 10
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_fit)
        loss = criterion(outputs, y_fit)
        loss.backward()
        optimizer.step()

    # eval() switches dropout off for scoring
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        predicted_labels = torch.argmax(val_outputs, dim=1).numpy()
        val_f1 = f1_score(y_val, predicted_labels, average='weighted')

    return 1.0 - val_f1  # Optuna minimizes the objective function

# Perform hyperparameter tuning using Optuna (seeded sampler for repeatable trials)
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)


[I 2023-08-28 02:23:34,581] A new study created in memory with name: no-name-dd349e58-69d2-45cf-b6b6-b2ee8f4d2677
[I 2023-08-28 02:23:35,704] Trial 0 finished with value: 0.9183819722979046 and parameters: {'learning_rate': 7.278203006709827e-05, 'n_hidden': 163, 'dropout_rate': 0.43901518036512166}. Best is trial 0 with value: 0.9183819722979046.
[I 2023-08-28 02:23:36,914] Trial 1 finished with value: 0.5769786292465385 and parameters: {'learning_rate': 0.00014181360841297754, 'n_hidden': 208, 'dropout_rate': 0.4126617806087691}. Best is trial 1 with value: 0.5769786292465385.
[I 2023-08-28 02:23:38,150] Trial 2 finished with value: 0.2284185182780153 and parameters: {'learning_rate': 0.004820909694630673, 'n_hidden': 118, 'dropout_rate': 0.4885984977960584}. Best is trial 2 with value: 0.2284185182780153.
[I 2023-08-28 02:23:38,957] Trial 3 finished with value: 0.8467111667361981 and parameters: {'learning_rate': 0.00020049616954438115, 'n_hidden': 113, 'dropout_rate': 0.40747879985

Best Hyperparameters: {'learning_rate': 0.0907641447977864, 'n_hidden': 62, 'dropout_rate': 0.2902043804815964}


In [None]:
# Use the best hyperparameters to train and evaluate the model
best_learning_rate = study.best_params['learning_rate']
best_n_hidden = study.best_params['n_hidden']
best_dropout_rate = study.best_params['dropout_rate']

# Derive the layer sizes from the current data instead of relying on
# variables left behind by an earlier cell
input_dim = X_train_tfidf.shape[1]
output_dim = len(categories)

# Seed PyTorch so weight initialisation and dropout masks are repeatable
torch.manual_seed(42)

# Initialize the model with the best hyperparameters
best_model = TextClassifier(input_dim, output_dim, best_n_hidden, best_dropout_rate)
optimizer = optim.Adam(best_model.parameters(), lr=best_learning_rate)
criterion = nn.CrossEntropyLoss()

# Train the model on the full training set: each epoch is a single
# full-batch gradient step
num_epochs = 10
best_model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = best_model(X_train_tfidf)
    loss = criterion(outputs, y_train_encoded)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

# Evaluation on the held-out split; eval() switches dropout off
best_model.eval()
with torch.no_grad():
    test_outputs = best_model(X_test_tfidf)
    # The predicted class is the index of the largest output score
    predicted_labels = torch.argmax(test_outputs, dim=1).numpy()
    test_f1 = f1_score(y_test_encoded, predicted_labels, average='weighted')
    print("Test Weighted F1-score:", test_f1)


Epoch [1/10], Loss: 1.385244607925415
Epoch [2/10], Loss: 1.3104469776153564
Epoch [3/10], Loss: 1.0459202527999878
Epoch [4/10], Loss: 0.7829515933990479
Epoch [5/10], Loss: 0.5490607023239136
Epoch [6/10], Loss: 0.4481745660305023
Epoch [7/10], Loss: 0.37011855840682983
Epoch [8/10], Loss: 0.31552624702453613
Epoch [9/10], Loss: 0.27936795353889465
Epoch [10/10], Loss: 0.23024460673332214
Test Weighted F1-score: 0.8410137546827419


lemmatized NN

In [None]:
import optuna
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch four 20 Newsgroups categories with headers, footers and quoted text removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Hold out 20% of the documents for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# Lemmatization function using NLTK
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its WordNet lemma.

    The input is split with NLTK's ``word_tokenize``, every token goes through
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    resulting tokens are joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in word_tokenize(text))

# Lemmatize every document in both splits
X_train_lemmatized = list(map(lemmatize_text, X_train))
X_test_lemmatized = list(map(lemmatize_text, X_test))

# Fit the TF-IDF vocabulary (capped at 1000 features) on the training split only,
# then project the test split onto that same vocabulary
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_lemmatized)
X_test_tfidf = tfidf_vectorizer.transform(X_test_lemmatized)

# Define the PyTorch neural network architecture
class NeuralNetwork(nn.Module):
    """Single-hidden-layer feed-forward classifier.

    Pipeline: Linear(input_size -> n_hidden) -> ReLU -> Dropout(dropout_rate)
    -> Linear(n_hidden -> output_size). The forward pass returns the raw
    output of the last linear layer (no softmax is applied here).
    """

    def __init__(self, input_size, n_hidden, dropout_rate, output_size):
        """Create the dropout layer and the two linear layers."""
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc1 = nn.Linear(in_features=input_size, out_features=n_hidden)
        self.fc2 = nn.Linear(in_features=n_hidden, out_features=output_size)

    def forward(self, x):
        """Map a batch of feature rows ``x`` to one score per output class."""
        hidden = self.dropout(torch.relu(self.fc1(x)))
        return self.fc2(hidden)

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: train a ``NeuralNetwork`` for one trial and score it.

    Samples ``n_hidden`` (int in [32, 128]) and ``dropout_rate`` (float in
    [0.2, 0.5]) from ``trial``, trains on the module-level ``X_train_tfidf`` /
    ``y_train`` for 10 epochs with Adam (lr=0.001) in mini-batches of 64, then
    predicts labels for ``X_test_tfidf``.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The negated weighted F1-score of the predictions against ``y_test``.
        The value is negated because the study is created with
        ``direction="minimize"``.

    NOTE(review): the score is computed on the test split, so hyperparameters
    are selected against the same data that is later used for the final
    report. Consider carving a validation split out of the training data.
    """
    input_size = X_train_tfidf.shape[1]
    output_size = len(np.unique(y_train))
    n_hidden = trial.suggest_int("n_hidden", 32, 128)
    dropout_rate = trial.suggest_float("dropout_rate", 0.2, 0.5)

    model = NeuralNetwork(input_size, n_hidden, dropout_rate, output_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    epochs = 10
    batch_size = 64
    n_train = X_train_tfidf.shape[0]
    n_test = X_test_tfidf.shape[0]

    for _ in range(epochs):
        model.train()
        for i in range(0, n_train, batch_size):
            # Densify one sparse TF-IDF batch at a time
            inputs = torch.FloatTensor(X_train_tfidf[i:i+batch_size].toarray())
            labels = torch.LongTensor(y_train[i:i+batch_size])
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        y_pred = []
        # Iterate over the TEST rows. The loop previously ran to the size of
        # the training set, slicing far past the end of X_test_tfidf and
        # pushing a long tail of empty batches through the model.
        for i in range(0, n_test, batch_size):
            inputs = torch.FloatTensor(X_test_tfidf[i:i+batch_size].toarray())
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            y_pred.extend(predicted.tolist())

    return -f1_score(y_test, y_pred, average='weighted')

# Run 50 Optuna trials. The objective returns the negated weighted F1-score,
# so minimizing it searches for the highest F1.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Report the best trial: its objective value, then each tuned hyperparameter
print("Best Trial:")
trial = study.best_trial
print("Value: ", trial.value)
print("Params: ")
best_trial_params = trial.params
for key in best_trial_params:
    value = best_trial_params[key]
    print(f"    {key}: {value}")


[I 2023-08-28 02:54:42,981] A new study created in memory with name: no-name-260e2146-49bb-40dc-9aff-83107e803283
[I 2023-08-28 02:54:44,245] Trial 0 finished with value: -0.85511950986777 and parameters: {'n_hidden': 93, 'dropout_rate': 0.4381659551759679}. Best is trial 0 with value: -0.85511950986777.
[I 2023-08-28 02:54:45,438] Trial 1 finished with value: -0.8538026605869927 and parameters: {'n_hidden': 73, 'dropout_rate': 0.3221388486931024}. Best is trial 0 with value: -0.85511950986777.
[I 2023-08-28 02:54:46,681] Trial 2 finished with value: -0.857813334789979 and parameters: {'n_hidden': 93, 'dropout_rate': 0.23103936619901347}. Best is trial 2 with value: -0.857813334789979.
[I 2023-08-28 02:54:47,872] Trial 3 finished with value: -0.8565371801297004 and parameters: {'n_hidden': 73, 'dropout_rate': 0.4192982865046845}. Best is trial 2 with value: -0.857813334789979.
[I 2023-08-28 02:54:49,683] Trial 4 finished with value: -0.8550947966629746 and parameters: {'n_hidden': 127,

Best Trial:
Value:  -0.8631641812680082
Params: 
    n_hidden: 37
    dropout_rate: 0.24070429449612527


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch four 20 Newsgroups categories with headers, footers and quoted text removed
categories = ['alt.atheism', 'comp.graphics', 'rec.sport.baseball', 'sci.med']
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Hold out 20% of the documents for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=42,
)

# Lemmatization function using NLTK
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its WordNet lemma.

    The input is split with NLTK's ``word_tokenize``, every token goes through
    ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    resulting tokens are joined back together with single spaces.
    """
    wnl = WordNetLemmatizer()
    return " ".join(wnl.lemmatize(token) for token in word_tokenize(text))

# Lemmatize every document in both splits
X_train_lemmatized = list(map(lemmatize_text, X_train))
X_test_lemmatized = list(map(lemmatize_text, X_test))

# Fit the TF-IDF vocabulary (capped at 1000 features) on the training split only,
# then project the test split onto that same vocabulary
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_lemmatized)
X_test_tfidf = tfidf_vectorizer.transform(X_test_lemmatized)

# Define the PyTorch neural network architecture with optimized hyperparameters
class NeuralNetwork(nn.Module):
    """Single-hidden-layer feed-forward classifier.

    Pipeline: Linear(input_size -> n_hidden) -> ReLU -> Dropout(dropout_rate)
    -> Linear(n_hidden -> output_size). The forward pass returns the raw
    output of the last linear layer (no softmax is applied here).
    """

    def __init__(self, input_size, n_hidden, dropout_rate, output_size):
        """Create the dropout layer and the two linear layers."""
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc1 = nn.Linear(in_features=input_size, out_features=n_hidden)
        self.fc2 = nn.Linear(in_features=n_hidden, out_features=output_size)

    def forward(self, x):
        """Map a batch of feature rows ``x`` to one score per output class."""
        hidden = self.dropout(torch.relu(self.fc1(x)))
        return self.fc2(hidden)

# Hyperparameters copied by hand from the Optuna search output
best_n_hidden = 37  # Replace with the best value obtained from Optuna
best_dropout_rate = 0.24070429449612527  # Replace with the best value obtained from Optuna

# Layer sizes follow the data: one input per TF-IDF feature, one output per class
input_size = X_train_tfidf.shape[1]
output_size = len(np.unique(y_train))

model = NeuralNetwork(input_size, best_n_hidden, best_dropout_rate, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Mini-batch training: the sparse TF-IDF rows are densified one batch at a time
epochs = 10
batch_size = 64
for epoch in range(epochs):
    model.train()
    for start in range(0, X_train_tfidf.shape[0], batch_size):
        stop = start + batch_size
        inputs = torch.FloatTensor(X_train_tfidf[start:stop].toarray())
        labels = torch.LongTensor(y_train[start:stop])
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

# Evaluation: predict the test split batch by batch, taking the highest-scoring class
model.eval()
y_pred = []
with torch.no_grad():
    for start in range(0, X_test_tfidf.shape[0], batch_size):
        inputs = torch.FloatTensor(X_test_tfidf[start:start + batch_size].toarray())
        outputs = model(inputs)
        predicted = torch.max(outputs, dim=1).indices
        y_pred.extend(predicted.tolist())

test_f1 = f1_score(y_test, y_pred, average='weighted')
print("Test Weighted F1-score:", test_f1)


Test Weighted F1-score: 0.855017112861199
