Experiment - 4 : Text classification Notebook

In [12]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


4.1.0 Data Loading

In [13]:
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, remove=('headers', 'footers', 'quotes'))
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

Tokenisation, stemming and text preprocessing

In [14]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download NLTK stop words if not already downloaded
# nltk.download('stopwords')

# Initialize the Porter Stemmer for stemming
stemmer = PorterStemmer()

# Tokenization, stemming, and stop words removal function
def preprocess_text(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stopwords.words('english')]
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    processed_text = " ".join(stemmed_words)
    return processed_text

# Preprocess the entire dataset
X_train_preprocessed = [preprocess_text(text) for text in X_train]
X_test_preprocessed = [preprocess_text(text) for text in X_test]

# Print the preprocessed texts
print("Sample Preprocessed Text from X_train:")
print(X_train_preprocessed[0])



Sample Preprocessed Text from X_train:
# # 've gotten post group last coupl day . ( # recent ad feed list . ) , group # near death ? # seen mail list side , 'm get right amount traffic . patrick l. mahan -- - tgv window washer -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- - mahan @ tgv.com -- -- -- -- - wake person unnecessarili consid - lazaru long capit crime . first offens , notebook lazaru long patrick l. mahan -- - tgv window washer -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- - mahan @ tgv.com -- -- -- -- -


4.1.1 Preprocessing using tfidf vectorisation

In [15]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed)
X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed)


4.1.2 Naive-Bayes classifier

In [16]:
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
y_pred_nb = nb_classifier.predict(X_test_tfidf)
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')


4.1.3 Rocchio classifier

In [17]:
def rocchio_classifier(X_train, y_train, X_test):
    # Calculate the centroids for each class
    classes = np.unique(y_train)
    centroids = {}
    for c in classes:
        class_indices = np.where(y_train == c)[0]
        centroids[c] = np.mean(X_train[class_indices], axis=0)

    # Classify test data using Rocchio
    y_pred = []
    for sample in X_test:
        best_class = None
        best_distance = float('inf')
        for c in classes:
            distance = np.linalg.norm(sample - centroids[c])
            if distance < best_distance:
                best_distance = distance
                best_class = c
        y_pred.append(best_class)
    return np.array(y_pred)

# Train and predict using the Rocchio classifier
y_pred_rocchio = rocchio_classifier(X_train_tfidf, y_train, X_test_tfidf)

# Calculate F-score for the Rocchio classifier
f1_rocchio = f1_score(y_test, y_pred_rocchio, average='weighted')

4.1.4 KNN classifier

In [18]:
from sklearn.neighbors import KNeighborsClassifier

k = 5  # Adjust the value of k as needed
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train_tfidf, y_train)
y_pred_knn = knn_classifier.predict(X_test_tfidf)
f1_knn = f1_score(y_test, y_pred_knn, average='weighted')


4.2 F-scores

In [19]:
print(f"F-score Naive Bayes: {f1_nb}")
print(f"F-score Rocchio: {f1_rocchio}")
print(f"F-score k-Nearest Neighbor (k={k}): {f1_knn}")



F-score Naive Bayes: 0.6973481022239094
F-score Rocchio: 0.6664976418174576
F-score k-Nearest Neighbor (k=5): 0.08855796706083052
