### Name : Sai Kumar Gandham
### Student ID: IG45378

Homework:

1) Using Gensim, train a doc2vec model on the Brown Corpus. Try to classify documents from each category.

In [13]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import brown
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import defaultdict

# Function to read corpus from Brown Corpus
def read_corpus(data, tokens_only=False):
    for i, line in enumerate(data):
        if tokens_only:
            yield line
        else:
            # Join tokenized sentences into a single string
            doc = ' '.join(line)
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(doc), [i])

In [14]:
# lets load Brown Corpus:
brown_corpus = brown.sents(categories=brown.categories())

# Prepare train and test corpus
train_corpus = list(read_corpus(brown_corpus))
test_corpus = list(read_corpus(brown_corpus, tokens_only=True))


In [15]:
#Train the Doc2Vec model
max_epochs = 20
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=1)
model.build_vocab(train_corpus)

for epoch in range(max_epochs):
    print('Iteration {0}'.format(epoch))
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    # Decrease the learning rate
    model.alpha -= 0.0002
    # Fix the learning rate, no decay
    model.min_alpha = model.alpha

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19


In [17]:
# Classify documents from each category
category_docs = defaultdict(list)

for cat in brown.categories():
    for doc in brown.sents(categories=cat):
        inferred_vector = model.infer_vector(doc)
        category_docs[cat].append(inferred_vector)

# Displaying the number of documents per category
for category, vectors in category_docs.items():
    print(f"Category: {category}, Number of Documents: {len(vectors)}")


Category: adventure, Number of Documents: 4637
Category: belles_lettres, Number of Documents: 7209
Category: editorial, Number of Documents: 2997
Category: fiction, Number of Documents: 4249
Category: government, Number of Documents: 3032
Category: hobbies, Number of Documents: 4193
Category: humor, Number of Documents: 1053
Category: learned, Number of Documents: 7734
Category: lore, Number of Documents: 4881
Category: mystery, Number of Documents: 3886
Category: news, Number of Documents: 4623
Category: religion, Number of Documents: 1716
Category: reviews, Number of Documents: 1751
Category: romance, Number of Documents: 4431
Category: science_fiction, Number of Documents: 948



2) Use the stop word removal code from earlier on the 20 user groups:

How does that effect the word mover distance of documents? Pick 6 documents to compare (make sure to use the same splits so they are the same documents).

How does it effect the logistic regression classifier?

In [27]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

# Load 20 newsgroups dataset
twenty_users = fetch_20newsgroups()

In [28]:
# Lets define stop words
stop_words = set(gensim.parsing.preprocessing.STOPWORDS)

# Function to remove stop words and tokenize
def preprocess(text, remove_stopwords=True):
    if remove_stopwords:
        return ' '.join([word for word in gensim.utils.simple_preprocess(text) if word not in stop_words])
    else:
        return ' '.join(gensim.utils.simple_preprocess(text))

# Preprocess documents with stop words removed
preprocessed_docs_stop = [preprocess(doc) for doc in twenty_users.data]
preprocessed_docs_nostop = [preprocess(doc, remove_stopwords=False) for doc in twenty_users.data]

# Vectorize preprocessed documents (with stop words removed)
vectorizer_stop = TfidfVectorizer()
X_stop = vectorizer_stop.fit_transform(preprocessed_docs_stop)

# Vectorize preprocessed documents (without stop words removed)
vectorizer_nostop = TfidfVectorizer()
X_nostop = vectorizer_nostop.fit_transform(preprocessed_docs_nostop)


In [29]:
# Train logistic regression classifier with stop words removed
trainX_stop, testX_stop, trainY_stop, testY_stop = train_test_split(X_stop, twenty_users.target)
lr_stopwords = LogisticRegression(max_iter=500)
lr_stopwords.fit(trainX_stop, trainY_stop)
accuracy_stopwords = accuracy_score(testY_stop, lr_stopwords.predict(testX_stop))

# Train logistic regression classifier without stop words removed
trainX_nostop, testX_nostop, trainY_nostop, testY_nostop = train_test_split(X_nostop, twenty_users.target)
lr_nostopwords = LogisticRegression(max_iter=500)
lr_nostopwords.fit(trainX_nostop, trainY_nostop)
accuracy_nostopwords = accuracy_score(testY_nostop, lr_nostopwords.predict(testX_nostop))


In [30]:
# Pick 6 documents to compare
docs_indices = [10, 20, 30, 40, 50, 60]
docs_to_compare = [twenty_users.data[i] for i in docs_indices]
preprocessed_docs_to_compare = [preprocess(doc) for doc in docs_to_compare]

In [31]:
# Calculate Word Movers Distance  without stop words
wmd_without_stopwords = []
for doc in preprocessed_docs_to_compare:
    wmd = model.wv.wmdistance(doc, preprocessed_docs_to_compare[0])  # Compare with the first document
    wmd_without_stopwords.append(wmd)

# Calculate Word Movers Distance with stop words removed
wmd_with_stopwords = []
for doc in docs_to_compare:
    wmd = model.wv.wmdistance(preprocess(doc), preprocessed_docs_to_compare[0])
    wmd_with_stopwords.append(wmd)

print("Effects of stop words on Word Mover's Distance (WMD):")
print("Mean WMD without stop words:", np.mean(wmd_without_stopwords))
print("Mean WMD with stop words removed:", np.mean(wmd_with_stopwords))
print()

print("Effect of stop words on logistic regression classifier accuracy:")
print("Accuracy with stop words removed:", accuracy_stopwords)
print("Accuracy without stop words removed:", accuracy_nostopwords)


Effects of stop words on Word Mover's Distance (WMD):
Mean WMD without stop words: inf
Mean WMD with stop words removed: inf

Effect of stop words on logistic regression classifier accuracy:
Accuracy with stop words removed: 0.8992576882290562
Accuracy without stop words removed: 0.8907741251325557


The "inf" values for Word Movers Distance mean that for some document pairs, the distance between them is so large that it's considered infinite. . 

As for the logistic regression classifier, when we remove stop words, it slightly boosts the accuracy. With stop words removed, the classifier gets around 90% of the predictions correct, compared to about 89% when we keep stop words. This shows that taking out the less important words helps the classifier focus better on the main ideas in the text, leading to a bit better performance.

#### Just trying to observe Some different outputs and doing further analysis

In [10]:
from sklearn.metrics import accuracy_score, classification_report


# Load 20 newsgroups dataset
twenty_users = fetch_20newsgroups()

# Define stop words
stop_words = set(gensim.parsing.preprocessing.STOPWORDS)

# Function to remove stop words and tokenize
def preprocess(text, remove_stopwords=True):
    if remove_stopwords:
        return ' '.join([word for word in gensim.utils.simple_preprocess(text) if word not in stop_words])
    else:
        return ' '.join(gensim.utils.simple_preprocess(text))

# Preprocess documents with stop words removed
preprocessed_docs_stop = [preprocess(doc) for doc in twenty_users.data]
preprocessed_docs_nostop = [preprocess(doc, remove_stopwords=False) for doc in twenty_users.data]

# Inspect a sample of preprocessed documents
print("Preprocessed document with stop words removed:")
print(preprocessed_docs_stop[0])
print("\nPreprocessed document without stop words removed:")
print(preprocessed_docs_nostop[0])


Preprocessed document with stop words removed:
lerxst wam umd edu thing subject car nntp posting host rac wam umd edu organization university maryland college park lines wondering enlighten car saw day door sports car looked late early called bricklin doors small addition bumper separate rest body know tellme model engine specs years production car history info funky looking car mail thanks il brought neighborhood lerxst

Preprocessed document without stop words removed:
from lerxst wam umd edu where my thing subject what car is this nntp posting host rac wam umd edu organization university of maryland college park lines was wondering if anyone out there could enlighten me on this car saw the other day it was door sports car looked to be from the late early it was called bricklin the doors were really small in addition the front bumper was separate from the rest of the body this is all know if anyone can tellme model name engine specs years of production where this car is made history 

In [11]:
# Vectorize preprocessed documents
vectorizer_stop = TfidfVectorizer()
X_stop = vectorizer_stop.fit_transform(preprocessed_docs_stop)

# Training the  logistic regression classifier with stop words removed
trainX_stop, testX_stop, trainY_stop, testY_stop = train_test_split(X_stop, twenty_users.target)
lr_stopwords = LogisticRegression(max_iter=500)
lr_stopwords.fit(trainX_stop, trainY_stop)
predictions_stop = lr_stopwords.predict(testX_stop)

# Calculating cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Calculating cosine similarity matrix for documents with stop words removed
cosine_sim_stop = cosine_similarity(X_stop)

# Printing cosine similarity matrix
print("Cosine Similarity Matrix (with stop words removed):")
print(cosine_sim_stop)

# Analyzing classifier performance
accuracy_stopwords = accuracy_score(testY_stop, predictions_stop)
report_stopwords = classification_report(testY_stop, predictions_stop, target_names=twenty_users.target_names)

print("\nClassifier Performance (with stop words removed):")
print("Accuracy:", accuracy_stopwords)
print("Classification Report:\n", report_stopwords)


Cosine Similarity Matrix (with stop words removed):
[[1.00000000e+00 1.55563277e-02 3.31727749e-02 ... 3.48750542e-03
  7.75473924e-03 2.00315282e-02]
 [1.55563277e-02 1.00000000e+00 2.79741207e-02 ... 5.66351706e-02
  5.81717547e-02 1.92273151e-02]
 [3.31727749e-02 2.79741207e-02 1.00000000e+00 ... 1.64039422e-03
  1.11668738e-02 1.39015404e-02]
 ...
 [3.48750542e-03 5.66351706e-02 1.64039422e-03 ... 1.00000000e+00
  2.60415660e-03 7.69593259e-04]
 [7.75473924e-03 5.81717547e-02 1.11668738e-02 ... 2.60415660e-03
  1.00000000e+00 5.77604860e-03]
 [2.00315282e-02 1.92273151e-02 1.39015404e-02 ... 7.69593259e-04
  5.77604860e-03 1.00000000e+00]]

Classifier Performance (with stop words removed):
Accuracy: 0.9042064333686816
Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.93      0.94      0.94       122
           comp.graphics       0.71      0.85      0.78       141
 comp.os.ms-windows.misc       0.84      0.88