In [1]:
import nltk
nltk.download("brown")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

#1

In [8]:
import gensim
from nltk.corpus import brown
import random

def read_corpus(data, tokens_only=False):
    for i, doc_id in enumerate(data):
        words = brown.words(fileids=doc_id)
        tokens = gensim.utils.simple_preprocess(' '.join(words))
        if tokens_only:
            yield tokens
        else:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

categories = brown.categories()

train_corpus = [fileids for category in categories for fileids in brown.fileids(categories=category)]
random.shuffle(train_corpus)
test_corpus = train_corpus[:len(train_corpus)//10]
train_corpus = train_corpus[len(train_corpus)//10:]

model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
train_corpus_tagged = list(read_corpus(train_corpus))
model.build_vocab(train_corpus_tagged)
model.train(train_corpus_tagged, total_examples=model.corpus_count, epochs=model.epochs)

def classify_documents(model, test_corpus):
    for doc_id in test_corpus:
        words = brown.words(fileids=doc_id)
        test_tokens = gensim.utils.simple_preprocess(' '.join(words))
        inferred_vector = model.infer_vector(test_tokens)
        similar_docs = model.docvecs.most_similar([inferred_vector], topn=3)
        print(f"Document: {doc_id}, Category: {brown.categories(fileids=doc_id)[0]}")
        print(f"Most similar documents:")
        for sim_doc_id, similarity in similar_docs:
            print(f"- Document ID: {train_corpus[sim_doc_id]}, Similarity: {similarity}")

classify_documents(model, test_corpus)


  similar_docs = model.docvecs.most_similar([inferred_vector], topn=3)


Document: cb21, Category: editorial
Most similar documents:
- Document ID: cd11, Similarity: 0.67829829454422
- Document ID: cb20, Similarity: 0.6379902958869934
- Document ID: cb02, Similarity: 0.6181785464286804
Document: cb11, Category: editorial
Most similar documents:
- Document ID: ca37, Similarity: 0.7319857478141785
- Document ID: cb20, Similarity: 0.6810892820358276
- Document ID: cb25, Similarity: 0.6796932816505432
Document: cp24, Category: romance
Most similar documents:
- Document ID: ca13, Similarity: 0.7695900797843933
- Document ID: ca15, Similarity: 0.7195014357566833
- Document ID: ca39, Similarity: 0.7142265439033508
Document: cf15, Category: lore
Most similar documents:
- Document ID: cd10, Similarity: 0.7316956520080566
- Document ID: cj23, Similarity: 0.68105149269104
- Document ID: cd02, Similarity: 0.6486451625823975
Document: ce28, Category: hobbies
Most similar documents:
- Document ID: ch27, Similarity: 0.7422364354133606
- Document ID: ce30, Similarity: 0.72

#2

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from nltk.corpus import names, movie_reviews
import nltk
import numpy as np
import pandas as pd

In [4]:
twenty_users = datasets.fetch_20newsgroups()
df = pd.DataFrame({'doc': twenty_users.data, 'target': twenty_users.target})

In [5]:
vectorizer = TfidfVectorizer(stop_words='english',smooth_idf=True)
input_matrix = vectorizer.fit_transform(df['doc'])

In [6]:
cosine_similarity(input_matrix)

array([[1.00000000e+00, 1.51285322e-02, 3.34308848e-02, ...,
        3.36092339e-03, 7.11948644e-03, 2.44369968e-02],
       [1.51285322e-02, 1.00000000e+00, 2.53275681e-02, ...,
        5.64478365e-02, 5.52317784e-02, 1.96872069e-02],
       [3.34308848e-02, 2.53275681e-02, 1.00000000e+00, ...,
        2.89641130e-03, 1.42140914e-02, 1.28258420e-02],
       ...,
       [3.36092339e-03, 5.64478365e-02, 2.89641130e-03, ...,
        1.00000000e+00, 2.45098179e-03, 7.81108150e-04],
       [7.11948644e-03, 5.52317784e-02, 1.42140914e-02, ...,
        2.45098179e-03, 1.00000000e+00, 5.58501637e-03],
       [2.44369968e-02, 1.96872069e-02, 1.28258420e-02, ...,
        7.81108150e-04, 5.58501637e-03, 1.00000000e+00]])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(input_matrix, df['target'], test_size=0.2, random_state=42)

logreg_classifier = LogisticRegression()
logreg_classifier.fit(X_train, y_train)


In [10]:
train_accuracy = logreg_classifier.score(X_train, y_train)
test_accuracy = logreg_classifier.score(X_test, y_test)

In [11]:
print(train_accuracy)
print(test_accuracy)

0.98342724560822
0.8983650022094565


Model with using the stop words(from Homework 4)

In [13]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
traindf, testdf = train_test_split(df)

from nltk.tokenize import word_tokenize
import nltk

distribution = nltk.FreqDist(word for doc in traindf['doc'] for word in word_tokenize(doc))
total_words = len(traindf['doc'])
words975 = [word for word, freq in distribution.items() if freq / total_words >= 0.975 or freq / total_words <= 0.025]

vectorizer = TfidfVectorizer(stop_words=words975)
X_train = vectorizer.fit_transform(traindf['doc'])
X_test = vectorizer.transform(testdf['doc'])

cos_sim_matrix = cosine_similarity(X_test)




In [18]:
print(cos_sim_matrix )

[[1.         0.04719337 0.12048388 ... 0.14880153 0.03393434 0.05006195]
 [0.04719337 1.         0.04008373 ... 0.08855366 0.02552764 0.03360864]
 [0.12048388 0.04008373 1.         ... 0.08296879 0.01688664 0.03254581]
 ...
 [0.14880153 0.08855366 0.08296879 ... 1.         0.03285317 0.02945898]
 [0.03393434 0.02552764 0.01688664 ... 0.03285317 1.         0.01301553]
 [0.05006195 0.03360864 0.03254581 ... 0.02945898 0.01301553 1.        ]]


In [16]:
classifier = LogisticRegression()
classifier.fit(X_train, traindf['target'])

train_accuracy = accuracy_score(traindf['target'], classifier.predict(X_train))
print(train_accuracy)

test_accuracy = accuracy_score(testdf['target'], classifier.predict(X_test))
print(test_accuracy)

0.9428403064230996
0.8229056203605515


The test accuracy with the removal of stop words comes as 0.89, which is higher than the testing accuracy that came up as 0.82 when the stop words were added.

The word model distance can be seen from the cosine_similarity matrix from both cases.

The values tend to be lower in case of stop words removal, as the stop words occur in documents, so the removal of them essentially improves the accuracy by focusing more on the meaningful content of the documents.