(Preprocess Step 1) Download MA-Reuters

In [22]:
!cd /root/nltk_data/corpora/ && wget https://www.kde.cs.tut.ac.jp/~aono/data/ma_reuters.zip

--2023-02-13 12:50:51--  https://www.kde.cs.tut.ac.jp/~aono/data/ma_reuters.zip
Resolving www.kde.cs.tut.ac.jp (www.kde.cs.tut.ac.jp)... 133.15.24.10
Connecting to www.kde.cs.tut.ac.jp (www.kde.cs.tut.ac.jp)|133.15.24.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6369593 (6.1M) [application/zip]
Saving to: ‘ma_reuters.zip.1’


2023-02-13 12:50:54 (2.87 MB/s) - ‘ma_reuters.zip.1’ saved [6369593/6369593]



(Preprocess Step 2) Downloading NLTK data

In [None]:
import nltk

# Calling nltk.download() with no arguments opens the interactive downloader,
# which blocks "Restart & Run All". Download only the tokenizer data that
# word_tokenize needs instead.
# NOTE(review): newer NLTK releases appear to load "punkt_tab" rather than
# "punkt"; both are requested so either version finds its resource — confirm
# against the installed NLTK version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

(Preprocess Step 3) Installing Scikit-Learn

In [24]:
! pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


(Step 1) Sample Program & Modifying Category

In [25]:
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *

# Lazily open the MA-Reuters corpus: every file whose id matches
# "(training|test).*" is included, and cats.txt is passed as the category file.
ma_reuters = LazyCorpusLoader(
    'ma_reuters',
    CategorizedPlaintextCorpusReader,
    '(training|test).*',
    cat_file='cats.txt',
    encoding='ISO-8859-2',
)

# Every document id known to the corpus
documents = ma_reuters.fileids()
print (f"{len(documents)} total articles")

# Partition the ids into training / testing sets by their prefix
train_docs_id = [doc for doc in documents if doc.startswith("train")]
test_docs_id = [doc for doc in documents if doc.startswith("test")]
print (f"{len(train_docs_id)} training data")
print (f"{len(test_docs_id)} testing data")

# Raw text of each document, in the same order as the id lists
train_docs = list(map(ma_reuters.raw, train_docs_id))
test_docs = list(map(ma_reuters.raw, test_docs_id))

# Report how many categories the corpus defines, then list them
categories = ma_reuters.categories()
num_categories = len(categories)
print (num_categories, " categories")
print (categories)

10700 total articles
7713 training data
2987 testing data
55  categories
['acq', 'alum', 'barley', 'bop', 'carcass', 'cocoa', 'coffee', 'copper', 'corn', 'cotton', 'cpi', 'crude', 'dlr', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'hog', 'housing', 'interest', 'ipi', 'iron-steel', 'jobs', 'lead', 'livestock', 'meal-feed', 'money-fx', 'money-supply', 'nat-gas', 'oilseed', 'orange', 'palm-oil', 'pet-chem', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [27]:
# Look at one example document from the "cocoa" category.
# File ids of all documents labelled "cocoa"
category_docs = ma_reuters.fileids("cocoa")
# Pick the first of them ...
document_id = category_docs[0]
# ... and print its raw text
print (ma_reuters.raw(document_id))

COCOA EXPORTERS EXPECTED TO LIMIT SALES
  Major cocoa exporters are likely to
  limit sales in the weeks ahead in an effort to boost world
  prices, sources close to a meeting of the Cocoa Producers
  Alliance (CPA) said.
      The sources said the depressed world market had been one of
  the main topics discussed in a closed door meeting of the
  11-member CPA which began on Monday.
      They said producers agreed that cutting sales would aid the
  buffer stock manager of a new international cocoa pact in his
  effort to support prices.
      Major cocoa producing and consuming nations agreed
  operation rules for the buffer stock at a meeting in London
  last month and the stock manager is expected to enter the
  market soon.
      Prices, under the weight of three successive cocoa
  surpluses, recently fell to the level at which the manager has
  to buy cocoa under stock rules.
      The buffer stock aims to keep prices within a pre-set range
  by buying when prices fall and sellin

In [28]:
#Tokenization with NLTK, TF-IDF vectorizer with scikit-learn
from nltk import word_tokenize
import re # regular expression
 
def tokenize(text): # returning tokens
    """Split ``text`` into lower-cased word tokens for the TF-IDF vectorizer.

    The text is tokenized with NLTK's ``word_tokenize``. A token is kept only
    if it starts with at least one ASCII letter and is at least three
    characters long.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    min_length = 3
    # match() anchors at the start only, so a token merely has to *begin*
    # with a letter (e.g. "u.s." passes).
    starts_with_letters = re.compile('[a-zA-Z]+')

    filtered_tokens = []
    for raw_token in word_tokenize(text):
        token = raw_token.lower()
        if starts_with_letters.match(token) and len(token) >= min_length:
            filtered_tokens.append(token)
    return filtered_tokens

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer driven by the custom NLTK-based tokenizer defined above.
# token_pattern=None: the default regex pattern is ignored whenever a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" warning. The resulting features are unchanged.
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize,
                             token_pattern=None)
# Learn the vocabulary and IDF weights from the training documents only ...
vectorised_train_documents = vectorizer.fit_transform(train_docs)
# ... and project the test documents onto that same vocabulary.
vectorised_test_documents = vectorizer.transform(test_docs)
print("converted to TF-IF model")
print("training document dimension ：",vectorised_train_documents.shape)
print("testing document dimension：",vectorised_test_documents.shape)

converted to TF-IF model
training document dimension ： (7713, 26978)
testing document dimension： (2987, 26978)


In [32]:
#SVM classification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import jaccard_score
from sklearn.metrics import hamming_loss
import numpy as np

# Turn each document's list of categories into a binary indicator row.
# The binarizer is fitted on the training labels and re-used for the test labels.
mlb = MultiLabelBinarizer()

train_labels = mlb.fit_transform(
    [ma_reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform(
    [ma_reuters.categories(doc_id) for doc_id in test_docs_id])
# (Kiuchi's version of this code has one extra print at this point.)

# Multi-label classification: one linear SVM per category (one-vs-rest),
# trained on the TF-IDF features.
OVR_classifier = OneVsRestClassifier(LinearSVC(random_state=41))
OVR_classifier.fit(vectorised_train_documents, train_labels)
OVR_predictions = OVR_classifier.predict(vectorised_test_documents)

# Sample-averaged Jaccard coefficient, rounded to 3 decimals
print ("Jaccard coeeficients:"
    + str(np.round(jaccard_score(test_labels, OVR_predictions, average='samples'), 3)))

# Hamming loss, rounded to 3 decimals
print ("Hamming loss:"
    + str(np.round(hamming_loss(test_labels, OVR_predictions), 3)))

Jaccard coeeficients:0.86
Hamming loss:0.005


(Step 2) Evaluate The Category

In [33]:
# The following is Kiuchi's addition to the original SVM code cell.
# Report the categories with the highest and lowest per-category Jaccard score
# (all tied categories are listed).
print("Category: Jaccard Calculate score")

# average=None yields one score per label column of test_labels. Those columns
# follow the binarizer's own label order, so pair the scores with mlb.classes_
# rather than with a separately obtained category list.
per_category_scores = jaccard_score(test_labels, OVR_predictions, average=None)
label_names = mlb.classes_.tolist()

# Use dedicated names instead of rebinding the builtins `max` / `min`, which
# would otherwise stay shadowed for every later cell of the notebook.
best_score = per_category_scores.max()
worst_score = per_category_scores.min()
best_categories = [name for name, score in zip(label_names, per_category_scores)
                   if score == best_score]
worst_categories = [name for name, score in zip(label_names, per_category_scores)
                    if score == worst_score]

print(f"max: {best_categories} : {best_score}")
print(f"min: {worst_categories} : {worst_score}")

Category: Jaccard Calculate score
max: ['earn'] : 0.9690627843494085
min: ['lead', 'pet-chem', 'soy-oil', 'strategic-metal', 'yen'] : 0.0


(Step 3) Senpai Method: Doc2Vec document vectors + linear SVM

In [34]:
! pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
from gensim import models

def split_docs(docs: list):
    """Split every document on single spaces and newlines.

    Args:
        docs: list of document strings.

    Returns:
        list of list of str: one token list per document, with the empty
        strings produced by consecutive separators removed. Other whitespace
        (e.g. tabs) is not treated as a separator.
    """
    separator = re.compile('[ \n]')
    tokenised = []
    for text in docs:
        pieces = separator.split(text)
        tokenised.append([piece for piece in pieces if piece])
    return tokenised


class LabeledListSentence(object):
    """Iterable that pairs token lists with labels as gensim TaggedDocuments.

    Attributes are stored as given:
        words_list: sequence of token lists, one per document.
        labels: sequence indexable by position; ``labels[i]`` is the single
            tag attached to ``words_list[i]``.
    """

    def __init__(self, words_list, labels):
        self.labels = labels
        self.words_list = words_list

    def __iter__(self):
        """Yield one TaggedDocument per entry, tagged with its own label."""
        for position, tokens in enumerate(self.words_list):
            tag = self.labels[position]
            yield models.doc2vec.TaggedDocument(words=tokens, tags=[tag])

In [36]:
# Tokenise the raw documents on spaces/newlines for Doc2Vec
splited_train_docs = split_docs(train_docs)
splited_test_docs = split_docs(test_docs)

# Category names (as text) of every training document; print both lengths to
# confirm there is one label list per tokenised document
train_labels_text = list(map(ma_reuters.categories, train_docs_id))
print(len(train_labels_text), len(splited_train_docs))

# Each training document is tagged with its list of category names
train_sentences = [
    models.doc2vec.TaggedDocument(words=tokens, tags=doc_categories)
    for tokens, doc_categories in zip(splited_train_docs, train_labels_text)
]

# Train the Doc2Vec model and persist it to disk
model = models.Doc2Vec(
    documents=train_sentences,
    vector_size=2000,
    window=2,
    min_count=1,
    workers=4,
)
model.save('doc2vec.model')

7713 7713


In [37]:
from gensim import models

# Reload the trained Doc2Vec model saved by the previous cell
model = models.Doc2Vec.load('doc2vec.model')

# Infer one vector per tokenised document and stack the vectors into arrays
train_vec_list = [model.infer_vector(tokens) for tokens in splited_train_docs]
train_vec = np.array(train_vec_list)

test_vec_list = [model.infer_vector(tokens) for tokens in splited_test_docs]
test_vec = np.array(test_vec_list)

# Train a one-vs-rest linear SVM on the Doc2Vec vectors and predict the test set.
# Note: this rebinds OVR_classifier / OVR_predictions, so re-running the
# per-category evaluation cell afterwards scores these predictions instead of
# the TF-IDF ones.
OVR_classifier = OneVsRestClassifier(LinearSVC(random_state=41))
OVR_classifier.fit(train_vec, train_labels)

OVR_predictions = OVR_classifier.predict(test_vec)

# Sample-averaged Jaccard coefficient, rounded to 3 decimals
print ("Jaccard coeeficients:"
    + str(np.round(jaccard_score(test_labels, OVR_predictions, average='samples'), 3)))



Jaccard coeeficients:0.65
