In [12]:
try:
    import nltk
except ModuleNotFoundError:
    !pip install nltk
    
try:
    import numpy as np
except ModuleNotFoundError:
    !pip install numpy

In [13]:
## This code downloads the required packages.

nltk_packages = [
    ("reuters", "corpora/reuters.zip")
]

for pid, fid in nltk_packages:
    try:
        nltk.data.find(fid)
    except LookupError:
        nltk.download(pid)

[nltk_data] Downloading package reuters to C:\Users\Suraksha
[nltk_data]     Aithal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\reuters.zip.


## Setting up corpus

In [14]:
from nltk.corpus import reuters

## Setting up train/test data

In [15]:
train_documents, train_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('training/')])
test_documents, test_categories = zip(*[(reuters.raw(i), reuters.categories(i)) for i in reuters.fileids() if i.startswith('test/')])

In [16]:
all_categories = sorted(list(set(reuters.categories())))

In [18]:
#Tokenizing the documents and removing the stopwords

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


stop_words = set(stopwords.words("english"))

train_tokens = [nltk.word_tokenize(text) for text in train_documents]
test_tokens = [nltk.word_tokenize(text) for text in test_documents]



#converting the words in train_Set to lower case


all_words = []
for w in train_tokens:
    for i in w:
        all_words.append(i.lower())
 

    

#converting the words in test_set to lower case
all_words_test_set = []
for w in test_tokens:
    for i in w:
        all_words_test_set.append(i.lower())

        



In [19]:
 import string


#removing the stop_Words in train_set


filtered_sentence = []
for w in all_words:
        if w not in stop_words:
            filtered_sentence.append(w)
        

#removing the stop_words in test_set
filtered_sentence_test = []
for i in all_words_test_set:
        if i not in stop_words:
            filtered_sentence_test.append(i)




# removing punctuation from training sets

no_punc=[]
Punctuation = ['.',',','"','?',';',':','}','{','(',')','!','@','#',"''",'""',
                   "``","'s","&",">","<"]
for i in filtered_sentence:
    if i not in Punctuation:
        no_punc.append(i)
        


# removing punctuation from test set
no_punctuation=[]
Punctuation = ['.',',','"','?',';',':','}','{','(',')','!','@','#',"''",'""',
                   "``","'s","&",">","<"]
for i in filtered_sentence_test:
    if i not in Punctuation:
        no_punctuation.append(i)
        

    



In [20]:
#lemmatizing the train_set


lemmatizer = WordNetLemmatizer()
lemmatized =[]
lemmatized = [lemmatizer.lemmatize(w) for w in no_punc]




#lemmatizing the test_Set
lemmatized_test_set =[]
lemmatized_test_set = [lemmatizer.lemmatize(w) for w in no_punctuation]



In [21]:
word_feat=nltk.FreqDist(lemmatized)


In [22]:
#Feature extraction


word_features = [x for (x,y) in word_feat.most_common(2500)]
lemmas = []
for w in all_categories:
    lemmas = lemmatizer.lemmatize(w)
    word_features.append(lemmas)
    
    


   


In [23]:
#feature vector formation

X_train = [[1 if w in tokens else 0 for w in word_features] for tokens in train_tokens]
X_test  = [[1 if w in tokens else 0 for w in word_features] for tokens in test_tokens]

In [24]:
# Multi-label binarizing

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit(train_categories + test_categories)

print("These are the all categories from the MultiLabelBinarizer:\n{}".format(", ".join(mlb.classes_)))

example = mlb.transform([train_categories[6]])[0]
print("\nCategories: {}\nVector: {}".format(train_categories[6], example))
print("\nThe 0th entry represents the label '{}'".format(mlb.classes_[0]))


y_train = mlb.transform(train_categories)
y_test  = mlb.transform(test_categories)

These are the all categories from the MultiLabelBinarizer:
acq, alum, barley, bop, carcass, castor-oil, cocoa, coconut, coconut-oil, coffee, copper, copra-cake, corn, cotton, cotton-oil, cpi, cpu, crude, dfl, dlr, dmk, earn, fuel, gas, gnp, gold, grain, groundnut, groundnut-oil, heat, hog, housing, income, instal-debt, interest, ipi, iron-steel, jet, jobs, l-cattle, lead, lei, lin-oil, livestock, lumber, meal-feed, money-fx, money-supply, naphtha, nat-gas, nickel, nkr, nzdlr, oat, oilseed, orange, palladium, palm-oil, palmkernel, pet-chem, platinum, potato, propane, rand, rape-oil, rapeseed, reserves, retail, rice, rubber, rye, ship, silver, sorghum, soy-meal, soy-oil, soybean, strategic-metal, sugar, sun-meal, sun-oil, sunseed, tea, tin, trade, veg-oil, wheat, wpi, yen, zinc

Categories: ['acq', 'trade']
Vector: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 

In [25]:
#classifier training

from sklearn.tree import DecisionTreeClassifier

from collections import OrderedDict



In [26]:
clfs = OrderedDict()

for i, category in enumerate(all_categories):
    clf = DecisionTreeClassifier()
    
    # We train each classifier individually, so we must use
    # only 0 or 1 as y_train.
    y_train_clf = [yt[i] for yt in y_train]
    
    # .fit() will train the model with the training data
    clf.fit(X_train, y_train_clf)
    
    clfs[category] = clf

In [27]:
#classifer evaluation

y_pred = np.zeros((len(y_test), len(all_categories)))

for i, (cat, clf) in enumerate(clfs.items()):
    y_pred[:, i] = clf.predict(X_test)

In [28]:
from sklearn import metrics

In [29]:
print("Accuracy : {:.4f}".format(metrics.accuracy_score(y_test, y_pred)))
print("Precision: {:.4f}".format(metrics.precision_score(y_test, y_pred, average='micro')))
print("Recall   : {:.4f}".format(metrics.recall_score(y_test, y_pred, average='micro')))
print("F1-Score : {:.4f}".format(metrics.f1_score(y_test, y_pred, average='micro')))







Accuracy : 0.6101
Precision: 0.7421
Recall   : 0.7118
F1-Score : 0.7267


In [30]:
print(metrics.classification_report(y_true=y_test, y_pred=y_pred, target_names=mlb.classes_))






                 precision    recall  f1-score   support

            acq       0.84      0.75      0.79       719
           alum       0.55      0.48      0.51        23
         barley       0.60      0.43      0.50        14
            bop       0.61      0.47      0.53        30
        carcass       0.48      0.56      0.51        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       0.60      1.00      0.75        18
        coconut       0.50      0.50      0.50         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       0.77      0.86      0.81        28
         copper       0.80      0.67      0.73        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.71      0.73      0.72        56
         cotton       0.78      0.70      0.74        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.37      0.36      0.36        28
            cpu       0.00    

  'precision', 'predicted', average, warn_for)


In [32]:
#pipeline

example_text = "This example text should cover coconuts. But we chose bad features, so we have wrong labels."


example_tokens = nltk.word_tokenize(example_text)

filtered_sentence = []
for w in example_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)

no_punc=[]
Punctuation = ['.',',','"','?',';',':','}','{','(',')','!','@','#',"''",'""',"``","'s","&",">","<"]
for i in filtered_sentence:
    if i not in Punctuation:
        no_punc.append(i)


lemma = []
for w in no_punc:
    w = lemmatizer.lemmatize(w)
    lemma.append(w)

    
example_features = [[1 if w in lemma else 0 for w in word_features]]

example_preds = [clf.predict(example_features)[0] for clf in clfs.values()]

example_labels = mlb.inverse_transform(np.array([example_preds]))

print("Example text: {}".format(example_text))
print("Example labels: {}".format(example_labels))


Example text: This example text should cover coconuts. But we chose bad features, so we have wrong labels.
Example labels: [('coconut', 'oilseed')]
