In [1]:
import json
from collections import Counter
from pathlib import Path

import joblib
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the labelled news articles from the Excel workbook using the openpyxl engine.
df = pd.read_excel("../data/news_categories.xlsx", engine="openpyxl")

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(160, 5)

In [4]:
# Peek at the first rows to check the columns were parsed as expected.
df.head()

Unnamed: 0,Title,Content,Link,Photo,Category
0,Bitcoin battles whales above $22K as BTC price...,Bitcoin bulls are now attacking the “final” ma...,https://cointelegraph.com/news/bitcoin-battles...,https://ibb.co/Mp2t3t0,Bitcoin
1,How to Buy Bitcoin: A Guide to Investing in th...,One minute Bitcoin’s price is up. The next it’...,https://www.bloomberg.com/news/articles/2020-1...,https://ibb.co/NtkBVP3,Bitcoin
2,More Than Half Of All Bitcoin Trades Are Fake,A new Forbes analysis of 157 crypto exchanges ...,https://www.forbes.com/sites/javierpaz/2022/08...,https://ibb.co/27FGtgP,Bitcoin
3,Bitcoin hits 3-week high as trader says 'all s...,Bitcoin (BTC) kept grinding higher at the Sept...,https://cointelegraph.com/news/bitcoin-hits-3-...,https://ibb.co/bLKdXLt,Bitcoin
4,How Much Does It Cost to Mine 1 Bitcoin?,JPMorgan strategists may have just called Bitc...,https://decrypt.co/105251/how-much-does-cost-m...,https://ibb.co/bQBgWKC,Bitcoin


In [5]:
# Peek at the last rows as well.
df.tail()

Unnamed: 0,Title,Content,Link,Photo,Category
155,"""Vitalik Buterin Scam"" Hits Verified Account o...",The closer we get to the transition of Ethereu...,https://u.today/vitalik-buterin-scam-hits-veri...,https://ibb.co/jW3HcWk,Others
156,Crypto Tracer Elliptic Hires SaaS Veteran Jack...,Blockchain analytics company Elliptic on Thurs...,https://www.coindesk.com/business/2022/09/08/c...,https://ibb.co/Qn3L7ph,Others
157,Former Deutsche Bank Executive and Ex-OCC Chie...,The former head of the Office of the Comptroll...,https://dailyhodl.com/2022/09/07/former-deutsc...,https://ibb.co/PspjKTq,Others
158,Russian Millionaire’s Startup Plans Ruble Stab...,"Alexander Lebedev, the former owner of Russia’...",https://www.coindesk.com/business/2022/09/14/r...,https://ibb.co/p0hFfQ4,Others
159,"Celsius Resembled Ponzi Scheme at Times, Vermo...",Crypto lender Celsius Network misled investors...,https://www.coindesk.com/policy/2022/09/07/cel...,https://ibb.co/kVTKKT3,Others


In [6]:
# Count missing values per column.
df.isnull().sum()

Title       0
Content     0
Link        0
Photo       0
Category    0
dtype: int64

In [7]:
def _parse_categories(cats):
    """Turn a raw category cell into a sorted list of label names.

    Parameters
    ----------
    cats : str or iterable of str
        Either the raw comma-separated string from the spreadsheet or an
        already-parsed collection of labels (which is what the column holds
        once this cell has been executed).

    Returns
    -------
    list of str
        Whitespace-stripped labels in sorted order; blank entries produced by
        stray or trailing commas are dropped.
    """
    # Accepting an already-parsed value keeps the cell safe to re-run: the
    # column is overwritten in place, so a second run sees lists, not strings.
    parts = cats.split(",") if isinstance(cats, str) else cats
    return sorted(label for label in (part.strip() for part in parts) if label)


df["Category"] = df["Category"].apply(_parse_categories)

In [8]:
# How often each distinct label combination occurs; lists are converted to
# tuples so the combinations can be counted.
df["Category"].map(lambda labels: tuple(sorted(labels))).value_counts()

(Bitcoin,)                      20
(Others,)                       19
(Gaming,)                       18
(Regulations,)                  18
(NFT,)                          18
(Blockchain,)                   16
(Defi,)                         15
(Altcoins, Blockchain)          13
(Altcoins,)                     11
(Blockchain, Defi)               2
(Blockchain, NFT)                2
(Altcoins, Blockchain, Defi)     1
(Altcoins, Defi, NFT)            1
(Altcoins, Defi)                 1
(Altcoins, Gaming)               1
(Blockchain, Gaming)             1
(Gaming, NFT)                    1
(Bitcoin, Regulations)           1
(Altcoins, Others)               1
Name: Category, dtype: int64

In [9]:
# Per-label totals: walk every article's label list and tally each label.
Counter(
    label
    for labels in df["Category"]
    for label in labels
).most_common()

[('Blockchain', 35),
 ('Altcoins', 29),
 ('NFT', 22),
 ('Bitcoin', 21),
 ('Gaming', 21),
 ('Defi', 20),
 ('Others', 20),
 ('Regulations', 19)]

---

In [10]:
# One row per (article, label) pair: each entry of a row's Category list gets
# its own row, with the other columns repeated.
df_explode = df.explode("Category").copy()

In [11]:
# Row count now equals the total number of (article, label) pairs.
df_explode.shape

(187, 5)

In [12]:
# Category now holds a single label per row instead of a list.
df_explode.head()

Unnamed: 0,Title,Content,Link,Photo,Category
0,Bitcoin battles whales above $22K as BTC price...,Bitcoin bulls are now attacking the “final” ma...,https://cointelegraph.com/news/bitcoin-battles...,https://ibb.co/Mp2t3t0,Bitcoin
1,How to Buy Bitcoin: A Guide to Investing in th...,One minute Bitcoin’s price is up. The next it’...,https://www.bloomberg.com/news/articles/2020-1...,https://ibb.co/NtkBVP3,Bitcoin
2,More Than Half Of All Bitcoin Trades Are Fake,A new Forbes analysis of 157 crypto exchanges ...,https://www.forbes.com/sites/javierpaz/2022/08...,https://ibb.co/27FGtgP,Bitcoin
3,Bitcoin hits 3-week high as trader says 'all s...,Bitcoin (BTC) kept grinding higher at the Sept...,https://cointelegraph.com/news/bitcoin-hits-3-...,https://ibb.co/bLKdXLt,Bitcoin
4,How Much Does It Cost to Mine 1 Bitcoin?,JPMorgan strategists may have just called Bitc...,https://decrypt.co/105251/how-much-does-cost-m...,https://ibb.co/bQBgWKC,Bitcoin


In [13]:
# Naive rule-based baseline: does the label name itself appear
# (case-insensitively) anywhere in the article title?
rule_categories_match = pd.Series(
    [
        label.lower() in title.lower()
        for label, title in zip(df_explode["Category"], df_explode["Title"])
    ],
    index=df_explode.index,
    name="Match",
)

In [14]:
# Share of (article, label) pairs where the label name occurs in the title.
rule_categories_match.mean()

0.3155080213903743

In [15]:
# Break the title-match rule down per label to see which labels it works for.
pd.crosstab(df_explode["Category"], rule_categories_match)

Match,False,True
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Altcoins,29,0
Bitcoin,0,21
Blockchain,32,3
Defi,8,12
Gaming,20,1
NFT,2,20
Others,20,0
Regulations,17,2


In [16]:
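# TODO: unfinished experiment — match titles against shorter keyword forms
# instead of the exact category names. As sketched below, the check is
# case-sensitive (the title is not lower-cased while the keywords are) and
# the result of `apply` is discarded.
# for cat in [
#     "blockchain",
#     "bitcoin",
#     "altcoin",
#     "nft",
#     "defi",
#     "game",
#     "regulation",
# ]:
#     df["Title"].apply(
#         lambda x: cat in x
#     )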

---

In [17]:
# Hold out 30% of the articles for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): stratification (`stratify=df["Category"]`) is left disabled —
# the labels are lists here, so it presumably cannot be used as-is; confirm
# before re-enabling.
X_train, X_test, y_train, y_test = train_test_split(
    df["Title"],
    df["Category"],
    random_state=42,
    test_size=0.3,
)

In [18]:
# Learn the label vocabulary from the training targets only, then encode both
# splits as binary indicator matrices (one column per label).
mlb = MultiLabelBinarizer().fit(y_train)

y_train_mlb = mlb.transform(y_train)
y_test_mlb = mlb.transform(y_test)

In [19]:
# Sanity check: (samples, labels) for the train and test indicator matrices.
y_train_mlb.shape, y_test_mlb.shape

((112, 8), (48, 8))

In [20]:
tokenizer = word_tokenize
stemmer = PorterStemmer()


def tokenize(text):
    """Split ``text`` with ``tokenizer`` and return the stem of each token, in order.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list
        One ``stemmer.stem(...)`` result per token produced by ``tokenizer``.
    """
    return [stemmer.stem(token) for token in tokenizer(text)]

In [21]:
# Title classifier: TF-IDF features feeding a logistic regression wrapped in
# MultiOutputClassifier so that every label column gets its own estimator.
model = Pipeline(
    steps=[
        (
            "tf-idf",
            # The custom stemming tokenizer is currently switched off
            # (would be passed as tokenizer=tokenize).
            TfidfVectorizer(min_df=0.01),
        ),
        (
            "logreg",
            MultiOutputClassifier(
                estimator=LogisticRegression(
                    C=0.1,
                    class_weight="balanced",
                ),
            ),
        ),
    ]
)

In [22]:
# Fit the whole pipeline on the training titles against the binarized labels.
model.fit(X_train, y_train_mlb)

Pipeline(steps=[('tf-idf', TfidfVectorizer(min_df=0.01)),
                ('logreg',
                 MultiOutputClassifier(estimator=LogisticRegression(C=0.1,
                                                                    class_weight='balanced')))])

In [23]:
# Predict on both splits so in-sample and held-out scores can be compared.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

In [24]:
# Per-label precision / recall / F1 on the training split (in-sample scores).
print(
    classification_report(
        y_train_mlb,
        y_train_pred,
        zero_division=0,
        target_names=mlb.classes_,
    )
)

              precision    recall  f1-score   support

    Altcoins       0.75      1.00      0.86        18
     Bitcoin       0.86      0.92      0.89        13
  Blockchain       0.81      0.88      0.84        24
        Defi       0.88      1.00      0.94        15
      Gaming       0.93      0.93      0.93        15
         NFT       0.92      0.79      0.85        14
      Others       0.74      0.93      0.82        15
 Regulations       0.76      0.93      0.84        14

   micro avg       0.82      0.92      0.87       128
   macro avg       0.83      0.92      0.87       128
weighted avg       0.83      0.92      0.87       128
 samples avg       0.84      0.92      0.87       128



In [25]:
# Per-label precision / recall / F1 on the held-out test split.
print(
    classification_report(
        y_test_mlb,
        y_test_pred,
        zero_division=0,
        target_names=mlb.classes_,
    )
)

              precision    recall  f1-score   support

    Altcoins       0.44      0.36      0.40        11
     Bitcoin       1.00      1.00      1.00         8
  Blockchain       0.70      0.64      0.67        11
        Defi       1.00      0.80      0.89         5
      Gaming       1.00      0.33      0.50         6
         NFT       0.89      1.00      0.94         8
      Others       0.25      0.60      0.35         5
 Regulations       0.67      0.40      0.50         5

   micro avg       0.67      0.64      0.66        59
   macro avg       0.74      0.64      0.66        59
weighted avg       0.73      0.64      0.66        59
 samples avg       0.65      0.71      0.66        59



In [26]:
# One 2x2 confusion matrix per label column of the test split; scikit-learn
# lays each one out as [[TN, FP], [FN, TP]].
multilabel_confusion_matrix(y_test_mlb, y_test_pred)

array([[[32,  5],
        [ 7,  4]],

       [[40,  0],
        [ 0,  8]],

       [[34,  3],
        [ 4,  7]],

       [[43,  0],
        [ 1,  4]],

       [[42,  0],
        [ 4,  2]],

       [[39,  1],
        [ 0,  8]],

       [[34,  9],
        [ 2,  3]],

       [[42,  1],
        [ 3,  2]]])

---

In [27]:
# Persist the fitted pipeline together with the label binarizer: the
# binarizer is needed to map predicted indicator columns back to label names.
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first; otherwise a run on a fresh checkout fails here.
Path("model").mkdir(parents=True, exist_ok=True)

joblib.dump(model, "model/model_categorizer.joblib")
joblib.dump(mlb, "model/multilabel_binarizer.joblib")

['model/multilabel_binarizer.joblib']