In [1]:
import re, pandas as pd, numpy as np, requests, bs4, matplotlib.pyplot as plt
import wordcloud, nltk
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, precision_score, accuracy_score
import wordcloud
import text_mining_utils as tm
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering 
from sklearn.feature_selection import f_classif


nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fuad9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
corpus = pd.read_csv('corpus.csv')
corpus

Unnamed: 0,Article,Class
0,\n \n NBA's Top 100 Players for ...,NBA
1,Rare ComicsTop 50 Marvel Comics Of 2015Top 50 ...,Marvel
2,LibraryLibraryComics and Graphic Novels\nColum...,DC
3,NBA Top 100 Players 2021-2022 | Ranking The Be...,NBA
4,Best DC Comics of 2019Comic Book HeraldA Comic...,DC
...,...,...
85,screenrant.comThe Best DC Comics Miniseries of...,DC
86,30 Best Marvel Comics (2022 UPDATED) Definitiv...,Marvel
87,Stephen Curry Ranks as NBA’s Most-Liked Player...,NBA
88,Best DC Comics storiesGamesRadar+ is supported...,DC


In [3]:
dt_clf = DecisionTreeClassifier(random_state=1)
y = corpus.Class

In [4]:
## Hyperparameter Tuning
params = {
    "criterion": ['gini', 'entropy'],
    "max_depth": range(3, 16),
    "min_samples_split": range(2, 16),
    "min_samples_leaf": range(3, 10),
    "min_impurity_decrease": [0.01, 0.02, 0.03, 0.04, 0.05]
}

In [5]:
## Baseline Count Matrix
documents = list(corpus.Article)
baseline_count_matrix = tm.build_count_matrix(documents)

baseline_count_scores = tm.crossvalidate_model(dt_clf, baseline_count_matrix, y, print_=True)

Accuracy: 0.90
Precision macro: 0.91
Recall macro: 0.90


In [7]:
## change the params of the DT to the optimal ones above
opt_baseline_count_clf = DecisionTreeClassifier(random_state=1,
                                      criterion='gini',
                                      max_depth=3,
                                      min_impurity_decrease=0.01,
                                      min_samples_split=2,
                                      min_samples_leaf=5)

## retrain and get performance
opt_baseline_count_scores = tm.crossvalidate_model(opt_baseline_count_clf,
                                         baseline_count_matrix,
                                         y)

Accuracy: 0.92
Precision macro: 0.93
Recall macro: 0.92


In [8]:
## Baseline TF Matrix
baseline_tf_matrix = tm.build_tf_matrix(documents)
tm.crossvalidate_model(dt_clf, baseline_tf_matrix, y, print_=True)

Accuracy: 0.93
Precision macro: 0.94
Recall macro: 0.93


(0.9333333333333333, 0.9390476190476191, 0.9333333333333333)

In [12]:
## change the params of the DT to the optimal ones above
opt_baseline_tf_clf = DecisionTreeClassifier(random_state=1,
                                      criterion='gini',
                                      max_depth=4,
                                      min_impurity_decrease=0.01,
                                      min_samples_split=2,
                                      min_samples_leaf=2)

## retrain and get performance
opt_baseline_tf_scores = tm.crossvalidate_model(opt_baseline_tf_clf,
                                         baseline_count_matrix,
                                         y)

Accuracy: 0.91
Precision macro: 0.92
Recall macro: 0.91


In [None]:
## Baseline TFIDF Matrix
baseline_tfidf_matrix = tm.build_tfidf_matrix(documents)
tm.crossvalidate_model(dt_clf, baseline_tfidf_matrix, y, print_=True)

In [4]:
#Initial cleaning of data
clean_data = corpus.copy()
clean_data.Article = clean_data.Article.apply(tm.clean_doc)
clean_data

Unnamed: 0,Article,Class
0,NBA's Top 100 Players for 2019-20: Ranking th...,NBA
1,Rare ComicsTop 50 Marvel Comics Of 2015Top 50 ...,Marvel
2,LibraryLibraryComics and Graphic Novels Columb...,DC
3,NBA Top 100 Players 2021-2022 | Ranking The Be...,NBA
4,Best DC Comics of 2019Comic Book HeraldA Comic...,DC
...,...,...
85,screenrant.comThe Best DC Comics Miniseries of...,DC
86,30 Best Marvel Comics Definitive GuideFree U.S...,Marvel
87,Stephen Curry Ranks as NBA’s Most-Liked Player...,NBA
88,Best DC Comics storiesGamesRadar+ is supported...,DC


In [8]:
clean_count_matrix = tm.build_count_matrix(list(clean_data.Article))
tm.crossvalidate_model(dt_clf, clean_count_matrix, y, print_=True)
print("No. of terms after cleaning:", clean_count_matrix.shape[1])

Accuracy: 0.88
Precision macro: 0.89
Recall macro: 0.88
No. of terms after cleaning: 22478


In [None]:
tm.search_optimal_params(dt_clf, clean_count_matrix,
                        y, params)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



In [5]:
improved_data = clean_data.copy()

In [14]:
# RFE
rfe_data = improved_data.copy()
rfe_tfidf_matrix = tm.build_tfidf_matrix(
    list(rfe_data.Article))
rfe_reduced_tfidf_matrix = tm.rfe_selection(
    dt_clf, rfe_tfidf_matrix, y, n=100, step=2)
rfe_tfidf_scores = tm.crossvalidate_model(
    dt_clf, rfe_reduced_tfidf_matrix, y)
print("No. of terms after rfe:", 
      rfe_reduced_tfidf_matrix.shape[1])

[('humanity.There', 11190), ('slide', 11190), ('restoring', 11189), ('OFFICIAL', 11189), ('affair', 11188), ('boom', 11188), ('distinctive', 11187), ('TRUE', 11187), ('Zods', 11186), ('CRIME', 11186), ('Luthors', 11185), ('CASES', 11185), ('pitted', 11184), ('girls', 11184), ('RankedSuperman', 11183), ('KID', 11183), ('Origins', 11182), ('boys', 11182), ('ideology.RELATED', 11181), ('COLT', 11181), ('civilisations.The', 11180), ('light-hearted', 11180), ('myths', 11179), ('OUTLAW', 11179), ('Aztec', 11178), ('aimed', 11178), ('warrior', 11177), ('audience.Joe', 11177), ('pantheon.Wonder', 11176), ('weary', 11176), ('Universes', 11175), ('magazine', 11175), ('Loki', 11174), ('balm', 11174), ('Norse', 11173), ('look-out', 11173), ('legends', 11172), ('sort-of', 11172), ('further.Mythology', 11171), ('avenue', 11171), ('Noir', 11170), ('MY', 11170), ('character.From', 11169), ('ROMANCE', 11169), ('therefore', 11168), ('voices', 11168), ('loves', 11167), ('letters', 11167), ('comics.Batman

In [15]:


tm.search_optimal_params(dt_clf, rfe_reduced_tfidf_matrix,
                        y, params)

({'criterion': 'gini',
  'max_depth': 3,
  'min_impurity_decrease': 0.01,
  'min_samples_leaf': 5,
  'min_samples_split': 2},
 0.9555555555555555)

In [17]:
## change the params of the DT to the optimal ones above
opt_tfidf_clf = DecisionTreeClassifier(random_state=1,
                                      criterion='gini',
                                      max_depth=3,
                                      min_impurity_decrease=0.01,
                                      min_samples_split=2,
                                      min_samples_leaf=5)

## retrain and get performance
opt_tfidf_scores = tm.crossvalidate_model(opt_tfidf_clf,
                                         rfe_reduced_tfidf_matrix,
                                         y)

Accuracy: 0.96
Precision macro: 0.96
Recall macro: 0.96


In [11]:
## Univariate Feature Selection
uni_data = improved_data.copy()
uni_tfidf_matrix = tm.build_tfidf_matrix(
    list(uni_data.Article))
uni_reduced_tfidf_matrix = tm.univariate_selection(
    uni_tfidf_matrix, uni_data.Class, scheme=f_classif)
uni_reduced_tfidf_scores = tm.crossvalidate_model(
    dt_clf, uni_reduced_tfidf_matrix, y)
print("No. of terms after applying anova feature selection:", 
      uni_reduced_tfidf_matrix.shape[0])

('DC', 103.04304351383176)
('Batman', 89.88744180615656)
('Superman', 77.94561663486404)
('Wonder', 42.48320481800983)
('Marvel', 35.36081085733576)
('NBA', 34.743470546656326)
('Woman', 34.28310463091443)
('Captain', 28.253974264748138)
('Lantern', 26.819260183295523)
('Hulk', 21.54085049058878)
('comic', 21.52173577047774)
('season', 21.131095662344254)
('America', 20.951227343357562)
('Spider-Man', 20.492595398935897)
('Comics', 19.348798873495035)
('Justice', 17.908332503163283)
('League', 17.13915578905292)
('Green', 16.85510313042924)
('last', 16.54938577353703)
('X-Men', 16.214625664916795)
('league', 16.183313187943046)
('Thor', 15.987526941577178)
('Steel', 15.866293485277316)
('Gotham', 15.579147117217115)
('character', 15.06636697356377)
('Detective', 14.99177720496389)
('characters', 14.747154536937948)
('Kent', 14.493629154184667)
('Doctor', 14.382853611378424)
('Iron', 14.17445302612868)
('Avengers', 13.985853757999102)
('Strange', 13.757362245129684)
('players', 13.22126

In [12]:
## Hyperparameter Tuning
params = {
    "criterion": ['gini', 'entropy'],
    "max_depth": range(3, 16),
    "min_samples_split": range(2, 16),
    "min_samples_leaf": range(3, 10),
    "min_impurity_decrease": [0.01, 0.02, 0.03, 0.04, 0.05]
}

tm.search_optimal_params(dt_clf, uni_reduced_tfidf_matrix,
                        y, params)

({'criterion': 'gini',
  'max_depth': 3,
  'min_impurity_decrease': 0.01,
  'min_samples_leaf': 3,
  'min_samples_split': 2},
 0.9222222222222223)

In [13]:
## change the params of the DT to the optimal ones above
opt_tfidf_clf = DecisionTreeClassifier(random_state=1,
                                      criterion='gini',
                                      max_depth=3,
                                      min_impurity_decrease=0.01,
                                      min_samples_split=2,
                                      min_samples_leaf=3)

## retrain and get performance
opt_tfidf_scores = tm.crossvalidate_model(opt_tfidf_clf,
                                         uni_reduced_tfidf_matrix,
                                         y)

Accuracy: 0.92
Precision macro: 0.93
Recall macro: 0.92
