In [1]:
import re, pandas as pd, numpy as np, requests, bs4, matplotlib.pyplot as plt
import wordcloud, nltk
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score, precision_score, accuracy_score
import wordcloud
import text_mining_utils as tm
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering 
from sklearn.feature_selection import f_classif


# Make sure the NLTK 'stopwords' corpus is available locally; it is read further
# down through nltk.corpus.stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fuad9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the labelled articles (one row per document, with `Article` text and a
# `Class` label) from the CSV that sits next to the notebook.
corpus = pd.read_csv(filepath_or_buffer='corpus.csv')

# Bare expression so the notebook renders the frame.
corpus

Unnamed: 0,Article,Class
0,\n \n NBA's Top 100 Players for ...,NBA
1,Rare ComicsTop 50 Marvel Comics Of 2015Top 50 ...,Marvel
2,LibraryLibraryComics and Graphic Novels\nColum...,DC
3,NBA Top 100 Players 2021-2022 | Ranking The Be...,NBA
4,Best DC Comics of 2019Comic Book HeraldA Comic...,DC
...,...,...
85,screenrant.comThe Best DC Comics Miniseries of...,DC
86,30 Best Marvel Comics (2022 UPDATED) Definitiv...,Marvel
87,Stephen Curry Ranks as NBA’s Most-Liked Player...,NBA
88,Best DC Comics storiesGamesRadar+ is supported...,DC


In [3]:
# Shared evaluation setup reused by every experiment below:
# the target labels and a single seeded decision tree.
y = corpus["Class"]
dt_clf = DecisionTreeClassifier(random_state=1)

### Text preprocessing tasks: What preprocessing tasks are the most suitable for your data? Choose at least 3 tasks based on your findings from data understanding and discuss why they might be suitable. Document and discuss the incremental performance after applying each technique to the 3 matrices (count, TF and TF-IDF) and decide whether it should be included in the final pipeline (justify your decisions).

In [4]:
# Initial cleaning of data: run tm.clean_doc over every article and keep the
# result in a new frame, so the raw `corpus` stays untouched.
clean_data = corpus.assign(Article=corpus["Article"].apply(tm.clean_doc))

# Bare expression so the notebook renders the cleaned frame.
clean_data

Unnamed: 0,Article,Class
0,NBA's Top 100 Players for 2019-20: Ranking th...,NBA
1,Rare ComicsTop 50 Marvel Comics Of 2015Top 50 ...,Marvel
2,LibraryLibraryComics and Graphic Novels Columb...,DC
3,NBA Top 100 Players 2021-2022 | Ranking The Be...,NBA
4,Best DC Comics of 2019Comic Book HeraldA Comic...,DC
...,...,...
85,screenrant.comThe Best DC Comics Miniseries of...,DC
86,30 Best Marvel Comics Definitive GuideFree U.S...,Marvel
87,Stephen Curry Ranks as NBA’s Most-Liked Player...,NBA
88,Best DC Comics storiesGamesRadar+ is supported...,DC


In [5]:
# Baseline (count weighting): document-term matrix built from the cleaned articles.
clean_count_matrix = tm.build_count_matrix(clean_data["Article"].tolist())

# Cross-validate the shared decision tree on it; with print_=True the helper
# reports accuracy and macro precision/recall (see the cell output).
tm.crossvalidate_model(dt_clf, clean_count_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after cleaning:", clean_count_matrix.shape[1])

Accuracy: 0.88
Precision macro: 0.89
Recall macro: 0.88
No. of terms after cleaning: 22478


In [6]:
# Baseline (TF weighting): same cleaned articles, term-frequency matrix.
clean_tf_matrix = tm.build_tf_matrix(clean_data["Article"].tolist())

# Same evaluation protocol as for the count matrix.
tm.crossvalidate_model(dt_clf, clean_tf_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after cleaning:", clean_tf_matrix.shape[1])

Accuracy: 0.91
Precision macro: 0.91
Recall macro: 0.91
No. of terms after cleaning: 22478


In [7]:
# Baseline (TF-IDF weighting): same cleaned articles, TF-IDF matrix.
clean_tfidf_matrix = tm.build_tfidf_matrix(clean_data["Article"].tolist())

# Same evaluation protocol as for the count and TF matrices.
tm.crossvalidate_model(dt_clf, clean_tfidf_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after cleaning:", clean_tfidf_matrix.shape[1])

Accuracy: 0.91
Precision macro: 0.91
Recall macro: 0.91
No. of terms after cleaning: 22478


In [8]:
# Stop words removal
# NLTK's general-purpose English stop-word list.
universal_sw = nltk.corpus.stopwords.words('english')

# Working copy of the cleaned corpus that the following experiments start from.
improved_data = clean_data.copy(deep=True)

print(universal_sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# Universal stop words: strip the NLTK English list from every article,
# writing the result into a separate frame (improved_data is not modified).
swr_u_data = improved_data.assign(
    Article=improved_data["Article"].apply(tm.remove_sw, sw=universal_sw)
)

# Count-weighted matrix on the stop-word-free text, evaluated with the shared tree.
swr_u_count_matrix = tm.build_count_matrix(swr_u_data["Article"].tolist())
tm.crossvalidate_model(dt_clf, swr_u_count_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after removal:", swr_u_count_matrix.shape[1])

Accuracy: 0.92
Precision macro: 0.93
Recall macro: 0.92
No. of terms after removal: 22294


In [10]:
# Universal stop words, TF weighting.
swr_u_tf_matrix = tm.build_tf_matrix(swr_u_data["Article"].tolist())

# Same evaluation protocol as the other experiments.
tm.crossvalidate_model(dt_clf, swr_u_tf_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after removal:", swr_u_tf_matrix.shape[1])

Accuracy: 0.94
Precision macro: 0.95
Recall macro: 0.94
No. of terms after removal: 22294


In [11]:
# Universal stop words, TF-IDF weighting.
swr_u_tfidf_matrix = tm.build_tfidf_matrix(swr_u_data["Article"].tolist())

# Same evaluation protocol as the other experiments.
tm.crossvalidate_model(dt_clf, swr_u_tfidf_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after removal:", swr_u_tfidf_matrix.shape[1])

Accuracy: 0.94
Precision macro: 0.95
Recall macro: 0.94
No. of terms after removal: 22294


In [12]:
# Custom stop words: a short hand-picked list tried as an alternative to the
# full NLTK list.
custom_sw = [
    'the', 'of', 'and', 'to', 'in',
    'is', 'was', 'on', 's',
]

# Strip the custom list from every article into a separate frame
# (improved_data is not modified).
swr_c_data = improved_data.assign(
    Article=improved_data["Article"].apply(tm.remove_sw, sw=custom_sw)
)

# Count-weighted matrix on the filtered text, evaluated with the shared tree.
swr_c_count_matrix = tm.build_count_matrix(swr_c_data["Article"].tolist())
tm.crossvalidate_model(dt_clf, swr_c_count_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after removal:", swr_c_count_matrix.shape[1])

Accuracy: 0.94
Precision macro: 0.95
Recall macro: 0.94
No. of terms after removal: 22425


In [13]:
# Custom stop words, TF weighting.
swr_c_tf_matrix = tm.build_tf_matrix(swr_c_data["Article"].tolist())

# Same evaluation protocol as the other experiments.
tm.crossvalidate_model(dt_clf, swr_c_tf_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after removal:", swr_c_tf_matrix.shape[1])

Accuracy: 0.93
Precision macro: 0.94
Recall macro: 0.93
No. of terms after removal: 22425


In [14]:
# Custom stop words, TF-IDF weighting.
swr_c_tfidf_matrix = tm.build_tfidf_matrix(swr_c_data["Article"].tolist())

# Same evaluation protocol as the other experiments.
tm.crossvalidate_model(dt_clf, swr_c_tfidf_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after removal:", swr_c_tfidf_matrix.shape[1])

Accuracy: 0.93
Precision macro: 0.94
Recall macro: 0.93
No. of terms after removal: 22425


In [15]:
# Improving the BOW: collapse near-synonyms onto one canonical term.
# Each key is the canonical term; its value lists the variants handed to
# tm.improve_bow for that key.
# NOTE(review): tm.improve_bow is not visible here. If the variants are treated as
# regular expressions, "(s)" / "(es)" are mandatory groups (e.g. 'author(s)' would
# match "authors" but not "author") and "[-]" matches only a hyphen -- confirm
# whether optional forms such as "s?" were intended.
# NOTE(review): the replacements are applied to every article regardless of class
# (e.g. 'stories' -> 'comics' also rewrites NBA articles) -- confirm this is wanted.
repl_dictionary = {
    'comics': ['comic(s)[-]books', 'stories'],
    'superhero':['superheroes', 'hero(es)'],
    'writer': ['author(s)', 'creator(s)'],
    'NBA': ['league'],
    'team': ['franchise(s)'],
    'season': ['year']
}

# Derive the improved corpus from clean_data instead of rewriting improved_data in
# place. improved_data was created as a copy of clean_data, so a top-to-bottom run
# produces the same frame, while re-running this cell no longer feeds
# already-replaced text back through tm.improve_bow.
improved_data = clean_data.assign(
    Article=clean_data["Article"].apply(tm.improve_bow, replc_dict=repl_dictionary)
)

# Count-weighted matrix on the normalised text, evaluated with the shared tree.
improved_count_matrix = tm.build_count_matrix(list(improved_data.Article))

tm.crossvalidate_model(dt_clf, improved_count_matrix, y, print_=True)
print("No. of terms after improving the bow:", improved_count_matrix.shape[1])

Accuracy: 0.91
Precision macro: 0.92
Recall macro: 0.91
No. of terms after improving the bow: 22452


In [16]:
# Improved BOW, TF weighting.
improved_tf_matrix = tm.build_tf_matrix(improved_data["Article"].tolist())

# Same evaluation protocol as the other experiments.
tm.crossvalidate_model(dt_clf, improved_tf_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after improving the bow:", improved_tf_matrix.shape[1])

Accuracy: 0.90
Precision macro: 0.90
Recall macro: 0.90
No. of terms after improving the bow: 22452


In [17]:
# Improved BOW, TF-IDF weighting.
improved_tfidf_matrix = tm.build_tfidf_matrix(improved_data["Article"].tolist())

# Same evaluation protocol as the other experiments.
tm.crossvalidate_model(dt_clf, improved_tfidf_matrix, y, print_=True)

# Axis 1 of the matrix is reported as the number of terms.
print("No. of terms after improving the bow:", improved_tfidf_matrix.shape[1])

Accuracy: 0.90
Precision macro: 0.90
Recall macro: 0.90
No. of terms after improving the bow: 22452


### Algorithm-based feature selection/reduction tasks: Choose at least 2 techniques to try. Document and discuss the performance after each applied technique and decide which one to include in the final pipeline (justify your decision). Do the terms chosen by the algorithms as the most predictive concur with the terms you thought would be the best predictors from data understanding?

In [18]:
## Univariate Feature Selection
# Score the TF-IDF terms against the class labels with f_classif (ANOVA F-test)
# and keep the subset returned by tm.univariate_selection.
uni_data = improved_data.copy()
uni_tfidf_matrix = tm.build_tfidf_matrix(
    list(uni_data.Article))

# NOTE(review): the selection is fitted on all documents before cross-validation,
# so the CV scores below may be optimistic -- confirm whether that is acceptable
# or whether selection should be re-fitted inside each fold.
uni_reduced_tfidf_matrix = tm.univariate_selection(
    uni_tfidf_matrix, uni_data.Class, scheme=f_classif)
uni_reduced_tfidf_scores = tm.crossvalidate_model(
    dt_clf, uni_reduced_tfidf_matrix, y)

# Terms are the columns of the document-term matrix, so report shape[1] as every
# other cell does; shape[0] would print the number of documents instead.
print("No. of terms after applying anova feature selection:",
      uni_reduced_tfidf_matrix.shape[1])

('DC', 103.04304351383176)
('Batman', 89.88744180615656)
('Superman', 77.94561663486404)
('Wonder', 42.48320481800983)
('Marvel', 35.36081085733576)
('NBA', 34.926781841324456)
('Woman', 34.28310463091443)
('Captain', 28.253974264748138)
('Lantern', 26.819260183295523)
('Hulk', 21.54085049058878)
('comic', 21.52173577047774)
('America', 20.951227343357562)
('Spider-Man', 20.492595398935897)
('Comics', 19.348798873495035)
('Justice', 17.908332503163283)
('Green', 16.85510313042924)
('last', 16.54938577353703)
('X-Men', 16.214625664916795)
('Thor', 15.987526941577178)
('season', 15.921724733565263)
('Steel', 15.866293485277316)
('Gotham', 15.579147117217115)
('character', 15.06636697356377)
('Detective', 14.99177720496389)
('characters', 14.747154536937948)
('Kent', 14.493629154184667)
('Doctor', 14.382853611378424)
('Iron', 14.17445302612868)
('Avengers', 13.985853757999102)
('Strange', 13.757362245129684)
('players', 13.221265487024437)
('Flash', 13.161290543922677)
('book', 13.1456194

In [19]:
# RFE: recursive feature elimination driven by the shared decision tree.
rfe_data = improved_data.copy()
rfe_tfidf_matrix = tm.build_tfidf_matrix(rfe_data["Article"].tolist())

# n=100 and step=2 are forwarded to tm.rfe_selection.
# NOTE(review): presumably "keep 100 terms, drop 2 per round" -- confirm against
# the helper's definition.
rfe_reduced_tfidf_matrix = tm.rfe_selection(
    dt_clf,
    rfe_tfidf_matrix,
    y,
    n=100,
    step=2,
)

# Evaluate the same tree on the reduced matrix; the scores are kept, not printed.
rfe_tfidf_scores = tm.crossvalidate_model(dt_clf, rfe_reduced_tfidf_matrix, y)

# Axis 1 of the reduced matrix is reported as the number of surviving terms.
print("No. of terms after rfe:", rfe_reduced_tfidf_matrix.shape[1])

[('H.G', 11177), ('KID', 11177), ('tales', 11176), ('CASES', 11176), ('comics.Whether', 11175), ('COLT', 11175), ('humanity.There', 11174), ('CRIME', 11174), ('restoring', 11173), ('OUTLAW', 11173), ('affair', 11172), ('TRUE', 11172), ('distinctive', 11171), ('aimed', 11171), ('Zods', 11170), ('audience.Joe', 11170), ('Luthors', 11169), ('magazine', 11169), ('pitted', 11168), ('OFFICIAL', 11168), ('RankedSuperman', 11167), ('look-out', 11167), ('Origins', 11166), ('avenue', 11166), ('ideology.RELATED', 11165), ('MY', 11165), ('civilisations.The', 11164), ('slide', 11164), ('myths', 11163), ('ROMANCE', 11163), ('Aztec', 11162), ('letters', 11162), ('warrior', 11161), ('industry.By', 11161), ('pantheon.Wonder', 11160), ('boom', 11160), ('Universes', 11159), ('occurred', 11159), ('Loki', 11158), ('girls', 11158), ('Norse', 11157), ('mashed', 11157), ('legends', 11156), ('boys', 11156), ('further.Mythology', 11155), ('Westerns', 11155), ('Noir', 11154), ('COWBOY', 11154), ('character.From'