In [1]:
from time import perf_counter
from time import time

import pandas as pd
from tqdm.auto import tqdm

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no cell visible in this part of the notebook calls them — confirm it is still needed.
tqdm.pandas()

In [2]:
def import_labelled_data(path="data/level-0.5/data.json", encoding="latin-1"):
    """Load a labelled-article JSON file into a DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file; forwarded unchanged to ``pd.read_json``.
    encoding : str, default "latin-1"
        Text encoding used to decode the file. The default keeps the
        behaviour this notebook has always had; pass ``"utf-8"`` for files
        that contain raw (non-escaped) UTF-8 characters.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, exactly as ``pd.read_json`` returns them.
    """
    return pd.read_json(path, encoding=encoding)



# Evaluation-sample settings. The seed is fixed so that "Restart & Run All"
# scores the same rows every time and timings/results stay comparable.
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42

print("Loading data...")

irrelevant_raw = import_labelled_data(path="../../data/level-0.5/irrelevant.json")

print("Data loaded.")

# `sample` already returns the drawn rows in random order, so a single call
# both draws and shuffles; a second `sample(frac=1)` adds nothing.
# The size is capped at the number of available rows because `sample` raises
# when asked for more rows than exist (without replacement).
test_dataset = (
    irrelevant_raw
    .sample(n=min(SAMPLE_SIZE, len(irrelevant_raw)), random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

test_dataset.info()
test_dataset.head()

Loading data...
Data loaded.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           5000 non-null   object
 1   text          5000 non-null   object
 2   relevance     5000 non-null   object
 3   multiclasses  5000 non-null   object
dtypes: object(4)
memory usage: 156.4+ KB


Unnamed: 0,url,text,relevance,multiclasses
0,https://medicinainterna.net.pe/sites/default/f...,Complementary Therapies in Medicine 78 (2023) ...,irrelevant,[]
1,https://mubs.ac.ug/docs/newsletters/October_20...,MUBS October Issue 2017 N E W S L E T T E R MU...,irrelevant,[]
2,https://tesis.pucp.edu.pe/repositorio/bitstrea...,TESIS PUCP Esta obra ha sido publicada bajo la...,irrelevant,[]
3,https://repositorio.unfv.edu.pe/bitstream/hand...,Vicerrectorado de 1 INVESTIGACIÓN ESCUELA UNIV...,irrelevant,[]
4,https://zooreach.org/downloads/ZOO_CAMP_PHVA_r...,Population and Habitat Viability Assessment (P...,irrelevant,[]


In [3]:
from pickle import load

# Resolved relative to the notebook's current working directory.
MODEL_PATH = 'models/LogisticRegression.pkl'

# SECURITY: unpickling can execute arbitrary code stored in the file.
# Only load model artefacts that were produced by a trusted source.
with open(MODEL_PATH, 'rb') as file:
    classifier = load(file)

# Fail here, with a clear message, rather than inside the timed prediction
# cell if the pickle does not contain a probabilistic classifier.
if not hasattr(classifier, 'predict_proba'):
    raise TypeError(
        f"{MODEL_PATH} holds a {type(classifier).__name__}, "
        "which has no predict_proba method"
    )

In [4]:
# Time only the prediction call. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time() follows the
# system wall clock and can jump if that clock is adjusted mid-run.
# `start`/`end` are therefore only meaningful as a difference (`end - start`).
start = perf_counter()

# NOTE(review): the whole frame (including the label columns) is passed in —
# presumably the pickled object is a pipeline that selects its own input
# column(s); confirm.
# NOTE(review): column 1 is taken as the probability of the "relevant" class —
# assumes `classifier.classes_[1]` is that class; confirm.
probabilities = classifier.predict_proba(test_dataset)[:, 1]

end = perf_counter()

In [5]:
# Summarise the timing of the prediction cell: total duration and throughput.
elapsed = end - start
n_articles = len(test_dataset)

print(f'\n\nPrediction time on {n_articles} articles: ', elapsed, ' seconds')
print(f'\nFiles processed per second: {n_articles / elapsed}')



Prediction time on 5000 articles:  178.1938190460205  seconds

Files processed per second: 28.059334643412605


In [6]:
# Attach the model score to every row, turn it into a label using a 0.5
# cut-off (strictly greater than 0.5 means 'relevant'), and order the rows
# from highest to lowest score.
test_dataset = (
    test_dataset
    .assign(score=probabilities)
    .assign(
        prediction=lambda frame: frame['score']
        .gt(0.5)
        .map({True: 'relevant', False: 'irrelevant'})
    )
    .sort_values(by='score', ascending=False)
)

In [7]:
# Preview the first rows of the scored frame, which was sorted by descending score.
test_dataset.head()

Unnamed: 0,url,text,relevance,multiclasses,score,prediction
3413,https://wwjournal.org/wp-content/uploads/sites...,Western Wildlife 11:11–18 • 2024 Submitted: 18...,irrelevant,[],0.680068,relevant
239,https://cms.zsl.org/sites/default/files/2022-1...,CAFF Assessment Series Report September 2015 A...,irrelevant,[],0.675969,relevant
1543,https://apps.worldagroforestry.org/downloads/P...,Conservation Agriculture With Trees: Principle...,irrelevant,[],0.635086,relevant
1034,https://zooreach.org/downloads/ZOO_CAMP_PHVA_r...,T h e The STaTuS and diSTribuTion S T a of f e...,irrelevant,[],0.582627,relevant
989,https://zenodo.org/records/1234577/files/artic...,628 J O U R NAL OF APPLIED METEOROLOGY AND CLI...,irrelevant,[],0.556977,relevant


In [8]:

# Rows the model labelled 'relevant'. The data was loaded from the
# irrelevant-only file, so these are presumably false positives worth
# reading in full.
predicted_relevant = test_dataset['prediction'] == 'relevant'
review_columns = ['score', 'prediction', 'url', 'text']

# Widen the column display only inside this block so long URLs and text
# excerpts are not truncated; the global pandas option is left untouched.
with pd.option_context('display.max_colwidth', 400):
    display(test_dataset.loc[predicted_relevant, review_columns])
    display(test_dataset['url'].head(20))

Unnamed: 0,score,prediction,url,text
3413,0.680068,relevant,https://wwjournal.org/wp-content/uploads/sites/9/2024/06/Roy_etal_WW_2024.pdf,"Western Wildlife 11:11–18 • 2024 Submitted: 18 January 2024; Accepted: 4 June 2024. NoN-bulrush habitat u se by amargosa Voles (Microtus californicus scirpensis) 1,2,6 1,3 1,4 1,5 austin roy , andrés M. lópez-pérez , laura Backus , s tephanie castle , 1,2 1 d eana l. clifford , and Janet foley 1 Department of Veterinary Medicine and Epidemiology, University of California, Davis, 2108 Tupper Ha..."
239,0.675969,relevant,https://cms.zsl.org/sites/default/files/2022-12/ASTI_MigBirds_Index_2015_0.pdf,"CAFF Assessment Series Report September 2015 Arctic Species Trend Index: Migratory Birds Index ARCTIC COUNCIL Acknowledgements CAFF Designated Agencies: • Norwegian Environment Agency, Trondheim, Norway • Environment Canada, Ottawa, Canada • Faroese Museum of Natural History, Tórshavn, Faroe Islands (Kingdom of Denmark) • Finnish Ministry of the Environment, Helsinki, Finland • Icelandic Insti..."
1543,0.635086,relevant,https://apps.worldagroforestry.org/downloads/Publications/PDFS/TM17693.pdf,"Conservation Agriculture With Trees: Principles and Practice A simplified guide for Extension Staff and Farmers Joseph Mutua, Jonathan Muriuki, Peter Gachie, Mieke Bourne and Jude Capis © World Agroforestry Centre, (ICRAF) Nairobi, Kenya, 2014 Technical Manual No. 21 ISBN 978-92-9059-350-8 Prepared by: Joseph Mutua (Kenya Network for Agricultural Technologies, KENDAT), Jonathan Muriuki, Peter ..."
1034,0.582627,relevant,https://zooreach.org/downloads/ZOO_CAMP_PHVA_reports/2010EHFWBreport.pdf,"T h e The STaTuS and diSTribuTion S T a of f e wa r b odiver r Sh Te i SiTy T u S in The eaSTern himalaya a n d D.J. Allen, S. Molur and B.A. Daniel (Compilers) d i S T r i b u T i A o n Y o f A f r L e S h A w a T M e I r b H i o d i v N e r S R i T y E i n T T h S e e A a S T E inTernaTional union for ConServaTion of naTure e r WORLD HEADQUARTERS n Rue Mauverney 28 h 1196 Gland i m Switzerla..."
989,0.556977,relevant,https://zenodo.org/records/1234577/files/article.pdf?download=1,"628 J O U R NAL OF APPLIED METEOROLOGY AND CLIMATOLOGY VOLUME 51 Quantiﬁcation of the Impact of Nauru Island on ARM Measurements CHARLES N. LONG AND SALLY A. MCFARLANE Paciﬁc Northwest National Laboratory, Richland, Washington (Manuscript received 26 August 2011, in ﬁnal form 31 October 2011) ABSTRACT Nauru Island at times generates low clouds that impact low-level cloud statistics and downwel..."
2214,0.511945,relevant,http://wiki.daz3d.com/lib/exe/fetch.php/public/read_me/index/16122/16122_songbirdremixvultures2.pdf,Avian Models for 3D Applications by Ken Gilliland 1 Songbird ReMix Contents Manual Introduction 3 Overview and Use 3 Conforming Crest Quick Reference 4 Creating a Songbird ReMix Bird 5 Using Conforming Crests with Poser 6 Using Conforming Crests with DAZ Studio 7 Tips and Tricks 8 Field Guide Field Guide List 9 General Information about Vultures 10 New World Vultures King Vulture 12 Lesser Yel...


3413                                                                                                                                                                                                                       https://wwjournal.org/wp-content/uploads/sites/9/2024/06/Roy_etal_WW_2024.pdf
239                                                                                                                                                                                                                       https://cms.zsl.org/sites/default/files/2022-12/ASTI_MigBirds_Index_2015_0.pdf
1543                                                                                                                                                                                                                          https://apps.worldagroforestry.org/downloads/Publications/PDFS/TM17693.pdf
1034                                                                                                         