In [24]:
import pickle
import json
import sys
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\theod\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\theod\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
lem = WordNetLemmatizer()

In [7]:
with open('TrainingData/20news/df20.pkl', 'rb') as news20:
    data20 = pickle.load(news20)
data20

Unnamed: 0,sentence,label
0,from: (where's my thing)\nsubject: what car i...,rec
1,from: (guy kuo)\nsubject: si clock poll - fin...,comp
2,from: (thomas e willis)\nsubject: pb question...,comp
3,from: jgreen@amber (joe green)\nsubject: re: w...,comp
4,from: (jonathan mcdowell)\nsubject: re: shutt...,sci
...,...,...
18254,from: (stupendous man)\nsubject: re: temperat...,sci
18255,from: (jim smyton)\nsubject: re: monitors - s...,comp
18256,from: \nsubject: re: game length (was re: brav...,rec
18257,from: \nsubject: intel chmos 8086/8088 design...,misc


In [15]:
with open('TrainingData/20news/seedwords.json', 'r') as my_file:
    seed20 = pd.read_json(my_file.read()).map(lem.lemmatize)
seed20


  seed20 = pd.read_json(my_file.read()).map(lem.lemmatize)


Unnamed: 0,alt,comp,misc,rec,sci,talk,soc
0,atheism,graphic,sale,car,encryption,turkish,church
1,atheist,window,offer,bike,circuit,gun,jesus
2,religion,scsi,shipping,game,candida,jew,christ
3,objective,mac,forsale,team,space,armenian,christian


In [6]:
for seeds in seed20.values():
    for seed in seeds:
        print(lem.lemmatize(seed))
        seed = lem.lemmatize(seed)
seed20

{'comp': ['graphics', 'windows', 'scsi', 'mac'],
 'misc': ['sale', 'offer', 'shipping', 'forsale'],
 'rec': ['car', 'bike', 'game', 'team'],
 'sci': ['encryption', 'circuit', 'candida', 'space'],
 'talk': ['turkish', 'gun', 'stephanopoulos', 'armenian'],
 'religion': ['church', 'jesus', 'jehovah', 'religion', 'atheism']}

In [27]:
def lemmatize_sentence(sentence):
    tokens = word_tokenize(sentence)
    lemmatized_tokens = [lem.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)
data20['sentence'] = data20['sentence'].apply(lemmatize_sentence)

In [16]:
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(data20['sentence'])
matrix

<18259x95568 sparse matrix of type '<class 'numpy.float64'>'
	with 2545870 stored elements in Compressed Sparse Row format>

In [28]:
def compute_relevance(document, ix, seed):
    relevance = {label: 0 for label in seed.keys()}
    relevance['na'] = sys.float_info.min
    tokens = document.split()
    
    for label, words in seed.items():
        for word in words:
            if word in tokens:
                index = tfidf.vocabulary_.get(word)
                if index is not None:
                    relevance[label] += matrix[ix, index]

    print(relevance)
    return max(zip(relevance.values(), relevance.keys()))[1]

In [29]:
data20['sentence'][14]

"from : ( johnny l lee ) subject : re : == moving sale === summary : re : === moving sale === organization : ub line : 44 nntp-posting-host : lictor.acsu.buffalo.edu reduced price ! i have a list of thing forsale on behalf of my brother , who 's moving ( moved already ) offer : 1 ) black and decker duster plus ( portable hand vaccum ) purchased for $ 32 , $ 12 2 ) sr-1000 dual cassette portable player , am/fm 5-band graphic equalizer , high speed dubing , duo tape.tape deck a , seems to have lost treble sound . but , i bet it 's fixable . purchased for $ 80 $ 25 3 ) monolux zoom microscope , up to 1200x magnification made in japan , includes case and accessory purchased for $ 50 $ 20 4 ) sunbeam 1400 hair dryer , the dryer you put your head under/into . you know , the one you see in the salon . ( do n't ask me why my bro had it ) purchased for $ 60 $ 24 5 ) everylast speed bag , all leather . brand new , never used $ 10 6 ) osterizer pusle matic blender , with 10 speed and a cookbook ,

In [30]:
compute_relevance(data20['sentence'][14], 14, seed20)

{'alt': 0, 'comp': 0.0, 'misc': 0.2165868778493515, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}


'misc'

In [31]:
out = []
for index, row in data20.iterrows():
    out.append(compute_relevance(row['sentence'], index, seed20))
out

{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.41975356143825954, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.04995113646746571, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.8979080741107713, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}


{'alt': 0, 'comp': 0, 'misc': 0.037373540064526894, 'rec': 0, 'sci': 0.26920353906046735, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.14926365804400893, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.17526970110675835, 'misc': 0, 'rec': 0, 'sci': 0.06719292285393864, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.0, 'misc': 0.0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.3764475322712579, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.06823864245737703, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc'

{'alt': 0, 'comp': 0.020681736796392118, 'misc': 0, 'rec': 0.009233083562374546, 'sci': 0, 'talk': 0.07745993330089139, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.08852374372553176, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0.005050027092201803, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0.0811178751304113, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, '

{'alt': 0.5249794894389822, 'comp': 0, 'misc': 0, 'rec': 0.008303573595013885, 'sci': 0, 'talk': 0, 'soc': 0.11468854953537042, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.0, 'misc': 0.1886020867919289, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.13685557518861624, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.07698063559551474, 'sci': 0.05372227950302643, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0.0415530387770891, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp

{'alt': 0.13587369053541928, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0.10335328964231448, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0.010789748267909313, 'rec': 0, 'sci': 0, 'talk': 0.0, 'soc': 0.23848984864110043, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0.08929851503581519, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.14445026158030544, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0.10244420116500276, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0

{'alt': 0.039073889932679594, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0.2740884953596957, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0.04865704284950948, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0.07216370887134373, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.225073

{'alt': 0.18898653376405158, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0.01997544929334312, 'talk': 0, 'soc': 0.020965203910833827, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.05972148867118086, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.22507

{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0.3004616128302531, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0.04491602500906867, 'rec': 0.061563775532028485, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.0492029215190539, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0.09012928310326723, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0.0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.3543954160719977, 'sci': 0.03695955886225567, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0,

{'alt': 0.07575862248711857, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0.013521112234233095, 'talk': 0.16349458648663845, 'soc': 0.014191063786807288, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.1110745863166628, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0.05511980147083519, 'comp': 0, 'misc': 0.009452613885028554, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0.01979023442945964, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0.18341283390255264, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-3

{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0.0, 'soc': 0.03759612318981826, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0.2472628929090287, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0.0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0.0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0.13314036290073678, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0.02939392017661621, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.18493217710203966, 'sci': 0, 'talk': 0,

{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0.0, 'soc': 0.1574129814907353, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.029396150153622336, 'misc': 0, 'rec': 0.026820169444998473, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0.044444527110854296, 'rec': 0, 'sci': 0, 'talk': 0.04475899489836541, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.058074535000973905, 'misc': 0.24606855380505538, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0.2513185789292217, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, '

{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0.08549181472956148, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0.11236130373441502, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0.04190762116024152, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{'alt': 0, 'comp': 0, 'misc': 0, 'rec': 0, 'sci': 0, 'talk': 0, 'soc': 0, 'na': 2.2250738585072014e-308}
{

['rec',
 'na',
 'comp',
 'na',
 'na',
 'na',
 'na',
 'comp',
 'na',
 'na',
 'rec',
 'soc',
 'na',
 'sci',
 'misc',
 'na',
 'na',
 'rec',
 'sci',
 'comp',
 'na',
 'rec',
 'na',
 'comp',
 'comp',
 'misc',
 'na',
 'rec',
 'na',
 'rec',
 'rec',
 'na',
 'na',
 'na',
 'na',
 'na',
 'na',
 'sci',
 'rec',
 'talk',
 'rec',
 'na',
 'na',
 'rec',
 'na',
 'na',
 'na',
 'rec',
 'rec',
 'sci',
 'na',
 'soc',
 'na',
 'na',
 'talk',
 'na',
 'rec',
 'rec',
 'na',
 'sci',
 'na',
 'na',
 'misc',
 'rec',
 'alt',
 'na',
 'talk',
 'alt',
 'misc',
 'talk',
 'talk',
 'rec',
 'na',
 'na',
 'na',
 'misc',
 'rec',
 'comp',
 'na',
 'na',
 'na',
 'na',
 'na',
 'rec',
 'comp',
 'na',
 'na',
 'rec',
 'na',
 'na',
 'talk',
 'na',
 'rec',
 'na',
 'rec',
 'na',
 'na',
 'na',
 'comp',
 'misc',
 'rec',
 'na',
 'na',
 'na',
 'sci',
 'na',
 'na',
 'misc',
 'na',
 'comp',
 'comp',
 'na',
 'rec',
 'na',
 'na',
 'sci',
 'comp',
 'sci',
 'alt',
 'rec',
 'comp',
 'na',
 'soc',
 'na',
 'na',
 'talk',
 'na',
 'rec',
 'misc',
 'na

In [12]:
compute_relevance(data20['sentence'][1], 1)

TypeError: compute_relevance() missing 1 required positional argument: 'seed'

In [None]:
data20['sentence'][1]

In [32]:
out= pd.Series(out)

In [33]:
data20['pred_label'] = out
data20

Unnamed: 0,sentence,label,pred_label
0,from : ( where 's my thing ) subject : what ca...,rec,rec
1,from : ( guy kuo ) subject : si clock poll - f...,comp,na
2,from : ( thomas e willis ) subject : pb questi...,comp,comp
3,from : jgreen @ amber ( joe green ) subject : ...,comp,na
4,from : ( jonathan mcdowell ) subject : re : sh...,sci,na
...,...,...,...
18254,from : ( stupendous man ) subject : re : tempe...,sci,sci
18255,from : ( jim smyton ) subject : re : monitor -...,comp,na
18256,from : subject : re : game length ( wa re : br...,rec,rec
18257,from : subject : intel chmos 8086/8088 design ...,misc,misc


In [34]:
f1_score(data20["label"], data20['pred_label'], average='macro'), f1_score(data20["label"], data20['pred_label'], average='micro')

(0.41893490403061084, 0.33588915055589025)

In [222]:
dropped_na = data20[data20['pred_label'] != 'na']
f1_score(dropped_na["label"], dropped_na['pred_label'], average='macro'), f1_score(dropped_na["label"], dropped_na['pred_label'], average='micro')

(0.7545942451270236, 0.8043334654511826)

In [37]:
pred_label_dist = data20.groupby('pred_label').count()['label'].drop('na')
pred_label_dist = pred_label_dist.sort_values(ascending = False)[0:3]
np.random.choice(pred_label_dist.index, p=pred_label_dist.values/sum(pred_label_dist.values))

'rec'

In [38]:
def imputate_na_sample(x):
    if x =='na':
        return np.random.choice(pred_label_dist.index, p=pred_label_dist.values/sum(pred_label_dist.values))
    return x

In [39]:
data20['pred_label'].apply(imputate_na_sample)

0         rec
1         rec
2        comp
3         sci
4         rec
         ... 
18254     sci
18255     sci
18256     rec
18257    misc
18258     rec
Name: pred_label, Length: 18259, dtype: object

In [40]:
f1_score(data20["label"], data20['pred_label'].apply(imputate_na_sample), average='macro'),f1_score(data20["label"], data20['pred_label'].apply(imputate_na_sample), average='micro')

(0.4792154100446629, 0.46240210307245744)

In [41]:
def imputate_na(x):
    if x =='na':
        return np.random.choice(['rec','comp'])
    return x

In [42]:
f1_score(data20["label"], data20['pred_label'].apply(imputate_na), average='macro'),f1_score(data20["label"], data20['pred_label'].apply(imputate_na), average='micro')

(0.48018145064724393, 0.4731913029191084)

In [273]:
with open('TrainingData/nyt/dfnyt.pkl', 'rb') as news20:
    datanyt = pickle.load(news20)
    
with open('TrainingData/nyt/seedwords.json', 'r') as news20:
    seednyt = json.load(news20)
datanyt, seednyt

(                                                sentence     label
 0      nasa, in preparation for a spacewalk on saturd...   science
 1      if professional pride and strong defiance can ...    sports
 2      admittedly, the language is reconstructed and ...      arts
 3      palermo, sicily — roberta vinci beat top-seede...    sports
 4      the argentine soccer club san lorenzo complete...    sports
 ...                                                  ...       ...
 11522  melbourne, australia — after the tears and the...    sports
 11523  perth, australia — wimbledon semifinalist jerz...    sports
 11524  while the nets are hoisting no. 5 to the barcl...    sports
 11525  sydney — tennis australia chief executive stev...    sports
 11526  stephen watts is the president of sap asia pac...  business
 
 [11527 rows x 2 columns],
 {'arts': ['music', 'orchestra', 'album', 'opera', 'ballet'],
  'business': ['companies', 'euro', 'economy', 'batteries', 'sales'],
  'science': ['space', 

In [278]:
out = []
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(datanyt['sentence'])
for index, row in datanyt.iterrows():
    out.append(compute_relevance(row['sentence'], index, seednyt))
out= pd.Series(out)
datanyt['pred_label'] = out
datanyt

{'arts': 0, 'business': 0, 'science': 0.1016353076188235, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.09455724740884795, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.1500744666578636, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.024649430205199517, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.09473053959734355, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.35458940516222903, 'business': 0.018482853589771284, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.04352766408863566, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.3023672893273313, 'business': 0, 'science': 0

{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.06671703879070406, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.10850670173681136, 'science': 0.022020298651648198, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.012439668729219203, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.1703183425512768, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.06899084465179989, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.14227251520682002, 'business': 0.02192777217979758, 'science': 0.024116700669119848, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science'

{'arts': 0.26127658384390656, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.11737354086915651, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.08521650100298353, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.153280940908508, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.058098627137523416, 'science': 0, 'sports': 0.24878767062948437, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.056184474775437636, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0.059404529460701815, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.17349204162759946, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science':

{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.038075485729172416, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.0194972700508215, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.022920713712847856, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.02095247229233492, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0.03988371756702688, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 

{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0.027545312478496023, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.06526285236209575, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.0825487167641657, 'science': 0.0169996452264583, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.01452150621792451, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.015911131910027243, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0.24196604672309677, 'sports': 0, 'politics': 0, 'na':

{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.021644724720286147, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.06437795612689551, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.012080958236801997, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0.15784062609868785, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.15296879722518505, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.07731146753366336, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.01700093451902875, 'science': 0.036407209720280276, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.017699830411412713, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.04729121182059348, 'bus

{'arts': 0, 'business': 0.2311966924109199, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.3357463783361116, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.07150645248626732, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0.42756850284480225, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0.030042837130923695, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.022075463472236358, 'science': 0, 'sports': 0, 'politics': 0.08781659914695185, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.07057125729341973, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.019

{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0.06328577688991577, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.041207289396530775, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.05023930174364411, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.01683467122626135, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.2898539264404326, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0

{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.032527420036463604, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.06916543371650188, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.2474658648045493, 'business': 0, 'science': 0.02279031469896992, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.032274582619483266, 'science': 0, 'sports': 0, 'politics': 0.34651419335483563, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.014334802855273563, 'politics': 0, 'na

{'arts': 0.14007504499534265, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.03070696549398492, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.0695927102071278, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.028250126782794295, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.07928572831453008, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0.2966211337059351, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.033036234255001365, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0, 'politics': 0, 'na':

{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.013031670796328441, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.323075206536434, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0.08406191550219333, 'science': 0, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.16469462998065418, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.09321526555460892, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0.14634986986052054, 'business': 0, 'science': 0.08051189006810291, 'sports': 0, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.10789391764359285, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0, 'sports': 0.22014139858379964, 'politics': 0, 'na': 2.2250738585072014e-308}
{'arts': 0, 'business': 0, 'science': 0

Unnamed: 0,sentence,label,pred_label
0,"nasa, in preparation for a spacewalk on saturd...",science,science
1,if professional pride and strong defiance can ...,sports,sports
2,"admittedly, the language is reconstructed and ...",arts,arts
3,"palermo, sicily — roberta vinci beat top-seede...",sports,na
4,the argentine soccer club san lorenzo complete...,sports,sports
...,...,...,...
11522,"melbourne, australia — after the tears and the...",sports,na
11523,"perth, australia — wimbledon semifinalist jerz...",sports,sports
11524,while the nets are hoisting no. 5 to the barcl...,sports,sports
11525,sydney — tennis australia chief executive stev...,sports,na


In [279]:
f1_score(datanyt["label"], datanyt['pred_label'], average='macro'), f1_score(datanyt["label"], datanyt['pred_label'], average='micro')

(0.48066425169992977, 0.5906133425869697)

In [280]:
f1_score(datanyt["label"], datanyt['pred_label'].apply(imputate_na_sample), average='macro'),f1_score(datanyt["label"], datanyt['pred_label'].apply(imputate_na_sample), average='micro')

(0.3604981887749473, 0.5906133425869697)