In [1]:
import pickle
import json
import sys
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\theod\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\theod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
lem = WordNetLemmatizer()
def lemmatize_sentence(sentence):
    #Lemmatizing to simplify corpus
    tokens = word_tokenize(sentence)
    lemmatized_tokens = [lem.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

In [17]:
def load_pickle(path):
    with open(path, 'rb') as file:
        df =  pickle.load(file)
        df['sentence'] = df['sentence'].apply(lemmatize_sentence)
        return df
def load_json(path):
    with open(path, 'r') as file:
        return pd.read_json(file.read()).map(lem.lemmatize)

In [18]:
data20 = load_pickle('TrainingData/20news/df20.pkl')
datanyt = load_pickle('TrainingData/nyt/dfnyt.pkl')
seed20 = load_json('TrainingData/20news/seedwords.json')
seednyt = load_json('TrainingData/nyt/seedwords.json')

  return pd.read_json(file.read()).map(lem.lemmatize)
  return pd.read_json(file.read()).map(lem.lemmatize)


In [19]:
def compute_relevance(document, ix, seed, tfidf, matrix):
    #Helper function for label generator, takes in an individual document, index of said document in df, seed words, tfidf vectorizor and tfidf matrix of corpus

    relevance = {label: 0 for label in seed.keys()}
    relevance['na'] = sys.float_info.min
    tokens = document.split()
    
    for label, words in seed.items():
        for word in words:
            if word in tokens:
                index = tfidf.vocabulary_.get(word)
                if index is not None:
                    relevance[label] += matrix[ix, index]
    return max(zip(relevance.values(), relevance.keys()))[1]

In [20]:
def generate_labels_tfidf(df, seed):
    #Takes pd df of data and dict of seedwords, returns a series of label predictions based on tfidf relevence of data on seedwords
    #NOTE: returns 'na' values if no seedwords are found
    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(df['sentence'])
    out = []
    for index, row in df.iterrows():
        out.append(compute_relevance(row['sentence'], index, seed, tfidf, matrix))
    return pd.Series(out)

In [21]:
pred20 = generate_labels_tfidf(data20, seed20)
prednyt = generate_labels_tfidf(datanyt, seednyt)
pred20

0         rec
1          na
2        comp
3        comp
4          na
         ... 
18254     sci
18255    comp
18256     rec
18257    misc
18258     soc
Length: 18259, dtype: object

In [22]:
def score(expt, pred):
    return f1_score(expt, pred, average='macro'), f1_score(expt, pred, average='micro')

In [24]:
score(data20['label'], pred20), score(datanyt['label'], prednyt)

((0.4830039874305079, 0.43315625171148475),
 (0.4791134690073362, 0.6198490500563894))

In [23]:
((0.41406641964259294, 0.3334246125198532),
 (0.48066425169992977, 0.5906133425869697))

((0.41406641964259294, 0.3334246125198532),
 (0.48066425169992977, 0.5906133425869697))

In [25]:
def impute(pred, trim = 0):
    #imputes na labels by randomly selecting from found distribution, trimming bottom n categories if chosen
    pred_label_dist = pred[pred != 'na'].value_counts()
    if trim:
        pred_label_dist = pred_label_dist[:-trim]
    impute_na = lambda x:  np.random.choice(pred_label_dist.index, p=pred_label_dist.values/sum(pred_label_dist.values)) if x == 'na' else x
    return pred.apply(impute_na)

In [39]:
def tune_imputation(pred, data):
    #automatically finds best impute trim level for highest f1 score
    pred_labels = len(pred[pred != 'na'].unique())
    scores = {}
    for i in range(pred_labels):
        scores[i] = score(impute(pred, i), data['label'])
    #print(scores)
    return  max(scores.items(), key=lambda x: sum(x[1]))

In [40]:
tune_imputation(pred20, data20), tune_imputation(prednyt, datanyt)

((4, (0.5396539348061802, 0.5398433649159319)),
 (4, (0.6052557279014772, 0.8353431074867702)))

In [32]:
baselines = {
    '20' : (.49,.48),
    'nyt': (.65, .58)
}
baselines

{'20': (0.49, 0.48), 'nyt': (0.65, 0.58)}