In [1]:
"""
Dedicated to doctor professor Oren Kurland
"""

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
import math
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# English stop words used by the hand-rolled TF/IDF helpers and by the
# TfidfVectorizer further down.  On a fresh environment the NLTK corpus may not
# be installed yet, in which case `stopwords.words` raises LookupError; fetch
# the corpus once and retry so the notebook survives "Restart & Run All".
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))


def get_num_of_words(list_of_sentences):
    """
    count the whitespace-separated words over all sentences
    :param list_of_sentences: the list of all sentences (strings)
    :return: total number of words
    """
    return sum(len(sentence.split()) for sentence in list_of_sentences)


def calc_tf_score(list_of_sentences, num_of_words):
    """
    calc term frequency for every non stop word in a list of sentences
    :param list_of_sentences: the list of all sentences; each sentence is either
        a string (split on whitespace) or an already tokenized list of words
    :param num_of_words: total number of words, used as the normalising denominator
    :return: defaultdict mapping word -> count / num_of_words
    """
    counts = defaultdict(int)
    for sen in list_of_sentences:
        # Iterating a string directly yields characters, not words, so split it
        # the same way get_num_of_words does.
        words = sen.split() if isinstance(sen, str) else sen
        for word in words:
            # NOTE: the stop word check is case sensitive, words are not lowercased here.
            if word not in stop_words:
                counts[word] += 1
    if not counts:
        # Nothing to normalise (also avoids touching num_of_words for empty input).
        return counts
    total_words = int(num_of_words)
    # Dividing by the total word count for each dictionary element
    return defaultdict(int, {word: count / total_words for word, count in counts.items()})


def check_sent(word, sentences):
    """
    count the sentences that contain a word as a whole token
    :param word: a single word (string), or an iterable of words that must all be present
    :param sentences: list of sentences; each one is either a string (split on
        whitespace) or an already tokenized list of words
    :return: number of sentences containing the word(s)
    """
    # A bare string is one word; iterating it would test its characters instead.
    required = [word] if isinstance(word, str) else list(word)
    count = 0
    for sentence in sentences:
        tokens = set(sentence.split()) if isinstance(sentence, str) else sentence
        if all(w in tokens for w in required):
            count += 1
    return count


def calc_idf_score(list_of_sentences, num_of_sens):
    """
    calc inverse document frequency for every non stop word, one sentence = one document
    :param list_of_sentences: the list of all sentences; each sentence is either
        a string (split on whitespace) or an already tokenized list of words
    :param num_of_sens: total number of sentences
    :return: defaultdict mapping word -> log(num_of_sens / number of sentences containing the word)
    """
    doc_freq = defaultdict(int)
    for sen in list_of_sentences:
        # Iterating a string directly yields characters, not words.
        words = sen.split() if isinstance(sen, str) else sen
        # dict.fromkeys de-duplicates within the sentence (a sentence counts once
        # per word) while keeping first-seen order.
        for word in dict.fromkeys(words):
            if word not in stop_words:
                doc_freq[word] += 1
    if not doc_freq:
        return doc_freq
    total_sentences = int(num_of_sens)
    # Every word in doc_freq was seen in at least one sentence, so the
    # denominator is never zero.
    return defaultdict(
        int,
        {word: math.log(total_sentences / seen_in) for word, seen_in in doc_freq.items()},
    )


def tf_idf_dict(list_of_sentences):
    """
    build the tf-idf score of every scored word in a list of sentences
    :param list_of_sentences: the list of all sentences
    :return: plain dict mapping word -> tf * idf
    """
    sentence_count = len(list_of_sentences)
    word_count = get_num_of_words(list_of_sentences)
    term_freq = calc_tf_score(list_of_sentences, word_count)
    inv_doc_freq = calc_idf_score(list_of_sentences, sentence_count)
    scores = {}
    for term, freq in term_freq.items():
        # A term without an idf entry contributes a score of 0.
        scores[term] = freq * inv_doc_freq.get(term, 0)
    return scores


def get_key_words_for_sen(sen, tf_idf_score, lower_threshold, upper_threshold=10000000):
    """
    get keywords for a sentence
    :param sen: the sentence, either a string (split on whitespace) or an
        already tokenized list of words
    :param tf_idf_score: mapping word -> tf_idf score (as returned by tf_idf_dict)
    :param lower_threshold: exclusive lower bound on tfidf score
    :param upper_threshold: exclusive upper bound on tfidf score
    :return: words of sen, in order, whose score lies strictly between the bounds
    """
    # Iterating a string directly yields characters, not words.
    words = sen.split() if isinstance(sen, str) else sen
    keywords = []
    for word in words:
        # Words without a score (e.g. stop words) are skipped instead of
        # raising KeyError on a plain dict.
        score = tf_idf_score.get(word)
        if score is not None and lower_threshold < score < upper_threshold:
            keywords.append(word)
    return keywords


def pipeline(list_of_sentences, lower_threshold, upper_threshold=10000000,
             csv_path="keywords_ds.csv", pickle_path="keywords_ds.pkl"):
    """
    score all words, pick the keywords of every sentence and save the result
    :param list_of_sentences: the list of all sentences
    :param lower_threshold: lower bound on tfidf score
    :param upper_threshold: upper bound on tfidf score
    :param csv_path: where the sentence/keywords table is written as CSV
    :param pickle_path: where the same table is written as a pickle
    :return: nothing, the table is only written to disk
    """
    tf_idf_score = tf_idf_dict(list_of_sentences)
    key_list = [
        get_key_words_for_sen(sentence, tf_idf_score, lower_threshold, upper_threshold)
        for sentence in list_of_sentences
    ]
    df = pd.DataFrame({"sentence": list_of_sentences, "keywords": key_list})
    df.to_csv(csv_path)
    df.to_pickle(pickle_path)


In [29]:
# Load the argument-quality data and keep only rows whose WA column exceeds 0.8
# (presumably the weighted-average quality score -- TODO confirm).
df = pd.read_csv('./IBM_Debater_(R)/IBM_Debater_(R)_arg_quality_rank_30k/arg_quality_rank_30k.csv')
df = df.loc[df['WA'] > 0.8]

# Attach the adjusted topic of every argument and keep just the two columns
# used downstream, under the names `sentence` and `topic`.
df_topics = pd.read_csv('./30k_topicks.csv')
df_no_keywords = (
    df.merge(df_topics, on='topic')
    .loc[:, ['argument', 'adj_topic']]
    .rename(columns={'argument': 'sentence', 'adj_topic': 'topic'})
)
df_no_keywords

Unnamed: 0,sentence,topic
0,"""marriage"" isn't keeping up with the times. a...",marriage
1,abandoning marriage allows for people to grow ...,marriage
2,Abandoning weddings would offend many religiou...,marriage
3,"as society and values change, marriage is no l...",marriage
4,by adopting such practice we can get rid of al...,marriage
...,...,...
17288,We should subsidize embryonic stem cell resear...,embryonic cel
17289,we should subsidize embryonic stem cell resear...,embryonic cel
17290,we shouldn't subsidize embryonic stem cell res...,embryonic cel
17291,we shouldn't subsidize stem cell research beca...,embryonic cel


In [25]:
# df = pd.read_csv('single_topic.csv')
# df_winner = pd.DataFrame(df['Winner'].to_numpy(), columns=['sentence'])
# df_loser = pd.DataFrame(df['Loser'].to_numpy(), columns=['sentence'])
# df = pd.concat([df_winner, df_loser]).reset_index()[['sentence']]

In [30]:
# One row per distinct sentence, in order of first appearance.
df = pd.DataFrame({'sentence': df_no_keywords['sentence'].unique()})

In [32]:
# all_sent_tf_idf_dict = tf_idf_dict(df['sentence'].tolist())

In [33]:
# Fit a TF-IDF model over the unique sentences: row i of the resulting matrix
# holds the scores of corpus[i].
vectorizer = TfidfVectorizer(lowercase=True, stop_words=list(stop_words))
corpus = df['sentence'].tolist()
doc_term_matrix = vectorizer.fit_transform(corpus)

In [34]:
# Convert the sparse TF-IDF matrix to a dense array so that it can be indexed
# as doc_term_matrix[row][col] below.  The guard keeps the cell re-runnable:
# once converted, the ndarray no longer has a `toarray` method.
# NOTE: the dense array takes n_sentences x vocabulary_size floats of memory.
if hasattr(doc_term_matrix, "toarray"):
    doc_term_matrix = doc_term_matrix.toarray()

In [35]:
def get_key_words_for_sen_egor(sen, sen_id, vocab, doc_term_matrix, lower_threshold, upper_threshold=10000000,
                               token_pattern=r"(?u)\b\w\w+\b", max_keywords=6, min_keywords=2):
    """
    get keywords for a sentence from a fitted tf-idf matrix
    :param sen: the sentence (string)
    :param sen_id: row of doc_term_matrix that belongs to this sentence
    :param vocab: mapping lowercased word -> column index of doc_term_matrix
    :param doc_term_matrix: tf-idf scores, indexable as doc_term_matrix[row][col]
    :param lower_threshold: exclusive lower bound on tfidf score
    :param upper_threshold: exclusive upper bound on tfidf score
    :param token_pattern: regex used to split the lowercased sentence into words;
        the default is the default token pattern of scikit-learn's text vectorizers
    :param max_keywords: maximum number of keywords returned
    :param min_keywords: if fewer words than this pass the thresholds, the
        top min_keywords words by score are returned regardless of the thresholds
    :return: keywords for sentence sen, highest score first
    """
    import re  # local import: `re` is not part of the notebook's import cell

    row_scores = doc_term_matrix[sen_id]
    # Tokenize with a regex rather than str.split so that punctuation glued to a
    # word ('times.', '"marriage"') does not stop it from matching the vocabulary.
    word_scores = {}
    for word in re.findall(token_pattern, sen.lower()):
        if word in vocab and word not in word_scores:
            word_scores[word] = row_scores[vocab[word]]

    # Highest score first; ties are broken alphabetically so the result is deterministic.
    ranked = sorted(word_scores.items(), key=lambda item: (-item[1], item[0]))
    keywords = [word for word, score in ranked
                if lower_threshold < score < upper_threshold][:max_keywords]
    if len(keywords) >= min_keywords:
        return keywords
    return [word for word, _ in ranked[:min_keywords]]

In [49]:
vocab = vectorizer.vocabulary_

# `corpus` holds each sentence once, while `df_no_keywords` has one row per
# original argument, so pairing the two by position drifts as soon as a
# sentence is duplicated.  Look the topic up by sentence text instead; a
# sentence listed under several topics gets the topic of its first occurrence.
first_rows = df_no_keywords.drop_duplicates(subset='sentence')
topic_by_sentence = dict(zip(first_rows['sentence'], first_rows['topic']))
topics = [topic_by_sentence[sentence] for sentence in corpus]

keywords_list = []
for sent_index, (sentence, topic) in enumerate(zip(corpus, topics)):
    keywords = get_key_words_for_sen_egor(sentence, sent_index, vocab, doc_term_matrix,
                                          lower_threshold=0.28, upper_threshold=1)
    # Each entry has the form "<topic>: <kw1>, <kw2>, ..."
    keywords_list.append(topic + ': ' + ', '.join(keywords))
df = pd.DataFrame({"keywords": keywords_list, "sentence": corpus})

In [53]:
# Bring the topic column back next to each sentence's keyword string.
df_30k_keywords = pd.merge(df, df_no_keywords, on='sentence')

In [54]:
# Save the final table with a header row and without the row index.
df_30k_keywords.to_csv('keywords_30k.csv', header=True, index=False)