In [None]:
# Before importing nltk (the Natural Language Toolkit), make sure that it is
# installed. The command below installs it from within a Jupyter Notebook.
!pip install nltk
import numpy as np
# After installing the nltk library we can import it 
# to make use of it.
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd
# Fetch the NLTK data packages used by this notebook once, up front.
# NOTE(review): presumably 'punkt' backs word_tokenize and
# 'averaged_perceptron_tagger' backs pos_tag -- confirm against the NLTK docs.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag
# NOTE(review): presumably the data behind the `wn` (wordnet) corpus reader imported above.
nltk.download('wordnet')

In [None]:
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
import pandas as pd


# I rewrote the conversion function that was given at the beginning of the assignment.
# The newly created function is equivalent to the provided one (the convert_tag function).
def conversion_new(tag):
    """Map a part-of-speech tag onto the matching WordNet POS constant.

    Only the leading letter of the tag matters: ``N`` -> noun, ``J`` ->
    adjective, ``V`` -> verb, ``R`` -> adverb.

    Args:
        tag: part-of-speech tag string

    Returns:
        ``wn.NOUN``, ``wn.ADJ``, ``wn.VERB`` or ``wn.ADV``; ``None`` when the
        tag starts with none of those four letters.
    """
    # (tag prefix, name of the constant on ``wn``) pairs. The constant is only
    # looked up once its prefix has matched.
    prefix_to_constant = (
        ('N', 'NOUN'),
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if tag.startswith(prefix):
            return getattr(wn, constant_name)
    return None


def doc_to_synsets(doc):
    """
    Returns a list of synsets in document.

    Tokenizes and tags the words in the document doc.
    Then finds the first synset for each word/tag combination.
    If a synset is not found for that combination it is skipped.

    The NLTK data needed for tokenizing and tagging must already be
    available (the notebook's first cell downloads it once); this function
    no longer triggers a download on every call.

    Args:
        doc: string to be converted

    Returns:
        list of synsets

    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
    """
    # Imported locally so the function works regardless of which cell ran the
    # module-level imports.
    from nltk import word_tokenize, pos_tag

    synsets = []
    for word, pos in pos_tag(word_tokenize(doc)):
        # conversion_new yields None for tags other than noun/adjective/
        # verb/adverb; that None is passed straight through as `pos`.
        matches = wn.synsets(word, pos=conversion_new(pos))
        if matches:
            # Only the first synset of each word/tag combination is kept.
            synsets.append(matches[0])
    return synsets


def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest similarity value.
    Sum of all of the largest similarity values and normalize this value by dividing it by the
    number of largest similarity values found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2; 0.0 when no similarity
        value could be found at all (for example when s1 or s2 is empty)

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    best_scores = []
    for synset in s1:
        # path_similarity may return None for a pair; those pairs are ignored.
        candidates = [
            score
            for score in (synset.path_similarity(other) for other in s2)
            if score is not None
        ]
        if candidates:
            best_scores.append(max(candidates))

    # Without this guard an empty list would raise ZeroDivisionError below.
    if not best_scores:
        return 0.0
    return sum(best_scores) / len(best_scores)


def document_path_similarity(doc1, doc2):
    """Finds the symmetrical similarity between doc1 and doc2

    Both documents are converted to synset lists; the result is the mean of
    the similarity score of doc1 onto doc2 and of doc2 onto doc1.
    """
    first_synsets, second_synsets = doc_to_synsets(doc1), doc_to_synsets(doc2)

    forward = similarity_score(first_synsets, second_synsets)
    backward = similarity_score(second_synsets, first_synsets)
    return (forward + backward) / 2

In [None]:
# Quick check on two three-word sentences that differ only in their last word.
document_path_similarity('I like cat', 'I like dog')

In [None]:
def test_document_path_similarity():
    """Score two fixed sample sentences with document_path_similarity and return the result."""
    first_sentence = 'This is a function to test document_path_similarity.'
    # The run of spaces before "and" belongs to the original literal (it came
    # from an indented line continuation) and is reproduced exactly.
    second_sentence = (
        'Use this function to see if your code in doc_to_synsets '
        '    and similarity_score is correct!'
    )
    return document_path_similarity(first_sentence, second_sentence)
# Run the check on the two sample sentences and show the resulting score.
test_document_path_similarity()

In [None]:
# Use this dataframe for questions most_similar_docs and label_accuracy
# The file is read relative to the notebook's working directory. The functions
# below read its 'D1' and 'D2' columns and use its first column as the labels.
paraphrases = pd.read_csv('paraphrases.csv')
# Preview the first rows.
paraphrases.head()

In [None]:
def most_similar_docs():
    """Find the pair of documents in ``paraphrases`` with the highest similarity.

    Every (D1, D2) row of the module-level ``paraphrases`` frame is scored
    with document_path_similarity; the scores are stored in its ``scores``
    column as a side effect.

    Returns:
        tuple ``(D1, D2, similarity_score)`` of the row with the maximum score
    """
    # Assign the whole column at once. Element-wise chained assignment
    # (frame['scores'][i] = ...) is not guaranteed to write into the frame,
    # and the np.NaN alias used to pre-fill it was removed in NumPy 2.0.
    paraphrases['scores'] = [
        document_path_similarity(first, second)
        for first, second in zip(paraphrases['D1'], paraphrases['D2'])
    ]

    best_row = paraphrases['scores'].idxmax()
    return (
        paraphrases.loc[best_row, 'D1'],
        paraphrases.loc[best_row, 'D2'],
        paraphrases.loc[best_row, 'scores'],
    )
# Show the most similar document pair together with its score.
most_similar_docs()

In [None]:
def label_accuracy(threshold=0.75):
    """Accuracy of a paraphrase classifier based on document_path_similarity.

    Every (D1, D2) row of the module-level ``paraphrases`` frame is scored.
    A row is predicted as 1 when its score is strictly greater than
    ``threshold`` and as 0 otherwise. The ``scores`` and ``pred_label``
    columns of ``paraphrases`` are (re)written as a side effect.

    Args:
        threshold: score cut-off above which a pair is labelled 1
            (defaults to 0.75, the value that was previously hard-coded)

    Returns:
        accuracy of the predictions, measured against the first column of
        ``paraphrases``
    """
    from sklearn.metrics import accuracy_score

    # Assign whole columns at once. Element-wise chained assignment
    # (frame['col'][i] = ...) is not guaranteed to write into the frame,
    # and the np.NaN alias used to pre-fill the columns was removed in NumPy 2.0.
    paraphrases['scores'] = [
        document_path_similarity(first, second)
        for first, second in zip(paraphrases['D1'], paraphrases['D2'])
    ]
    paraphrases['pred_label'] = (paraphrases['scores'] > threshold).astype(int)

    # The first column of the frame is used as the ground-truth labels.
    true_labels = paraphrases[paraphrases.columns[0]]
    return accuracy_score(true_labels, paraphrases['pred_label'])
# Show the accuracy obtained with the 0.75 score cut-off.
label_accuracy()

In [None]:
import pickle
import gensim
from sklearn.feature_extraction.text import CountVectorizer

# Load the list of documents
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open a 'newsgroups' file that comes from a trusted source.
with open('newsgroups', 'rb') as f:
    newsgroup_data = pickle.load(f)

# Use CountVectorizer to find tokens of at least three word characters, remove stop_words,
# remove tokens that don't appear in at least 20 documents,
# remove tokens that appear in more than 20% of the documents
vect = CountVectorizer(min_df=20, max_df=0.2, stop_words='english', 
                       token_pattern='(?u)\\b\\w\\w\\w+\\b')
# Fit and transform
X = vect.fit_transform(newsgroup_data)

# Convert sparse matrix to gensim corpus.
# documents_columns=False: the documents are the rows of X, not its columns.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Mapping from word IDs to words (To be used in LdaModel's id2word parameter)
# vocabulary_ yields (word, id) pairs, so each pair is flipped to id -> word.
id_map = dict((v, k) for k, v in vect.vocabulary_.items())


In [None]:
# Use the gensim.models.ldamodel.LdaModel constructor to estimate 
# LDA model parameters on the corpus, and save to the variable `ldamodel`

# 10 topics, 25 passes over the corpus; random_state is fixed so that repeated
# runs use the same seed.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10,
                                          id2word = id_map, passes = 25, random_state = 34)


In [None]:
def lda_topics():
    """Return the 10 topics of the fitted LDA model, each with its 10 top words.

    Reads the module-level ``ldamodel``. Training a second model with the
    same corpus, parameters and random_state only repeated the most
    expensive step of the notebook.

    Returns:
        the result of ``ldamodel.print_topics(num_topics=10, num_words=10)``
    """
    return ldamodel.print_topics(num_topics=10, num_words=10)
# Display the 10 topics with 10 words each.
lda_topics()

In [None]:
# Sample document (a one-element list) for the topic-distribution question.
# Each trailing backslash continues the string literal on the next line, so no
# newline is inserted there; the only line breaks in the text are the explicit \n.
new_doc = ["\n\nIt's my understanding that the freezing will start to occur because \
of the\ngrowing distance of Pluto and Charon from the Sun, due to it's\nelliptical orbit. \
It is not due to shadowing effects. \n\n\nPluto can shadow Charon, and vice-versa.\n\nGeorge \
Krumins\n-- "]

In [None]:
def topic_distribution():
    """Return the topic distribution of ``new_doc`` under the fitted LDA model.

    The document is vectorized with the already-fitted module-level ``vect``
    so that its word ids agree with the ids the module-level ``ldamodel`` was
    trained on, and that existing model is queried. Fitting a fresh
    vectorizer and a fresh LDA model on the single document would pair word
    ids with the wrong words in ``id_map`` and would not describe the trained
    topics at all.

    Returns:
        list of ``(topic_id, probability)`` tuples for the document in
        ``new_doc``
    """
    # transform (not fit_transform) keeps the vocabulary learned from the corpus.
    new_doc_counts = vect.transform(new_doc)
    new_doc_corpus = gensim.matutils.Sparse2Corpus(new_doc_counts, documents_columns=False)

    # new_doc holds a single document, so take the first (only) distribution.
    return list(ldamodel.get_document_topics(new_doc_corpus))[0]
# Display the topic distribution computed for the sample document.
topic_distribution()

In [None]:
# Candidate labels; every name returned by topic_names is taken from this list.
topics= ['Health', 'Science', 'Automobiles', 'Politics', 'Government', 
         'Travel', 'Computers & IT', 'Sports', 
         'Business', 'Society & Lifestyle', 'Religion', 'Education']

def topic_names():
    """Return the ten hand-picked topic labels as a new list.

    NOTE(review): presumably entry i names LDA topic i -- confirm against the
    output of lda_topics.
    """
    chosen_labels = [
        "Education",
        "Automobiles",
        "Computers & IT",
        "Religion",
        "Automobiles",
        "Sports",
        "Health",
        "Religion",
        "Computers & IT",
        "Science",
    ]
    return chosen_labels
# Display the chosen topic names.
topic_names()