# Import packages

In [1]:
import nltk
import numpy as np
import pandas as pd
import re
from nltk.corpus import wordnet as wn

In [2]:
# Fetch the NLTK resources this notebook needs: "stopwords" feeds the English
# stopword list and "wordnet" backs the `wn` lemma lookup.
# NOTE(review): no visible cell uses "punkt" (tokenization is done with
# str.split) — confirm whether this download is still required.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load data set

In [3]:
# Load the corpus from a CSV path relative to the current working directory;
# later cells read its "Abstract" column.
documents = pd.read_csv("./nlp_country.csv")
documents

Unnamed: 0,Name,Abstract
0,Japan,Japan is an island country in East Asia. Locat...
1,United States,"The United States of America (USA), commonly k..."
2,England,England is a country that is part of the Unite...
3,China,"China, officially the People's Republic of Chi..."
4,India,"India, also known as the Republic of India,[19..."
5,Korea,Korea is a region in East Asia.[3] Since 1948 ...
6,Germany,"Germany, officially the Federal Republic of Ge..."
7,Russia,"Russia, or the Russian Federation[12], is a tr..."
8,France,"France, officially the French Republic, is a c..."
9,Italy,"Italy, officially the Italian Republic,[10][11..."


In [4]:
# Extract the raw abstract strings as a NumPy array, one entry per document.
text_data = documents["Abstract"].values
text_data

array(['Japan is an island country in East Asia. Located in the Pacific Ocean, it lies off the eastern coast of the Asian continent and stretches from the Sea of Okhotsk in the north to the East China Sea and the Philippine Sea in the south. The kanji that make up Japan\'s name mean \'sun origin\', and it is often called the "Land of the Rising Sun". Japan is a stratovolcanic archipelago consisting of about 6,852 islands. The four largest are Honshu, Hokkaido, Kyushu, and Shikoku, which make up about ninety-seven percent of Japan\'s land area and often are referred to as home islands. The country is divided into 47 prefectures in eight regions, with Hokkaido being the northernmost prefecture and Okinawa being the southernmost one. Japan is the 2nd most populous island country. The population of 127 million is the world\'s eleventh largest, of which 98.5% are ethnic Japanese. 90.7% of people live in cities, while 9.3% live in the countryside.[16] About 13.8 million people live in Tokyo,

# Preprocessing

In [5]:
# English stopword list from NLTK; read as a module-level global by
# preprocessing_text when filtering tokens.
en_stop = nltk.corpus.stopwords.words('english')

In [6]:
def preprocessing_text(text):
    """Clean, tokenize, lemmatize and stopword-filter a single document.

    Steps, in order:
      1. Remove ``@`` and ``%`` characters, bracketed numeric citations such
         as ``[16]``, parenthesised lowercase asides such as ``(abc def)``,
         and then every remaining digit.
      2. Drop ``.`` and ``,`` and split the text on whitespace.
      3. Lowercase each token and replace it with the base form returned by
         ``wn.morphy`` when one is found.
      4. Discard tokens that appear in the module-level ``en_stop`` list.

    Args:
        text: Raw document string.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    def cleaning_text(text):
        # Raw strings hand the backslashes to the regex engine untouched;
        # the non-raw forms relied on invalid string escapes such as "\[".
        # Order matters: citations and asides go before the bare-digit pass.
        for pattern in (r'@|%', r'\[[0-9 ]*\]', r'\([a-z ]*\)', r'[0-9]'):
            text = re.sub(pattern, '', text)
        return text

    def tokenize_text(text):
        # Only periods and commas are stripped; quotes, apostrophes and
        # hyphens stay attached to their tokens.
        return re.sub(r'[.,]', '', text).split()

    def lemmatize_word(word):
        word = word.lower()
        lemma = wn.morphy(word)
        # morphy returns None when it finds no base form; keep the token.
        return word if lemma is None else lemma

    # Build the lookup set once per call so each membership test is O(1)
    # instead of a scan over the whole stopword list.
    stopwords = set(en_stop)

    # NOTE(review): filtering happens after lemmatization, so a stopword whose
    # lemma is not itself a stopword would slip through — confirm whether
    # filtering on the raw lowercase token as well is desired.
    lemmas = (lemmatize_word(token) for token in tokenize_text(cleaning_text(text)))
    return [token for token in lemmas if token not in stopwords]

In [7]:
# Apply the preprocessing pipeline to every abstract, giving one token list
# per document.
pp_text_data = [preprocessing_text(text) for text in text_data]
pp_text_data

[['japan',
  'island',
  'country',
  'east',
  'asia',
  'locate',
  'pacific',
  'ocean',
  'lie',
  'eastern',
  'coast',
  'asian',
  'continent',
  'stretch',
  'sea',
  'okhotsk',
  'north',
  'east',
  'china',
  'sea',
  'philippine',
  'sea',
  'south',
  'kanji',
  'make',
  "japan's",
  'name',
  'mean',
  "'sun",
  "origin'",
  'often',
  'call',
  '"land',
  'rising',
  'sun"',
  'japan',
  'stratovolcanic',
  'archipelago',
  'consist',
  'island',
  'four',
  'large',
  'honshu',
  'hokkaido',
  'kyushu',
  'shikoku',
  'make',
  'ninety-seven',
  'percent',
  "japan's",
  'land',
  'area',
  'often',
  'refer',
  'home',
  'island',
  'country',
  'divide',
  'prefecture',
  'eight',
  'region',
  'hokkaido',
  'northernmost',
  'prefecture',
  'okinawa',
  'southernmost',
  'one',
  'japan',
  'nd',
  'populous',
  'island',
  'country',
  'population',
  'million',
  "world's",
  'eleventh',
  'large',
  'ethnic',
  'japanese',
  'people',
  'live',
  'city',
  'live'

# Set-based similarity: set representation + Jaccard similarity (1 - Jaccard distance) and Dice similarity

In [8]:
from nltk.metrics import jaccard_distance

def cal_jd(base_set, set_list):
    """Jaccard similarity of one token collection against each of several.

    The similarity is computed as ``1 - jaccard_distance(A, B)`` on the
    de-duplicated tokens.

    Args:
        base_set: Iterable of tokens for the reference document.
        set_list: Iterable of token iterables to compare against.

    Returns:
        dict[int, float]: Position in ``set_list`` -> Jaccard similarity.
        Two empty collections score 1.0, matching ``dice_similarity``.
    """
    set_a = set(base_set)
    result = {}
    for i, tokens in enumerate(set_list):
        set_b = set(tokens)
        if not set_a and not set_b:
            # jaccard_distance divides by the size of the union, which is 0
            # here; treat two empty sets as identical instead of raising.
            result[i] = 1.0
        else:
            result[i] = 1 - jaccard_distance(set_a, set_b)

    return result

In [9]:
# Compare document 0 with every document (itself included) and list the
# (index, score) pairs from most to least similar.
result = cal_jd(pp_text_data[0], pp_text_data)
sorted(result.items(), key=lambda x:x[1], reverse = True)

[(0, 1.0),
 (8, 0.17579908675799083),
 (6, 0.17298578199052128),
 (9, 0.17004048582995956),
 (12, 0.14592274678111583),
 (13, 0.14392059553349879),
 (10, 0.1410891089108911),
 (14, 0.14087759815242495),
 (5, 0.13970588235294112),
 (3, 0.13701923076923073),
 (15, 0.13603818615751795),
 (7, 0.13559322033898302),
 (1, 0.13406593406593403),
 (11, 0.12740384615384615),
 (2, 0.12121212121212122),
 (4, 0.1071428571428571)]

In [10]:
def dice_similarity(set_a, set_b):
    """Return the Dice coefficient ``2 * |A & B| / (|A| + |B|)`` of two sets.

    When both inputs are empty the denominator is zero and 1.0 is returned.
    """
    shared = set.intersection(set_a, set_b)
    total_size = len(set_a) + len(set_b)
    if total_size == 0:
        return 1.0
    return 2 * len(shared) / total_size
    
def cal_ds(base_set, set_list):
    """Dice similarity of one token collection against each of several.

    Args:
        base_set: Iterable of tokens for the reference document.
        set_list: Iterable of token iterables to compare against; it is
            consumed in a single pass, so any iterable works.

    Returns:
        dict[int, float]: Position in ``set_list`` -> Dice similarity.
    """
    set_a = set(base_set)
    # Iterate the items directly instead of re-indexing set_list[i].
    return {
        i: dice_similarity(set_a, set(tokens))
        for i, tokens in enumerate(set_list)
    }

In [11]:
# Same comparison as the Jaccard cell, scored with Dice similarity.
# Note: this rebinds `result`, replacing the Jaccard scores.
result = cal_ds(pp_text_data[0], pp_text_data)
sorted(result.items(), key=lambda x:x[1], reverse = True)

[(0, 1.0),
 (8, 0.29902912621359223),
 (6, 0.29494949494949496),
 (9, 0.2906574394463668),
 (12, 0.2546816479400749),
 (13, 0.25162689804772237),
 (10, 0.2472885032537961),
 (14, 0.24696356275303644),
 (5, 0.24516129032258063),
 (3, 0.24101479915433405),
 (15, 0.23949579831932774),
 (7, 0.23880597014925373),
 (1, 0.2364341085271318),
 (11, 0.2260127931769723),
 (2, 0.21621621621621623),
 (4, 0.1935483870967742)]

# Vector-based similarity: TF-IDF + cosine similarity

In [12]:
def tfidf_vectorizer(docs):
    """Build TF-IDF vectors for a corpus of tokenized documents.

    ``tf`` is a term's count divided by the number of tokens in the document,
    and ``idf`` is ``log(N / df)`` where ``N`` is the number of documents and
    ``df`` the number of documents containing the term.

    Args:
        docs: Sequence of token sequences. It is traversed several times and
            ``len()`` is taken of it and of each document, so a one-shot
            generator is not suitable.

    Returns:
        tuple: ``(vectors, word2id)``. ``vectors`` is a list holding one list
        of TF-IDF weights per document; ``word2id`` maps each token to its
        column index, assigned in order of first appearance. An empty
        document yields an all-zero vector.
    """
    # Vocabulary: column index = order in which the token is first seen.
    word2id = {}
    for doc in docs:
        for w in doc:
            if w not in word2id:
                word2id[w] = len(word2id)

    vocab_size = len(word2id)

    # Document frequencies, computed once for the whole corpus rather than
    # once per document.
    doc_freq = np.zeros(vocab_size)
    for doc in docs:
        for term in set(doc):
            doc_freq[word2id[term]] += 1
    # Every vocabulary term came from some document, so doc_freq >= 1 and the
    # division below is always defined.
    idf = np.log(len(docs) / doc_freq)

    vectors = []
    for doc in docs:
        # Single pass over the tokens instead of one doc.count() per term.
        term_counts = np.zeros(vocab_size)
        for term in doc:
            term_counts[word2id[term]] += 1
        # All tokens are in the vocabulary, so the counts sum to len(doc).
        # Skipping the division for an empty document avoids 0/0 -> NaN.
        tf = term_counts / len(doc) if len(doc) > 0 else term_counts
        vectors.append(list(tf * idf))

    return vectors, word2id

In [13]:
def cosine_similarity(list_a, list_b):
    """Cosine of the angle between two equal-length numeric vectors.

    Args:
        list_a: First vector (any array-like of numbers).
        list_b: Second vector, same length as ``list_a``.

    Returns:
        float: ``a . b / (|a| * |b|)``. If both vectors have zero norm the
        result is 1.0 (two empty documents are treated as identical, like
        ``dice_similarity``); if exactly one has zero norm the result is 0.0.
    """
    vec_a = np.array(list_a)
    vec_b = np.array(list_b)
    inner_prod = vec_a.dot(vec_b)
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)

    # NumPy float division by zero yields nan/inf with a RuntimeWarning and
    # never raises ZeroDivisionError, so zero norms are checked explicitly.
    if norm_a == 0 and norm_b == 0:
        return 1.0
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return inner_prod / (norm_a * norm_b)

In [14]:
def cal_cosine(vector, vector_list):
    """Cosine similarity of one vector against each vector in a collection.

    Args:
        vector: Reference vector.
        vector_list: Iterable of vectors to compare against; it is consumed
            in a single pass, so any iterable works.

    Returns:
        dict[int, float]: Position in ``vector_list`` -> cosine similarity.
    """
    # Iterate the items directly instead of re-indexing vector_list[i].
    return {
        i: cosine_similarity(vector, other)
        for i, other in enumerate(vector_list)
    }

In [15]:
# Vectorize the preprocessed corpus: one TF-IDF vector per document, plus the
# token -> column-index mapping.
tfidf_vector, word2id = tfidf_vectorizer(pp_text_data)

In [16]:
# Inspect the vocabulary mapping as (token, column index) pairs.
word2id.items()

dict_items([('japan', 0), ('island', 1), ('country', 2), ('east', 3), ('asia', 4), ('locate', 5), ('pacific', 6), ('ocean', 7), ('lie', 8), ('eastern', 9), ('coast', 10), ('asian', 11), ('continent', 12), ('stretch', 13), ('sea', 14), ('okhotsk', 15), ('north', 16), ('china', 17), ('philippine', 18), ('south', 19), ('kanji', 20), ('make', 21), ("japan's", 22), ('name', 23), ('mean', 24), ("'sun", 25), ("origin'", 26), ('often', 27), ('call', 28), ('"land', 29), ('rising', 30), ('sun"', 31), ('stratovolcanic', 32), ('archipelago', 33), ('consist', 34), ('four', 35), ('large', 36), ('honshu', 37), ('hokkaido', 38), ('kyushu', 39), ('shikoku', 40), ('ninety-seven', 41), ('percent', 42), ('land', 43), ('area', 44), ('refer', 45), ('home', 46), ('divide', 47), ('prefecture', 48), ('eight', 49), ('region', 50), ('northernmost', 51), ('okinawa', 52), ('southernmost', 53), ('one', 54), ('nd', 55), ('populous', 56), ('population', 57), ('million', 58), ("world's", 59), ('eleventh', 60), ('ethni

In [17]:
# Compare document 0's TF-IDF vector with every document's vector and list
# the (index, score) pairs from most to least similar.
res = cal_cosine(tfidf_vector[0], tfidf_vector)
sorted(res.items(), key=lambda x:x[1], reverse = True)

[(0, 1.0),
 (5, 0.089213868005443),
 (3, 0.07494324927746153),
 (14, 0.06794378321355647),
 (9, 0.05446867547327852),
 (8, 0.05061679443369346),
 (13, 0.05035814117836253),
 (1, 0.04945156965230686),
 (7, 0.04340970910393382),
 (6, 0.04329186935344453),
 (12, 0.038469390607195876),
 (2, 0.0355002685981015),
 (10, 0.03479541972998953),
 (11, 0.03392463518350004),
 (15, 0.029516361108928312),
 (4, 0.02200165046387345)]