In [1]:
# SemanticRelatedWord is used to generate synonyms for each word
from DST.semantic_related_word.SemanticRelatedWord import SemanticRelatedWord
# WordDiscrimination is used to classify semantically related words
from DST.word_discrimination.WordDiscrimination import WordDiscrimination,default_classify_func
from os.path import dirname,join
from collections import defaultdict
import os
import logging
import json
import csv
import codecs



In [2]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.getLogger("gensim").setLevel(logging.WARNING)
PROJ_PATH = dirname(os.getcwd())
DEFAULT_OUT_DIR = join(PROJ_PATH,"result")

#clean data， one line one sentence
with open (join(PROJ_PATH,"data/tags.txt"),mode="r",encoding="utf-8") as fr:
    data = [" ".join(line.strip().split())+"\n" for line in fr if len(line)>4]
    
with open (join(PROJ_PATH,"data/clean_tags.txt"),mode="w",encoding="utf-8") as fw:
    fw.writelines(data)
    
# Get terms (all words in clean_tags.txt)
vocab = defaultdict(int)
with open (join(PROJ_PATH,"data/clean_tags.txt"),mode="r",encoding="utf-8") as fr:
    for line in fr:
        for word in line.split():
            vocab[word]+=1

terms=[]
# Appear more than 5 times is considered as a term
for k,v in vocab.items():
    if v>5:
        terms.append(k)

In [3]:
# Get synonyms
sr = SemanticRelatedWord(domain_corpus_phrase_path=join(PROJ_PATH,"data/clean_tags.txt"), 
                         fasttext_path=join(PROJ_PATH,"models/fasttext/fasttext.model"), 
                         skipgram_path=join(PROJ_PATH,"models/skipgram/skipgram.model"), 
                         file_overwrite=True,topn_fasttext=8, 
                         topn_skipgram=15, similarity_threshold_fasttext=0.8,
                         similarity_threshold_skipgram=0.78, min_count=5, 
                         size=200, workers=8, window=5)
semantic_related_words = sr.getSemanticRelatedWords(terms=terms)


2019-03-18 09:27:14,469 - DST.semantic_related_word.SemanticRelatedWord - INFO - train fasttext...
2019-03-18 09:28:12,222 - DST.semantic_related_word.SemanticRelatedWord - INFO - save fasttext to local...
2019-03-18 09:28:13,550 - DST.semantic_related_word.SemanticRelatedWord - INFO - train skipgram...
2019-03-18 09:28:28,971 - DST.semantic_related_word.SemanticRelatedWord - INFO - save skipgram to local...


In [4]:
# Classify synonyms, get abbreviation, synonym and other
wd = WordDiscrimination(classify_word_func=default_classify_func, 
                        semantic_related_types=["abbreviation", "other", "synonym"], 
                        group_dict=False, 
                        group_word_type="synonym", 
                        domain_vocab=None)
final_dict = wd.discriminate_words(semantic_related_words)

In [18]:
# View abbr
count = 0
for k,v in final_dict.items():
    if len(v["abbreviation"])>0:
        count+=1
        print(k,str(v["abbreviation"]))
print(count)

developers ['devs', 'devops']
bacon ['bao']
christmastree ['chritmas', 'chrismas']
santaclaus ['santas']
characterdesign ['charactern']
skateboard ['skater']
skateboarder ['skater']
advertising ['ads']
facebook ['fb']
black-and-white ['bach', 'blackwhite']
rocketboy ['rboy']
animate ['ae']
screenprint ['screent']
avatar ['avaa']
wordpress ['wp']
flower ['flor']
triangles ['tangle']
bitcoin ['btc']
sunrise ['suns']
turkish ['tuk']
javascript ['js']
aftereffects ['affects']
beltramo ['bltr']
basketball ['bball']
powerpoint ['ppt']
francisco ['franco']
miniature ['mature']
contact ['cta']
advertisment ['ads']
bookshelf ['bose']
psychedelic ['psychic']
baseball ['bball']
volunteer ['volt', 'vote']
television ['tv']
avatars ['avaa']
adaptive ['api']
statistics ['stats']
balloons ['balls']
cinema4d ['c4d']
styleguide ['styled']
bastard ['bard']
brewery ['beer']
champagne ['cane']
cocktail ['coca']
brooklyn ['bklyn']
roar ['ra']
100dayui ['100ui']
montana ['mona']
cigarettes ['cigars']
wirefr

In [9]:
abb_count = 0
for k,v in final_dict.items():
    if len(v["abbreviation"])>0:
        abb_count += len(v["abbreviation"])
print(abb_count)

468


In [8]:
syn_count = 0
for k,v in final_dict.items():
    syn_count += len(v["synonym"])
print(syn_count)

8966


In [10]:
# write into a json
f = open("final_thesaurus.json","w")
f.write(json.dumps(final_dict))
f.close()

In [23]:
A = []
for k,v in final_dict.items():
    abb = []
    other = []
    syn = []
    for i in range(0, len(v["abbreviation"])):
        abb.append(v["abbreviation"][i])
    for j in range(0, len(v["synonym"])):
        syn.append(v["synonym"][j])
    for z in range(0, len(v["other"])):
        other.append(v["other"][z])
    A.append([k, abb, syn, other])
A  

[['003',
  [],
  [],
  ['100ui',
   '030',
   '100daysofui',
   '085',
   '032',
   'day004',
   '004',
   '008',
   'dailyui008',
   'error404',
   'designchallenge',
   'pagenotfound',
   'dailyui003',
   'comingsoon',
   '048',
   '4monthschallenge',
   'errorpage',
   '404page']],
 ['30', [], [], ['day4', '030', '014', '011', 'day2', 'days', '-', 'day5']],
 ['days',
  [],
  [],
  ['day4', 'day3', 'day2', '36', '36days', '30', '100days', 'day1', 'day5']],
 ['of', [], [], []],
 ['ui',
  [],
  [],
  ['ux.ui', 'ui.', 'uxui', 'ui-ux', '?ui', 'ux-ui', 'ui8', 'uix']],
 ['colorful', [], ['colorful.', 'colourful', 'colorfull'], ['colorbeats']],
 ['daily',
  [],
  [],
  ['daily100',
   'dailyui014',
   'dailyui015',
   'uidaily',
   'dailyui007',
   'dailyui011',
   'dailyui100',
   'daily-ui']],
 ['landing', [], [], ['landingpage', 'landing-page', 'landingspage']],
 ['page',
  [],
  ['page.', 'pager'],
  ['fullpage',
   'landing-page',
   'loginpage',
   'webpage',
   'mainpage',
   'landin

In [24]:
import pandas as pd
name=['key','abbreviation','synonyms', "other"]
test=pd.DataFrame(columns=name,data=A)
print(test)

                          key abbreviation  \
0                         003           []   
1                          30           []   
2                        days           []   
3                          of           []   
4                          ui           []   
5                    colorful           []   
6                       daily           []   
7                     landing           []   
8                        page           []   
9                       music           []   
10                      event           []   
11                 collection           []   
12                       icon           []   
13                iconography           []   
14                       line           []   
15                     online           []   
16                        set           []   
17                       shop           []   
18                      store           []   
19                     canvas           []   
20                        art     