# GNN for NLP
---
By Xiaoran Li
For JSAI2022

In [2]:
import glob
import tqdm
import numpy as np

In [3]:
def countMinMaxAver(lines):
    """Print the minimum, maximum and mean whitespace-token count of ``lines``.

    Parameters
    ----------
    lines : iterable of str
        Texts to measure. Any iterable is accepted (list, dict keys view,
        generator); it is consumed exactly once.

    Returns
    -------
    None
        The statistics are printed, one per line, as ``min_len``,
        ``max_len`` and ``average_len``.

    Notes
    -----
    An empty input prints zeros for all three statistics instead of
    raising ``ZeroDivisionError``.
    """
    # Collect the lengths once: this supports generators and removes the
    # dependence of the minimum on an arbitrary sentinel value.
    token_counts = [len(text.split()) for text in lines]
    if token_counts:
        min_len = min(token_counts)
        max_len = max(token_counts)
        aver_len = sum(token_counts) / len(token_counts)
    else:
        min_len, max_len, aver_len = 0, 0, 0.0
    print('min_len : ' + str(min_len))
    print('max_len : ' + str(max_len))
    print('average_len : ' + str(aver_len))

# Short Text Classification (STC)

## Ohsumed Corpus for STC
* Ohsumed: http://disi.unitn.it/moschitti/corpora.htm

In [4]:
# Root folder that holds every short-text-classification benchmark used below.
STC_Benchmark_path = "../../Benchmark_Data/Short-Text-Classification"

# Ohsumed: the category description file and the training / test / full document trees.
Ohsumed_category_description_path = f"{STC_Benchmark_path}/Ohsumed/First-Level-Categories-of-Cardiovascular-Disease.txt"
Ohsumed_training_path = f"{STC_Benchmark_path}/Ohsumed/ohsumed-first-20000-docs/training"
Ohsumed_test_path = f"{STC_Benchmark_path}/Ohsumed/ohsumed-first-20000-docs/test"
Ohsumed_ohsumed_all_path = f"{STC_Benchmark_path}/Ohsumed/ohsumed-all"

In [12]:
def getOhsumed(Ohsumed_category_description_path, Ohsumed_path):
    """Collect single-label Ohsumed titles together with the category descriptions.

    Parameters
    ----------
    Ohsumed_category_description_path : str
        Text file with one ``<description> - <category>`` entry per line.
        Blank lines are ignored.
    Ohsumed_path : str
        Directory with one sub-directory per category; every file inside a
        category directory is a document whose first line is its title.

    Returns
    -------
    shorttext2category_dic : dict
        Maps a title to a one-element list holding its category. Titles
        that were collected more than once are dropped.
    Ohsumed_category_description : dict
        Maps a category to its description.

    Raises
    ------
    ValueError
        If a non-blank description line does not contain ``' - '``.
    """
    Ohsumed_category_description = {}
    # `with` guarantees the description file is closed after parsing.
    with open(Ohsumed_category_description_path) as description_file:
        for line in description_file:
            line = line.strip()
            if not line:
                continue  # tolerate blank or trailing empty lines
            # Split on the last separator so a description may itself contain ' - '.
            description, category = line.rsplit(' - ', 1)
            Ohsumed_category_description[category] = description

    # Documents with multiple labels are removed; the titles serve as the short texts.
    shorttext2category_dic = {}
    for category in tqdm.tqdm(Ohsumed_category_description.keys()):
        # sorted() makes the insertion order independent of the file system.
        for file in sorted(glob.glob(f'{Ohsumed_path}/{category}/*')):
            with open(file) as f:
                first_line = f.readline()  # only the title line is needed
            if not first_line:
                continue  # empty file: there is no title to record
            shorttext2category_dic.setdefault(first_line.strip(), []).append(category)
    shorttext2category_dic = {k: v for k, v in shorttext2category_dic.items() if len(v) == 1}
    return shorttext2category_dic, Ohsumed_category_description

In [13]:
# Build the title -> category mapping for each Ohsumed split. Every call reads
# the same description file, so only the first returned copy is kept.
Ohsumed_training, Ohsumed_category_description = getOhsumed(
    Ohsumed_category_description_path=Ohsumed_category_description_path,
    Ohsumed_path=Ohsumed_training_path,
)
Ohsumed_test, _ = getOhsumed(
    Ohsumed_category_description_path=Ohsumed_category_description_path,
    Ohsumed_path=Ohsumed_test_path,
)
Ohsumed_ohsumed_all, _ = getOhsumed(
    Ohsumed_category_description_path=Ohsumed_category_description_path,
    Ohsumed_path=Ohsumed_ohsumed_all_path,
)

100%|███████████████████████████████████████████| 23/23 [00:01<00:00, 21.84it/s]
100%|███████████████████████████████████████████| 23/23 [00:01<00:00, 17.84it/s]
100%|███████████████████████████████████████████| 23/23 [00:06<00:00,  3.70it/s]


In [14]:
# Total number of retained titles across the training and test splits.
sum(map(len, (Ohsumed_training, Ohsumed_test)))

7400

In [15]:
# Token-length statistics of the titles (the dict keys) for each Ohsumed split.
countMinMaxAver(lines=Ohsumed_training.keys())
countMinMaxAver(lines=Ohsumed_test.keys())
countMinMaxAver(lines=Ohsumed_ohsumed_all.keys())

min_len : 1
max_len : 35
average_len : 11.820673220137028
min_len : 1
max_len : 36
average_len : 12.016077170418006
min_len : 1
max_len : 44
average_len : 12.106931992127706


In [19]:
# Persist the Ohsumed dictionaries. np.save appends ".npy" and stores a dict as a
# pickled 0-d object array, so reload with np.load(path, allow_pickle=True).item().
np.save(f"{STC_Benchmark_path}/Ohsumed/Ohsumed_training", Ohsumed_training)
np.save(f"{STC_Benchmark_path}/Ohsumed/Ohsumed_category_description", Ohsumed_category_description)
np.save(f"{STC_Benchmark_path}/Ohsumed/Ohsumed_test", Ohsumed_test)
np.save(f"{STC_Benchmark_path}/Ohsumed/Ohsumed_ohsumed_all", Ohsumed_ohsumed_all)

## Twitter Corpus for STC

* https://www.nltk.org/howto/twitter.html#corpus_reader

In [166]:
import nltk

In [86]:
# Fetch the `twitter_samples` corpus through NLTK's downloader.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/sauron/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [167]:
from nltk.corpus import twitter_samples
# List the files that make up the twitter_samples corpus.
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [168]:
# Load the tweet texts of the two sentiment-labelled files (negative first, then positive).
negative_tweets, positive_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('negative_tweets.json', 'positive_tweets.json')
)

In [171]:
# Combined number of negative and positive tweets.
sum(map(len, (negative_tweets, positive_tweets)))

10000

In [172]:
# Token-length statistics per sentiment class.
countMinMaxAver(lines=negative_tweets)
countMinMaxAver(lines=positive_tweets)

min_len : 1
max_len : 35
average_len : 11.3716
min_len : 2
max_len : 31
average_len : 11.9078


In [93]:
# Persist both tweet collections; np.save appends the ".npy" extension.
np.save(f"{STC_Benchmark_path}/Twitter/negative_tweets", negative_tweets)
np.save(f"{STC_Benchmark_path}/Twitter/positive_tweets", positive_tweets)

## Movie Review Corpus for STC

* https://www.cs.cornell.edu/people/pabo/movie-review-data/

In [173]:
# Locations of the positive and negative polarity files of the Movie Review corpus.
MR_pos_path = f"{STC_Benchmark_path}/MR/rt-polaritydata/rt-polaritydata/rt-polarity.pos"
MR_neg_path = f"{STC_Benchmark_path}/MR/rt-polaritydata/rt-polaritydata/rt-polarity.neg"

In [174]:
# Read each polarity file as a list of lines; bytes that are not valid UTF-8
# are dropped (errors='ignore') instead of raising.
with open(MR_pos_path, encoding='utf-8', errors='ignore') as f:
    positive_snippets = list(f)
with open(MR_neg_path, encoding='utf-8', errors='ignore') as f:
    negative_snippets = list(f)

In [175]:
# Remove the surrounding whitespace (including the trailing newline) from every snippet.
positive_snippets = list(map(str.strip, positive_snippets))
negative_snippets = list(map(str.strip, negative_snippets))

In [176]:
# Combined number of positive and negative snippets.
sum(map(len, (positive_snippets, negative_snippets)))

10662

In [178]:
# Token-length statistics per polarity class.
countMinMaxAver(lines=positive_snippets)
countMinMaxAver(lines=negative_snippets)

min_len : 2
max_len : 59
average_len : 21.085537422622398
min_len : 1
max_len : 56
average_len : 20.933408366160194


In [114]:
# Persist the cleaned snippets; np.save appends the ".npy" extension.
np.save(f"{STC_Benchmark_path}/MR/rt-polaritydata/positive_snippets", positive_snippets)
np.save(f"{STC_Benchmark_path}/MR/rt-polaritydata/negative_snippets", negative_snippets)

## TagMyNews Corpus for STC
* http://acube.di.unipi.it/tmn-dataset/

In [20]:
# Location of the raw TagMyNews text file.
tagmynews_path = f"{STC_Benchmark_path}/TagMyNews/tagmynews.txt"

In [21]:
# Read the raw file as a list of lines (newlines kept); records are assembled in a later cell.
with open(tagmynews_path) as f:
    tagmynews = list(f)

In [22]:
# Labels that every TagMyNews record is expected to carry.
category_list = [
    "sci_tech",
    "business",
    "entertainment",
    "us",
    "sport",
    "world",
    "health",
]

In [23]:
# Records are separated by a blank line. The first line of a record becomes the
# key and its last line the value; records sharing a first line overwrite each other.
# NOTE: this rebinds `tagmynews` from the list of raw lines to the list of
# records, so the cell must run exactly once after the file has been read.
tagmynews = "".join(tagmynews).split("\n\n")
tagmynews_dict = dict(
    (fields[0], fields[-1])
    for fields in (record.split("\n") for record in tagmynews)
)

In [24]:
# Sanity checks: both loops print nothing when the data is well-formed.
for items in tagmynews:
    # report any record that does not span exactly 7 lines
    if len(items.split("\n")) != 7:
        print(items, end="............")
for k, v in tagmynews_dict.items():
    # report any value that is not one of the known categories
    if v not in category_list:
        print(k, v)
len(tagmynews_dict)

31280

In [25]:
# Token-length statistics of the TagMyNews keys (first line of each record).
countMinMaxAver(lines=tagmynews_dict.keys())

min_len : 0
max_len : 23
average_len : 8.024776214833759


In [27]:
# Locate entries whose key contains no tokens at all, printing position, key and value.
for i, (k, v) in enumerate(tagmynews_dict.items()):
    if not k.split():
        print(i, k, v)

12490  sci_tech


In [118]:
# Persist the dictionary. np.save appends ".npy" and pickles the dict inside a
# 0-d object array, so reload with np.load(path, allow_pickle=True).item().
np.save(f"{STC_Benchmark_path}/TagMyNews/tagmynews_dict_31280", tagmynews_dict)

## Snippets Corpus for STC
* http://acube.di.unipi.it/tmn-dataset/

In [40]:
# Location of the raw Snippets text file.
snippets_path = f"{STC_Benchmark_path}/Snippets/snippets.txt"

In [41]:
# Read the raw file as a list of lines (newlines kept).
with open(snippets_path) as f:
    snippets = list(f)

In [42]:
# Token-length statistics of the raw lines; the last token of each line is
# treated as the label when the lines are parsed in a later cell.
countMinMaxAver(lines=snippets)

min_len : 2
max_len : 39
average_len : 18.88995137763371


In [46]:
# Show the raw lines that consist of exactly two whitespace-separated tokens.
for items in (line for line in snippets if len(line.split()) == 2):
    print(items)

artmargins culture-arts-entertainment

cinematography culture-arts-entertainment

artmargins culture-arts-entertainment



In [210]:
# Tokenise every line, then map the text (all tokens but the last, re-joined
# with single spaces) to the last token; identical texts overwrite each other.
# NOTE: `snippets` is rebound from a list of strings to a list of token lists,
# so this cell cannot be re-run without re-reading the file first.
snippets = [line.split() for line in snippets]
snippets_dict = dict(
    (" ".join(tokens[:-1]), tokens[-1])
    for tokens in snippets
)

In [211]:
# Token-length statistics of the snippet texts (label excluded), then the number of distinct texts.
countMinMaxAver(lines=snippets_dict.keys())
len(snippets_dict.keys())

min_len : 1
max_len : 38
average_len : 17.89337721102427


12155

In [212]:
# Persist the dictionary. np.save appends ".npy" and pickles the dict inside a
# 0-d object array, so reload with np.load(path, allow_pickle=True).item().
np.save(f"{STC_Benchmark_path}/Snippets/snippets_dict_12155", snippets_dict)