# Load Dataset

In [15]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
df_train = pd.read_xml('./JASSS_DATA/jasss/jass_paper_final.xml')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     776 non-null    object
 1   Writer    776 non-null    object
 2   KeyWord   575 non-null    object
 3   Abstract  234 non-null    object
dtypes: object(4)
memory usage: 24.4+ KB


In [3]:
df_train['Description'] = df_train['Abstract'] + ' ' + df_train['KeyWord'] + ' ' + df_train['Title']
df_train.dropna(inplace=True)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 0 to 263
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        234 non-null    object
 1   Writer       234 non-null    object
 2   KeyWord      234 non-null    object
 3   Abstract     234 non-null    object
 4   Description  234 non-null    object
dtypes: object(5)
memory usage: 11.0+ KB


In [4]:
df_train = df_train[['Title', 'Description', 'KeyWord']]

In [5]:
df_train.head()

Unnamed: 0,Title,Description,KeyWord
0,Agent-Based Modelling of Future Dairy and Plan...,A reduction in the production and consumption ...,"Plant-Based Milk, Dairy Reduction, Sustainable..."
1,ReMoTe-S. Residential Mobility of Tenants in S...,Sustainable housing is a key priority for Swit...,"Household Mobility, Household Relocation, Hous..."
2,Sharing Risk Under Heterogeneity: Exploring Pa...,Motivated by the emergence of new Peer-to-Peer...,"Risk-Sharing, Risk Aversion, Solidarity, Uncer..."
3,An Agent-Based Model of Motor Insurance Custom...,Attracting and retaining loyal customers is a ...,"Insurance, Word-Of-Mouth, Agent-Based-Model, N..."
4,Generation of Synthetic Populations in Social ...,With the aim of building realistic model of so...,"Synthetic Population, Agent-Based Simulation M..."


In [6]:
def normalize_document(doc):
    
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
    tokens_tags = nltk.pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for token, tag in tokens_tags:
        if tag in ['RBS','RBR', 'RB']:
            
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, pos='r'))
        elif tag in ['JJ','JJS','JJR']:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, pos='a'))
        elif tag in ['VB','VBP','VBZ','VBD','VBN','VBG']:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, pos='v'))
        elif tag in ['NN','NNS','NNP','NNPS']:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, pos='n'))
        else:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token))
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

In [17]:
stop_words = nltk.corpus.stopwords.words('english')
normalize_corpus = np.vectorize(normalize_document)
norm_corpus = normalize_corpus(df_train['Description'].values)

In [18]:
norm_corpus

array(['reduction production consumption meat dairy across much world critical climate change mitigation alleviation ecological stress improved health update agentbased model abm historic uk milk consumption apply scenario dairy reduction adoption plantbased milk pbm 2050 update model comprise cognitive function agent perceive physical health environmental characteristic milk choice modify habit social influence use european social survey 2018 british social attitude 2008 survey data empirically inform model take backcasting approach calibrate parameter publish uk dairy reduction target 2030 2050 test different price relationship characterisation environmental concern may affect simulated milk consumption 2020 2050 scenario core target 20 less dairy 2030 35 2050 largely produce plausible consumption trajectory however current pricing dairy pbm simulate consumption mostly unable deliver desire core target improved markedly dairy price set organic level influence change environmental con

In [19]:
tags = norm_corpus
for i in range(len(tags)):
    tag_list = tags[i].split(', ')
#         if tag == 'Agent-Based Modelling':
#             tag = 'Agent Based Modeling'
#         if '-' in tag:
#             tag = tag.replace('-', ' ')
    df_train['Description'][i] = ' '.join(tag_list)
    tags[i] = ' '.join(tag_list)

In [20]:
tags

array(['reduction production consumption meat dairy across much world critical climate change mitigation alleviation ecological stress improved health update agentbased model abm historic uk milk consumption apply scenario dairy reduction adoption plantbased milk pbm 2050 update model comprise cognitive function agent perceive physical health environmental characteristic milk choice modify habit social influence use european social survey 2018 british social attitude 2008 survey data empirically inform model take backcasting approach calibrate parameter publish uk dairy reduction target 2030 2050 test different price relationship characterisation environmental concern may affect simulated milk consumption 2020 2050 scenario core target 20 less dairy 2030 35 2050 largely produce plausible consumption trajectory however current pricing dairy pbm simulate consumption mostly unable deliver desire core target improved markedly dairy price set organic level influence change environmental con

In [25]:
tag_dic = {}
for tag_list in tags:
    tag_list = tag_list.split(' ')
    for tag in tag_list:
        if tag not in tag_dic:
            tag_dic[tag] = 1
        else:
            tag_dic[tag] += 1

df = pd.DataFrame(list(tag_dic.items()), columns=['tag', 'count']).sort_values(by = 'count', axis = 0, ascending = False)
print('标签总数', len(df))
df.head(50)

标签总数 4605


Unnamed: 0,tag,count
19,model,1117
18,agentbased,437
274,simulation,324
41,social,293
43,use,292
33,agent,246
317,opinion,208
324,network,199
272,dynamic,186
91,result,164


In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [None]:
df[:10].plot(x='tag', y='count', kind='bar', legend=False, grid=True, figsize=(10,6), color=mcolors.TABLEAU_COLORS, fontsize=18)
plt.title('Tag Distribution', fontsize=18)
plt.ylabel('Count', fontsize=18)
plt.xlabel('Tag', fontsize=18)


In [None]:
plt.bar(df[:5]['tag'], df[:5]['count'], color=mcolors.TABLEAU_COLORS)
plt.title('Tag Distribution', fontsize=15)
plt.xlabel('Tags', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.xticks(rotation='vertical', fontsize=15)
plt.show()

# Text pre-processing

In [None]:
import nltk
import re
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
def normalize_document(doc):
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    tokens = nltk.word_tokenize(doc)
    tokens_tags = nltk.pos_tag(tokens)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = []
    for token, tag in tokens_tags:
        if tag in ['RBS','RBR', 'RB']:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, pos='r'))
        elif tag in ['JJ','JJS','JJR']:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, pos='a'))
        elif tag in ['VB','VBP','VBZ','VBD','VBN','VBG']:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, pos='v'))
        elif tag in ['NN','NNS','NNP','NNPS']:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token, pos='n'))
        else:
            lemmatized_tokens.append(wordnet_lemmatizer.lemmatize(token))
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc

In [None]:
normalize_corpus = np.vectorize(normalize_document)
norm_corpus = normalize_corpus(df_train['Title'].values)

In [None]:
len(norm_corpus)

In [None]:
norm_corpus[0]

In [None]:
df_train['Title'].values[0]

# Token Vectorize

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
len(stop_words)

In [None]:
# Token Vectorize
cv = CountVectorizer(ngram_range=(1,2), min_df=10, max_df=0.75, stop_words=stop_words)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix.shape

# Extract TF-IDF Feature

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [None]:
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(cv_matrix)
tfidf_matrix = tfidf.toarray()

In [None]:
tfidf.shape

# Load Test Data

In [None]:
df_test = pd.read_xml('./JASSS_DATA/jasss/jasss_test1.xml')

In [None]:
df_test['Description'] = df_test['Abstract'] + ' ' + df_test['KeyWord'] + ' ' + df_test['Title']
df_test.dropna(inplace=True)

In [None]:
df_test = df_test[['Title', 'Description', 'KeyWord']]

In [None]:
for i in range(len(df_test.KeyWord)):
    tag_list = df_test['KeyWord'][i].split(', ')
    for j in range(len(tag_list)):
        tag = tag_list[j]
        if tag == 'Agent-Based Modelling':
            tag = 'Agent-Based Modeling'
        tag_list[j] = tag
    df_test['KeyWord'][i] = tag_list
df_test['KeyWord'][1]

In [None]:
x_test, y_test = df_test.Description, df_test.KeyWord

In [None]:
tag_dic_test = {}
for tag_list in df_test['KeyWord'].values:
    for tag in tag_list:
        if tag not in tag_dic_test:
            tag_dic_test[tag] = 1
        else:
            tag_dic_test[tag] += 1

df_tag_test = pd.DataFrame(list(tag_dic_test.items()), columns=['tag', 'count']).sort_values(by = 'count', axis = 0, ascending = False)
print('标签总数', len(df_tag_test))
df_tag_test.head(20)

# MultiLabel

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

In [None]:
x_train, y_train = df_train.Description, df_train.KeyWord

y_train = list(y_train.values)
for i in range(len(y_train)):
    y_train[i] = set(y_train[i])
y_train

In [None]:
y_test = list(y_test.values)
for i in range(len(y_test)):
    y_test[i] = set(y_test[i])
y_test

In [None]:
x_train = normalize_corpus(x_train.values)
x_test = normalize_corpus(x_test.values)


mlb = MultiLabelBinarizer(classes=sorted(tag_dic.keys()))
y_train = mlb.fit_transform(y_train)
y_test = mlb.fit_transform(y_test)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [None]:
def print_evaluation_scores(y_val, predicted):
    accuracy = accuracy_score(y_val, predicted)
    f1_score_macro = f1_score(y_val, predicted, average='macro', zero_division=0)
    f1_score_micro = f1_score(y_val, predicted, average='micro', zero_division=0)
    f1_score_weighted = f1_score(y_val, predicted, average='weighted', zero_division=0)
    print("accuracy:", accuracy)
    print("f1_score_macro:", f1_score_macro)
    print("f1_score_micro:", f1_score_micro)
    print("f1_score_weighted:", f1_score_weighted)

# Navie Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
pipe = Pipeline([
                    ('tfidf', TfidfVectorizer(min_df=5,max_df=0.9,ngram_range=(1,2),token_pattern='(\S+)')),
                    ('clf', OneVsRestClassifier(MultinomialNB())),
                ])

pipe.fit(x_train, y_train)
predicted = pipe.predict(x_test)
print(predicted)
print_evaluation_scores(y_test, predicted)

# Support Vector Machine

In [None]:
pipe = Pipeline([
        ('tfidf', TfidfVectorizer(min_df=5,max_df=0.9,ngram_range=(1,2),token_pattern='(\S+)')),
        ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1))
    ])

pipe.fit(x_train, y_train)
predicted = pipe.predict(x_test)
print(predicted)
print_evaluation_scores(y_test, predicted)

In [None]:
pipe.score(x_test, y_test)

In [None]:
mlb.inverse_transform(predicted)

# Logical Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
pipe = Pipeline([
        ('tfidf', TfidfVectorizer(min_df=5,max_df=0.9,ngram_range=(1,2),token_pattern='(\S+)')),
        ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=1))
    ])

pipe.fit(x_train, y_train)
predicted = pipe.predict(x_test)
print(predicted)
print_evaluation_scores(y_test, predicted)