# RNN

## Data

Load the DataFrame that was pre-processed in the previous stage.

In [14]:
import pandas as pd
import nltk

def custom_tokenize(text):
    """Strip digits and selected punctuation from ``text``, then tokenize it.

    Characters are filtered one at a time; whatever remains is joined back
    into a string and passed to ``nltk.word_tokenize``, whose token list is
    returned.
    """
    # Only single characters belong in this set: the filter compares one
    # character at a time, so multi-character entries such as "''" or "``"
    # could never match anything.
    dropped = frozenset("@#:.,;$%&'-()[]“”+/|—")
    kept = [ch for ch in text if not (ch.isdigit() or ch in dropped)]
    return nltk.word_tokenize(''.join(kept))

# Load the pre-processed sentence-level frame from its pickle.
# NOTE(review): `df_words` is reassigned further down when the private test
# set is flattened into sentences, so this value is only valid until then.
df_words = pd.read_pickle("./df/df_words.pkl")

### Split into training and validation sets: 80-20

In [17]:
import pandas as pd
# Restore the feature/label splits from their pickles, in a fixed order.
X_train, X_test, y_train, y_test = (
    pd.read_pickle(path)
    for path in (
        "./df/X_train.pkl",
        "./df/X_test.pkl",
        "./df/y_train.pkl",
        "./df/y_test.pkl",
    )
)

## Evaluate on Validation Set

In [1]:
from keras.models import load_model
# Restore the network from its HDF5 file.
model = load_model(filepath='./model/rnn_glove_additional.h5')

Using TensorFlow backend.








Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [2]:
import pandas as pd
# Reload the four split pickles: features first, then labels.
X_train, X_test = pd.read_pickle("./df/X_train.pkl"), pd.read_pickle("./df/X_test.pkl")
y_train, y_test = pd.read_pickle("./df/y_train.pkl"), pd.read_pickle("./df/y_test.pkl")

In [3]:
%%time
from keras.preprocessing.text import Tokenizer

# Keras tokenizer with a 20000-word cap and lower-casing enabled.
# The filter set is a raw string so the backslash before "]" is taken
# literally: in a plain literal, "\]" is an unrecognised escape sequence that
# Python 3.12+ reports with a SyntaxWarning. The runtime value is unchanged.
tokenizer = Tokenizer(num_words=20000,
                      filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      lower=True)

# Build the vocabulary from the training sentences only.
tokenizer.fit_on_texts(X_train['sentence'].values)

word_index = tokenizer.word_index

CPU times: user 952 ms, sys: 7.01 ms, total: 959 ms
Wall time: 959 ms


In [4]:
%%time

from keras.preprocessing.sequence import pad_sequences

# Encode each training sentence as word indices, then bring every sequence to
# one fixed length.
MAX_SEQ_LEN = 181
train_token_ids = tokenizer.texts_to_sequences(X_train['sentence'].values)
X_train_seq = pad_sequences(train_token_ids, maxlen=MAX_SEQ_LEN)
X_train_seq.shape

CPU times: user 916 ms, sys: 13.2 ms, total: 930 ms
Wall time: 929 ms


(43458, 181)

In [5]:
# One-hot encode the training labels and expose the result as a plain array.
y_train_onehot = pd.get_dummies(y_train)
y_train_series = y_train_onehot.values
y_train_series.shape

(43458, 6)

In [6]:
## encode the evaluation split the same way as the training split

val_token_ids = tokenizer.texts_to_sequences(X_test['sentence'].values)
X_test_seq = pad_sequences(val_token_ids, maxlen=181)  # same length as the training sequences

y_test_series = pd.get_dummies(y_test).values

# one row per evaluation sentence, 181 token ids each (shorter sentences are padded)
print(X_test_seq.shape)
y_test_series.shape

(10865, 181)


(10865, 6)

In [7]:
# Numeric side features (sentence position and sentence length) for both splits.
extra_feature_cols = ['position', 'sent_len']
X_train_other = X_train[extra_feature_cols].values
X_test_other = X_test[extra_feature_cols].values

In [10]:
import pickle

# Load the pickled lookup table used to turn class indices into label names.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('./label_lookup.pkl', mode='rb') as lookup_file:
    label_lookup = pickle.load(lookup_file)

In [11]:
# Show the class-index -> label-name mapping.
label_lookup

{3: 'OBJECTIVES',
 2: 'METHODS',
 1: 'CONCLUSIONS',
 0: 'BACKGROUND',
 5: 'RESULTS',
 4: 'OTHERS'}

In [12]:
# Score the evaluation split. `pred_result` must be computed here so the
# notebook runs top to bottom on a fresh kernel. The model is fed by input
# name: token sequences as 'word_input', numeric side features as 'other_in'.
# The first output holds one row of class scores per sentence.
pred_result = model.predict({'word_input': X_test_seq, 'other_in': X_test_other})
pred_result[0].shape

(10865, 6)

In [13]:
import numpy as np

def get_labels(seq):
    """Map each score vector in ``seq`` to the label of its highest-scoring class.

    For every element, the index of its maximum entry is looked up in the
    module-level ``label_lookup`` table. The labels are returned as a list in
    input order.
    """
    labels = []
    for scores in seq:
        labels.append(label_lookup[np.argmax(scores)])
    return labels

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Convert the first model output to label names and compare with the true labels.
y_pred = get_labels(pred_result[0])

validation_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
validation_report = classification_report(y_true=y_test, y_pred=y_pred)
print(validation_accuracy)
print(validation_report)

0.5766221813161528
              precision    recall  f1-score   support

  BACKGROUND       0.74      0.77      0.75      2718
 CONCLUSIONS       0.45      0.08      0.14      1044
     METHODS       0.57      0.57      0.57      2759
  OBJECTIVES       0.46      0.52      0.49      1799
      OTHERS       0.56      0.15      0.23       192
     RESULTS       0.52      0.66      0.58      2353

    accuracy                           0.58     10865
   macro avg       0.55      0.46      0.46     10865
weighted avg       0.57      0.58      0.56     10865



The accuracy score (≈0.577) seems higher than before.

In [62]:
import pickle

# save model & history
model.save('./model/rnn_glove_additional.h5')

# `history` is not defined anywhere in this notebook, so on a fresh kernel the
# dump would raise NameError. Skip it with a visible message in that case.
try:
    training_history = history
except NameError:
    training_history = None
    print("No training history in this session; history file not written.")

if training_history is not None:
    # The context manager guarantees the file is flushed and closed.
    with open("./model/rnn_glove_additional_history.pkl", "wb") as history_file:
        pickle.dump(training_history, history_file)

----

In [14]:
# Peek at the first rows of the evaluation features.
X_test.head()

Unnamed: 0,position,sentence,categories,sent_len
3419,1,The available bandwidths of modern wireless ne...,cs.MM,18
18762,2,"Motivated by the excellent human performance, ...",cs.SD/cs.LG/eess.AS/stat.ML,33
16492,0,We study networks of human decision-makers who...,cs.GT/cs.SI/cs.SY,17
21122,0,This paper seeks to combine differential game ...,cs.SY/math.OC,32
32679,5,Typical applications of link prediction are al...,cs.SI/physics.soc-ph,8


---

## Testing Data

In [15]:
import pandas as pd

# Which test set to label; the public alternative is 'task1_public_testset.csv'.
# NOTE: despite its name, `train_file` points at test data.
train_file = 'task1_private_testset.csv'
df = pd.read_csv(filepath_or_buffer=train_file)

In [16]:
# Inspect the raw test-set columns.
df.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date
0,T20001,"Smart ""Predict, then Optimize""",Many real-world analytics problems involve two...,Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22
1,T20002,On the variable hierarchy of first-order spectra,The spectrum of a first-order logic sentence i...,Kopczynski/Tan,cs.LO/cs.CC,2014-03-10
2,T20003,Guaranteed Sufficient Decrease for Stochastic ...,"In this paper, we propose a novel sufficient d...",Shang/Liu/Zhou/Cheng/Ng/Yoshida,stat.ML/cs.DS/cs.LG/math.OC,2018-02-25
3,T20004,Non Binary Local Gradient Contours for Face Re...,As the features from the traditional Local Bin...,Gubbi/Azeem/Kumari,cs.CV,2014-11-03
4,T20005,Tree-Based Optimization: A Meta-Algorithm for ...,Designing search algorithms for finding global...,Ghojogh/Sharifian/Mohammadzade,cs.NE,2018-09-24


In [17]:
# Split each abstract on the literal '$$$' marker into a list of sentences.
df['sent'] = df['Abstract'].map(lambda abstract: abstract.split('$$$'))
df.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,sent
0,T20001,"Smart ""Predict, then Optimize""",Many real-world analytics problems involve two...,Elmachtoub/Grigas,math.OC/cs.LG/stat.ML,2017-10-22,[Many real-world analytics problems involve tw...
1,T20002,On the variable hierarchy of first-order spectra,The spectrum of a first-order logic sentence i...,Kopczynski/Tan,cs.LO/cs.CC,2014-03-10,[The spectrum of a first-order logic sentence ...
2,T20003,Guaranteed Sufficient Decrease for Stochastic ...,"In this paper, we propose a novel sufficient d...",Shang/Liu/Zhou/Cheng/Ng/Yoshida,stat.ML/cs.DS/cs.LG/math.OC,2018-02-25,"[In this paper, we propose a novel sufficient ..."
3,T20004,Non Binary Local Gradient Contours for Face Re...,As the features from the traditional Local Bin...,Gubbi/Azeem/Kumari,cs.CV,2014-11-03,[As the features from the traditional Local Bi...
4,T20005,Tree-Based Optimization: A Meta-Algorithm for ...,Designing search algorithms for finding global...,Ghojogh/Sharifian/Mohammadzade,cs.NE,2018-09-24,[Designing search algorithms for finding globa...


In [18]:
# Sanity check: both columns should report the same number of rows.
for column in ('sent', 'Categories'):
    print(len(df[column].values))

20000
20000


In [19]:
# One tuple per sentence: (sentence, position_in_abstract, abstract Id).
# The Id column is paired with the sentence lists through zip, so it is
# extracted once rather than on every inner iteration.
sent_dic = [(sentence, position, abstract_id)
            for abstract_id, sentences in zip(df['Id'].values, df['sent'].values)
            for position, sentence in enumerate(sentences)]
sent_dic[0]

('Many real-world analytics problems involve two significant challenges: prediction and optimization.',
 0,
 'T20001')

In [20]:
# Second flattened entry: (sentence, position, Id).
sent_dic[1]

('Due to the typically complex nature of each challenge, the standard paradigm is to predict, then optimize.',
 1,
 'T20001')

In [21]:
# Group the (position, sentence) pairs by abstract Id, in encounter order.
sent_dict = {}
for sentence, position, abstract_id in sent_dic:
    sent_dict.setdefault(abstract_id, []).append((position, sentence))

In [22]:
# Flatten the per-abstract groups into one row per sentence.
sentence_rows = [
    (abstract_id, position, sentence)
    for abstract_id, pairs in sent_dict.items()
    for position, sentence in pairs
]
df_words = pd.DataFrame(sentence_rows, columns=['Id', 'position', 'sentence'])
df_words.head(10)

Unnamed: 0,Id,position,sentence
0,T20001,0,Many real-world analytics problems involve two...
1,T20001,1,Due to the typically complex nature of each ch...
2,T20001,2,"By and large, machine learning tools are inten..."
3,T20001,3,"In contrast, we propose a new and very general..."
4,T20001,4,A key component of our framework is the SPO lo...
5,T20001,5,Training a model with respect to the SPO loss ...
6,T20001,6,We also propose a stochastic gradient descent ...
7,T20001,7,"Finally, we perform computational experiments ..."
8,T20002,0,The spectrum of a first-order logic sentence i...
9,T20002,1,In this paper we study the hierarchy of first-...


In [23]:
# Sentence length = number of pieces obtained by splitting on single spaces.
df_words['sent_len'] = [len(sentence.split(' ')) for sentence in df_words['sentence']]

In [24]:
# Check the frame now that the sent_len column has been added.
df_words.head(10)

Unnamed: 0,Id,position,sentence,sent_len
0,T20001,0,Many real-world analytics problems involve two...,11
1,T20001,1,Due to the typically complex nature of each ch...,17
2,T20001,2,"By and large, machine learning tools are inten...",28
3,T20001,3,"In contrast, we propose a new and very general...",33
4,T20001,4,A key component of our framework is the SPO lo...,34
5,T20001,5,Training a model with respect to the SPO loss ...,42
6,T20001,6,We also propose a stochastic gradient descent ...,35
7,T20001,7,"Finally, we perform computational experiments ...",21
8,T20002,0,The spectrum of a first-order logic sentence i...,20
9,T20002,1,In this paper we study the hierarchy of first-...,16


In [25]:
# Persist the sentence-level frame built from the private test set.
df_words.to_pickle("./df/X_testing_private.pkl") 

In [26]:
# Total number of sentences to label.
len(df_words['sentence'])

131782

----

### Predict

In [27]:
# Encode the test sentences with the already-fitted tokenizer.
testing_token_ids = tokenizer.texts_to_sequences(df_words['sentence'].values)

# Bring every sequence to 181 tokens, the length used for the training records.
X_testing_seq = pad_sequences(testing_token_ids, maxlen=181)

X_testing_seq.shape

(131782, 181)

In [28]:
# Numeric side features for the test sentences.
side_feature_cols = ['position', 'sent_len']
X_testing_other = df_words[side_feature_cols].values

In [29]:
# Run the model on the test sentences; inputs are passed by name
# ('word_input' for the token sequences, 'other_in' for the numeric features).
pred_result_testing = model.predict({'word_input': X_testing_seq, 'other_in': X_testing_other})

In [30]:
# Shape of the first model output.
pred_result_testing[0].shape

(131782, 6)

In [31]:
# Convert the first model output into label names via get_labels.
y_testing = get_labels(pred_result_testing[0])

In [32]:
# Attach the predicted label to every sentence row.
df_words = df_words.assign(label=y_testing)
df_words.head(10)

Unnamed: 0,Id,position,sentence,sent_len,label
0,T20001,0,Many real-world analytics problems involve two...,11,BACKGROUND
1,T20001,1,Due to the typically complex nature of each ch...,17,BACKGROUND
2,T20001,2,"By and large, machine learning tools are inten...",28,BACKGROUND
3,T20001,3,"In contrast, we propose a new and very general...",33,METHODS
4,T20001,4,A key component of our framework is the SPO lo...,34,METHODS
5,T20001,5,Training a model with respect to the SPO loss ...,42,RESULTS
6,T20001,6,We also propose a stochastic gradient descent ...,35,METHODS
7,T20001,7,"Finally, we perform computational experiments ...",21,RESULTS
8,T20002,0,The spectrum of a first-order logic sentence i...,20,BACKGROUND
9,T20002,1,In this paper we study the hierarchy of first-...,16,OBJECTIVES


In [33]:
# Persist the frame with the predicted labels attached.
df_words.to_pickle("./df/X_testing_label_private.pkl") 

### To submission format

In [34]:
# Preview the submission ids: "<abstract Id>_S<1-based sentence number>", with
# the number zero-padded to three digits. Using :03d also stays correct for
# sentence numbers of 100 and above, where a fixed "_S0" prefix would yield a
# four-digit suffix.
for index, row in df_words.head(10).iterrows():
    sentence_no = int(row['position']) + 1
    print('{}_S{:03d}'.format(row['Id'], sentence_no), row['label'])

T20001_S001 BACKGROUND
T20001_S002 BACKGROUND
T20001_S003 BACKGROUND
T20001_S004 METHODS
T20001_S005 METHODS
T20001_S006 RESULTS
T20001_S007 METHODS
T20001_S008 RESULTS
T20002_S001 BACKGROUND
T20002_S002 OBJECTIVES


In [35]:
# One-hot vector for every label; slot order is BACKGROUND, OBJECTIVES,
# METHODS, RESULTS, CONCLUSIONS, OTHERS.
_SUBMISSION_ORDER = ['BACKGROUND', 'OBJECTIVES', 'METHODS', 'RESULTS', 'CONCLUSIONS', 'OTHERS']
label_to_submission = {
    label: [1 if slot == hot else 0 for slot in range(len(_SUBMISSION_ORDER))]
    for hot, label in enumerate(_SUBMISSION_ORDER)
}

In [36]:
# Preview the submission rows: id plus the one-hot vector of the predicted
# label. The sentence number is zero-padded with :03d so that numbers of 100
# and above still get a three-digit suffix instead of "S0100".
for index, row in df_words.head(10).iterrows():
    sentence_no = int(row['position']) + 1
    print('{}_S{:03d}'.format(row['Id'], sentence_no), label_to_submission[row['label']])

T20001_S001 [1, 0, 0, 0, 0, 0]
T20001_S002 [1, 0, 0, 0, 0, 0]
T20001_S003 [1, 0, 0, 0, 0, 0]
T20001_S004 [0, 0, 1, 0, 0, 0]
T20001_S005 [0, 0, 1, 0, 0, 0]
T20001_S006 [0, 0, 0, 1, 0, 0]
T20001_S007 [0, 0, 1, 0, 0, 0]
T20001_S008 [0, 0, 0, 1, 0, 0]
T20002_S001 [1, 0, 0, 0, 0, 0]
T20002_S002 [0, 1, 0, 0, 0, 0]


In [37]:
import csv

# Write one CSV row per sentence: the order id followed by the six one-hot
# label columns.
with open('submission_private.csv', 'w', newline='') as csvfile:

    writer = csv.writer(csvfile)
    writer.writerow(['order_id', 'BACKGROUND', 'OBJECTIVES', 'METHODS', 'RESULTS', 'CONCLUSIONS', 'OTHERS'])

    # Iterating the three needed columns in lockstep avoids building a Series
    # per row, which iterrows() would do.
    for abstract_id, position, label in zip(df_words['Id'], df_words['position'], df_words['label']):
        # 1-based sentence number, zero-padded to three digits; :03d keeps
        # numbers of 100 and above as "S100" rather than "S0100".
        order_id = '{}_S{:03d}'.format(abstract_id, int(position) + 1)
        writer.writerow([order_id] + list(label_to_submission[label]))
        
