# Import modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import spacy
import nltk
import re
from tqdm import tqdm
from nltk import pos_tag_sents

# nlp = spacy.load("en_core_web_sm")
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\takhu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\takhu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\takhu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pickle
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

# Functions

In [3]:
def clean_data(text):
  if type(text) != float:
    text = text.encode("utf-8", errors='ignore').decode("utf-8")
    text = re.sub("({.*})+", '', text)
    text = re.sub("(<.*>)+", "'", text)
    text = re.sub("[\"*]", "'", text)
    text = re.sub("[\'*]", '', text)
    return text.strip()
  else:
    return "NaN"


def remove_special_character(x):
  """
    Input:
      - x: string
    Output: string that already removed special character
  """
  if type(x) != float:
    temp = re.sub('[^A-Za-z0-9]+', ' ', x)
    return temp
  else:
    return "NaN"
def common(x, y):
  """
    Input:
      - x: set, list
      - y: set, list
    Output: set of common words of x and y
  """
  x = set(x)
  y = set(y)
  return x&y

def get_vocab(sentence, rm_character = False, split_by = " "):
  """
    Input:
      - sentence: string
      - rm_character: (True, False) Remove specical character or not. Default is False
      - split_by: split string by character. Default is " "
    Output: Vocabulary set of sentence
  """
  vocab = {""}
  sentence = str(sentence)
  if rm_character == True:
    sentence = remove_special_character(sentence)
  for word in sentence.split(split_by):
    vocab.add(word)
  return vocab

def get_weighted_vocab(sentence, rm_character = False, split_by = " "):
  """
    Input:
      - sentence: string, list, array
      - rm_character: (True, False) Remove specical character or not. Default is False
      - split_by: split string by character. Default is " "
    Output: (Dictionary) Weighted vocabulary set of sentence
  """
  vocab = {}
  if type(sentence) == str:
    if rm_character == True:
      sentence = remove_special_character(sentence)
    for word in sentence.split(split_by):
      if word not in vocab:
        vocab.update({word: 1})
      else:
        vocab[word] += 1
  else:
    if len(sentence) != 0:
      for word in sentence:
        if word not in vocab:
          vocab.update({word: 1})
        else:
          vocab[word] += 1
    else:
      return {}
  return vocab

def weighted_common(x, y):
  """
    Input:
      - x: (dictionary) weighted vocab
      - y: (dictionary) weighted vocab
    Output: set of common words of x and y (weight x + weight y)
  """
  common = {}
  len_x = len(x)
  len_y = len(y)
  if len_x < len_y:
    for key in x.keys():
      if key in y:
        common.update({key: x[key] + y[key]})
  else:
      for key in y.keys():
        if key in x:
          common.update({key: x[key] + y[key]})
  return common

def mean_weight(x):
  """
    Input:
      - x: (dictionary) weighted vocab
    Output: mean weight
  """
  if len(x.values()) == 0:
    return 0
  return sum(x.values())/len(x.values())

def get_sentence_pair(idx, pairset, nodeset):
  """
    Input:
      - idx: index of pair in pairset
      - pairset:  dataset contain pairs
      - nodeset: dataset contains nodes
    Output: pair of sentence
  """
  id1 = pairset.loc[idx]['id1']
  id2 = pairset.loc[idx]['id2']
  s1 = str(nodeset[nodeset['id'] == id1].iloc[0,1])
  s2 = str(nodeset[nodeset['id'] == id2].iloc[0,1])
  return s1, s2

def get_postag_pair(idx, pairset, nodeset):
  """
    Input:
      - idx: index of pair in pairset
      - pairset:  dataset contain pairs
      - nodeset: dataset contains nodes
    Output: pair of postag
  """
  id1 = pairset.loc[idx]['id1']
  id2 = pairset.loc[idx]['id2']
  pt1 = nodeset['embedded_postag'][id1]
  pt2 = nodeset['embedded_postag'][id2]
  return pt1, pt2

def postagging(text, type = "nltk"):
  """
    Input:
      - text: string
      - type = ("nltk", "spacy") Postagger. Default is nltk
    Output: list of postag pair (tuple)
  """
  if type == "nltk":
    tokens = nltk.word_tokenize(text)
    tag = nltk.pos_tag(tokens)
    return tag
  elif type == "spacy":
    doc = nlp(text)
    postag = [(x, x.pos_) for x in doc]
    return postag

def postag_format(x):
  """
  Input:
    - x: (string) line from saved postag csv
  Output: list of list (word, postag)
  """
  temp = x.split("(")
  temp = temp[1:len(temp)]
  postag = []
  for i in temp:
    i = i.replace('\'',"")\
              .replace(")","")\
              .replace("\"","")\
              .replace(" ","").split(',')
    i = i[0:2]
    postag.append(i)
  return postag

def embedded_format(x):
  res = re.findall(r"\[\s*\+?(-?\d+)\s*\.\]", x)
  # return remove_special_character(x).split(" ")[1:-1]
  return res

def summarize_weighted(x, y):
  """
    Input:
      - x: (dictionary) weighted vocab
      - y: (dictionary) weighted vocab
    Output: dict of total number of common words between x and y (weight x + weight y)
  """
  len_x = len(x)
  len_y = len(y)
  if len_x > len_y:
    for key in x.keys():
      if key in y:
        x[key] = x[key] + y[key]
    return x
  else:
    for key in y.keys():
      if key in x:
        y[key] = y[key] + x[key]
    return y

def postag_embedding(sentence, list_postag):
  """
  Input:
    - sentence: (string) sentence that needs to be embedded
    - list_postag: (1D iteration) POSTAG to embed
  Output: Embedding vector
  """

  embedding = np.zeros((len(list_postag),1))

  pt = postagging(remove_special_character(sentence), type = "nltk")
  try:
    pt = get_weighted_vocab(np.array(pt).T[1], True)
  except:
    pt = {}

  # print(pt)

  for i, postag in enumerate(list_postag):
    if postag in list(pt.keys()):
      embedding[i] = pt[postag]
  return embedding

def name_postag(x, number):
  return [i + number for i in x]

# Import dataset

In [4]:
train = pd.read_csv("train.csv").drop(['id'], axis=1)
train

Unnamed: 0,id1,id2,label
0,9202,9202,1
1,410411,460254,0
2,211858,312074,1
3,253901,504325,0
4,415071,63239,0
...,...,...,...
948227,177447,82731,1
948228,141580,396615,0
948229,817680,271396,0
948230,756771,322480,0


In [5]:
test = pd.read_csv("test.csv").drop(['id'], axis=1)

In [6]:
nodes = pd.read_csv("nodes.tsv", sep = "\t", index_col=None)
nodes

Unnamed: 0,id,text
0,1,{{infobox person | name = clayton jacobson | i...
1,2,a '''cobra probe''' is a device to measure the...
2,3,the '''harmon foundation''' was established in...
3,4,'''structured finance''' is a sector of financ...
4,5,'''al-shohada'a stadium''' is a multi-use stad...
...,...,...
837829,836621,{{infobox publisher | name = houghton mifflin ...
837830,836622,{{infobox former subdivision |native_name = }}...
837831,836623,"'''janjira''' may refer to: * janjira state, a..."
837832,344599,'''serendip''' ('''search for extraterrestrial...


# Embed dataset

In [7]:
list_weighted_common_pos = []
for idx in tqdm(range(len(train))):
  sen1, sen2 = get_sentence_pair(idx, train, nodes)
  p1 = postagging(clean_data(sen1), type = "nltk")
  p2 = postagging(clean_data(sen2), type = "nltk")
  try:
    t1 = get_weighted_vocab(np.array(p1).T[1], True)
    t2 = get_weighted_vocab(np.array(p2).T[1], True)
  except:
    t1 = {}
    t2 = {}
  common_weighted = weighted_common(t1, t2)
  list_weighted_common_pos.append(common_weighted)

100%|████████████████████████████████████████████████████████████████████████| 948232/948232 [2:46:17<00:00, 95.04it/s]


In [8]:
list_contain_postag_label1 = []
for i in tqdm(range(len(list_weighted_common_pos))):
  if train['label'][i] == 1:
    list_contain_postag_label1 += list(list_weighted_common_pos[i].keys())

100%|██████████████████████████████████████████████████████████████████████| 948232/948232 [00:07<00:00, 126497.62it/s]


In [9]:
postag = list(set(list_contain_postag_label1))

In [10]:
node_embedding = np.zeros((len(nodes), 1 + len(postag)))

In [11]:
for i in tqdm(range(len(nodes))):
  node_embedding[i][0] = nodes['id'][i]
  node_embedding[i][1:45] = postag_embedding(nodes['text'][i], postag).reshape(node_embedding.shape[1] - 1 ,)

100%|███████████████████████████████████████████████████████████████████████| 837834/837834 [1:24:50<00:00, 164.60it/s]


In [12]:
df_node_embedding = pd.DataFrame(node_embedding)
df_node_embedding.columns = ["id"] + postag
df_node_embedding

Unnamed: 0,id,CD,IN,FW,JJR,WP$,$,'',",",EX,...,RBR,VBN,VBG,``,WP,:,DT,WDT,UH,RB
0,1.0,6.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,2.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0
1,2.0,13.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,7.0,1.0,0.0,0.0,0.0,26.0,2.0,0.0,4.0
2,3.0,12.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11.0,5.0,0.0,1.0,0.0,24.0,1.0,0.0,4.0
3,4.0,1.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,6.0
4,5.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
837829,836621.0,8.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
837830,836622.0,27.0,68.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11.0,10.0,0.0,0.0,0.0,55.0,1.0,0.0,11.0
837831,836623.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,2.0
837832,344599.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0


In [13]:
embed_postag = list(df_node_embedding.columns)
embed_postag.remove("id")
embedded_nodeset = df_node_embedding[embed_postag]

## Make trainset embeded

In [14]:
train_embedded_test = []

for i in tqdm(range(len(train))):
  idx1 = train['id1'][i]
  idx2 = train['id2'][i]
  train_embedded_test.append(list(embedded_nodeset.loc[idx1]) + list(embedded_nodeset.loc[idx2]))

100%|████████████████████████████████████████████████████████████████████████| 948232/948232 [01:43<00:00, 9149.83it/s]


In [15]:
train_df = pd.DataFrame(train_embedded_test)
train_df.columns = name_postag(embed_postag, "1") + name_postag(embed_postag, "2")
train_df = pd.concat([train_df, pd.DataFrame(train['label'])], axis =1).fillna(0)
train_df

Unnamed: 0,CD1,IN1,FW1,JJR1,WP$1,$1,''1,",1",EX1,PDT1,...,VBN2,VBG2,``2,WP2,:2,DT2,WDT2,UH2,RB2,label
0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1
1,2.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0
2,4.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,1
3,11.0,30.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0
4,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,7.0,1.0,0.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948227,4.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,5.0,0.0,0.0,2.0,1
948228,2.0,28.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,7.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0,0
948229,3.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,0
948230,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,0


## Make testset embedded

In [16]:
test_embedded_test = []

for i in tqdm(range(len(test))):
  idx1 = test['id1'][i]
  idx2 = test['id2'][i]
  test_embedded_test.append(list(embedded_nodeset.loc[idx1]) + list(embedded_nodeset.loc[idx2]))

100%|████████████████████████████████████████████████████████████████████████| 238364/238364 [00:26<00:00, 9118.91it/s]


In [17]:
test_df = pd.DataFrame(test_embedded_test).fillna(0)
test_df.columns = name_postag(embed_postag, "1") + name_postag(embed_postag, "2")
test_df

Unnamed: 0,CD1,IN1,FW1,JJR1,WP$1,$1,''1,",1",EX1,PDT1,...,RBR2,VBN2,VBG2,``2,WP2,:2,DT2,WDT2,UH2,RB2
0,8.0,10.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
1,14.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
2,17.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,6.0,0.0,0.0,3.0
4,8.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238359,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,3.0,0.0,0.0,0.0,10.0,1.0,0.0,3.0
238360,10.0,46.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0
238361,5.0,9.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,4.0,0.0,0.0,0.0,25.0,1.0,0.0,3.0
238362,17.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,2.0,0.0,0.0,0.0,6.0,0.0,0.0,2.0


# Create the trainset and validation set

In [18]:
test_label = pd.read_csv("test.csv")

In [19]:
test_id = test_label['id']

In [20]:
data = train_df
test = test_df

In [21]:
y = np.array(data['label'])
y

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [22]:
def de_weighted(x):
  if x > 0:
    return 1
  if x == 0:
    return 0

In [25]:
postag = list(data.columns)
postag.remove("label")

X = np.array(data[postag])
X

array([[ 2.,  6.,  0., ...,  0.,  0.,  1.],
       [ 2.,  3.,  1., ...,  0.,  0.,  1.],
       [ 4., 49.,  0., ...,  0.,  0.,  1.],
       ...,
       [ 3., 15.,  0., ...,  1.,  0.,  0.],
       [ 1.,  3.,  0., ...,  0.,  0.,  2.],
       [10., 17.,  0., ...,  0.,  0.,  1.]])

In [26]:
split_ratio = 0.1

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio, random_state=42)

# Training random forest models

In [30]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=150, random_state=0)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

In [31]:
print(classification_report(y_test, y_pred, digits = 16))

                  precision    recall  f1-score   support

               0  1.0000000000000000 1.0000000000000000 1.0000000000000000     51198
               1  1.0000000000000000 1.0000000000000000 1.0000000000000000     43626

        accuracy                      1.0000000000000000     94824
       macro avg  1.0000000000000000 1.0000000000000000 1.0000000000000000     94824
    weighted avg  1.0000000000000000 1.0000000000000000 1.0000000000000000     94824



# Predict the result on test set and export to file

In [32]:
y_pred_on_test = forest.predict(np.array(test))
submission = pd.DataFrame(np.array([list(test_id), list(y_pred_on_test)]).T)
submission.columns = ["id", "label"]

In [33]:
submission.to_csv("submission_forest_nltkfull.csv", index=False)