# Sentential Relation Prediction
*LING 7800: Computational Models of Discourse*

This notebook tests our pre-processing and data-cleaning functions on a toy dataset.

In [8]:
import pandas as pd
import numpy as np
import warnings
import random

# Star import: EWN, PSRN and directNeighbors are called later in this notebook
# but never defined in it, so they presumably come from util.
# TODO(review): import them by name so their origin is explicit.
from util import *
# Suppress every warning from this point on; this also hides deprecation
# notices, so remove it temporarily when debugging.
warnings.filterwarnings('ignore')

### Testing EWN(), PSRN() and directNeighbors() Functions on Toy Data

    Example data set:

In [9]:
# Toy corpus: twelve sentence pairs spread over two files (3 and 4), small
# enough that every transformation below can be checked by eye.
SectionNumber = [0] * 12
FileNumber = [3] * 7 + [4] * 5
SentenceNumber = [3, 5, 6, 8, 13, 14, 22, 24, 25, 29, 5, 6]

# Placeholder text: row k holds the pair ("sentence_<k>a", "sentence_<k>b").
S1 = [f"sentence_{k}a" for k in range(1, 13)]
S2 = [f"sentence_{k}b" for k in range(1, 13)]

# Integer labels stored in the 'Level 1' / 'Level 2' columns, one per row.
Level_1 = [3, 3, 1, 3, 3, 1, 3, 1, 3, 2, 1, 0]
Level_2 = [0, 1, 2, 1, 1, 2, 0, 2, 1, 1, 2, 3]

# Dict keys become the column names, in insertion order.
example_df = pd.DataFrame(
    {
        'SectionNumber': SectionNumber,
        'FileNumber': FileNumber,
        'SentenceNumber': SentenceNumber,
        'S1': S1,
        'S2': S2,
        'Level 1': Level_1,
        'Level 2': Level_2,
    }
)
example_df.head(15)

Unnamed: 0,SectionNumber,FileNumber,SentenceNumber,S1,S2,Level 1,Level 2
0,0,3,3,sentence_1a,sentence_1b,3,0
1,0,3,5,sentence_2a,sentence_2b,3,1
2,0,3,6,sentence_3a,sentence_3b,1,2
3,0,3,8,sentence_4a,sentence_4b,3,1
4,0,3,13,sentence_5a,sentence_5b,3,1
5,0,3,14,sentence_6a,sentence_6b,1,2
6,0,3,22,sentence_7a,sentence_7b,3,0
7,0,4,24,sentence_8a,sentence_8b,1,2
8,0,4,25,sentence_9a,sentence_9b,3,1
9,0,4,29,sentence_10a,sentence_10b,2,1


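    This is how we want the EWN output to look (within each file, every S1 is prefixed with the previous row's S2 and every S2 is followed by the next row's S1):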

In [10]:
# Hand-built target for EWN on the toy data. Within one file, row k's S1 is
# "<previous row's b> <own a>" and its S2 is "<own b> <next row's a>"; the first
# row of a file keeps a bare S1 and the last row keeps a bare S2.
S1, S2 = [], []
for first, last in [(1, 7), (8, 12)]:  # rows 1-7 are file 3, rows 8-12 are file 4
    # The string that bridges rows k and k+1 is both S2 of row k and S1 of row k+1.
    joined = [f"sentence_{k}b sentence_{k + 1}a" for k in range(first, last)]
    S1 += [f"sentence_{first}a"] + joined
    S2 += joined + [f"sentence_{last}b"]

# Metadata and label columns are reused unchanged from the toy dataset cell.
example_EWN = pd.DataFrame(
    {
        'SectionNumber': SectionNumber,
        'FileNumber': FileNumber,
        'SentenceNumber': SentenceNumber,
        'S1': S1,
        'S2': S2,
        'Level 1': Level_1,
        'Level 2': Level_2,
    }
)
example_EWN.head(15)

Unnamed: 0,SectionNumber,FileNumber,SentenceNumber,S1,S2,Level 1,Level 2
0,0,3,3,sentence_1a,sentence_1b sentence_2a,3,0
1,0,3,5,sentence_1b sentence_2a,sentence_2b sentence_3a,3,1
2,0,3,6,sentence_2b sentence_3a,sentence_3b sentence_4a,1,2
3,0,3,8,sentence_3b sentence_4a,sentence_4b sentence_5a,3,1
4,0,3,13,sentence_4b sentence_5a,sentence_5b sentence_6a,3,1
5,0,3,14,sentence_5b sentence_6a,sentence_6b sentence_7a,1,2
6,0,3,22,sentence_6b sentence_7a,sentence_7b,3,0
7,0,4,24,sentence_8a,sentence_8b sentence_9a,1,2
8,0,4,25,sentence_8b sentence_9a,sentence_9b sentence_10a,3,1
9,0,4,29,sentence_9b sentence_10a,sentence_10b sentence_11a,2,1


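    This is how we want the PSRN output to look (S1 is prefixed with a randomly chosen sentence from earlier rows of the same file, and S2 is followed by a randomly chosen sentence from later rows of the same file):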

In [11]:
# Hand-built target for PSRN on the toy data. Within one file, row k's S1 is
# "<random sentence from an earlier row> <own a>" and its S2 is
# "<own b> <random sentence from a later row>". The first row of a file has no
# earlier rows and the last row has no later rows, so those cells stay bare.
# No seed is set in this cell, so the sampled neighbours differ between runs.


def _labels(rows):
    """Return the "a" and "b" sentence labels of every row number in ``rows``, in order."""
    return [f"sentence_{k}{half}" for k in rows for half in "ab"]


S1, S2 = [], []
for first, last in [(1, 7), (8, 12)]:  # rows 1-7 are file 3, rows 8-12 are file 4
    for k in range(first, last + 1):
        earlier = _labels(range(first, k))        # candidates for the S1 prefix
        later = _labels(range(k + 1, last + 1))   # candidates for the S2 suffix
        # An empty pool means a file boundary: emit the bare sentence with no
        # leading or trailing space (row 8 previously came out as " sentence_8a").
        S1.append(f"{random.choice(earlier)} sentence_{k}a" if earlier else f"sentence_{k}a")
        S2.append(f"sentence_{k}b {random.choice(later)}" if later else f"sentence_{k}b")

# Metadata and label columns are reused unchanged from the toy dataset cell.
example_PSRN = pd.DataFrame(list(zip(SectionNumber, FileNumber, SentenceNumber, S1, S2, Level_1, Level_2)), columns =['SectionNumber', 'FileNumber', 'SentenceNumber', 'S1', 'S2', 'Level 1', 'Level 2'])
example_PSRN[:15]

Unnamed: 0,SectionNumber,FileNumber,SentenceNumber,S1,S2,Level 1,Level 2
0,0,3,3,sentence_1a,sentence_1b sentence_5a,3,0
1,0,3,5,sentence_1b sentence_2a,sentence_2b sentence_6b,3,1
2,0,3,6,sentence_1b sentence_3a,sentence_3b sentence_7b,1,2
3,0,3,8,sentence_1b sentence_4a,sentence_4b sentence_6b,3,1
4,0,3,13,sentence_3a sentence_5a,sentence_5b sentence_6b,3,1
5,0,3,14,sentence_5b sentence_6a,sentence_6b sentence_7a,1,2
6,0,3,22,sentence_4b sentence_7a,sentence_7b,3,0
7,0,4,24,sentence_8a,sentence_8b sentence_9b,1,2
8,0,4,25,sentence_8b sentence_9a,sentence_9b sentence_12a,3,1
9,0,4,29,sentence_9b sentence_10a,sentence_10b sentence_11b,2,1


In [12]:
# Run EWN (not defined in this notebook; presumably star-imported from util) on
# the toy frame. The result is meant to match the hand-built example_EWN table.
# NOTE(review): the meaning of n=1 is not visible here — confirm against util.
test_EWN = EWN(example_df, n=1)
# Show up to the first 12 rows for a side-by-side visual comparison.
test_EWN[:12]

Unnamed: 0,SectionNumber,FileNumber,SentenceNumber,S1,S2,Level 1,Level 2
0,0,3,3,sentence_1a,sentence_1b sentence_2a,3,0
1,0,3,5,sentence_1b sentence_2a,sentence_2b sentence_3a,3,1
2,0,3,6,sentence_2b sentence_3a,sentence_3b sentence_4a,1,2
3,0,3,8,sentence_3b sentence_4a,sentence_4b sentence_5a,3,1
4,0,3,13,sentence_4b sentence_5a,sentence_5b sentence_6a,3,1
5,0,3,14,sentence_5b sentence_6a,sentence_6b sentence_7a,1,2
6,0,3,22,sentence_6b sentence_7a,sentence_7b,3,0
7,0,4,24,sentence_8a,sentence_8b sentence_9a,1,2
8,0,4,25,sentence_8b sentence_9a,sentence_9b sentence_10a,3,1
9,0,4,29,sentence_9b sentence_10a,sentence_10b sentence_11a,2,1


In [20]:
# Run PSRN (not defined in this notebook; presumably star-imported from util) on
# the toy frame. The sampled neighbours are random, so only the pattern can be
# compared with example_PSRN, not the exact strings.
# NOTE(review): the recorded output contains "sentence_5a sentence_2a" (S1 drawn
# from a later row) and "sentence_1b sentence_1b" (S2 drawn from the row itself).
# Both fall outside the pools used in the expected example — confirm whether
# PSRN is meant to sample from the whole file.
test_PSRN = PSRN(example_df, n=1)
# Show up to the first 12 rows.
test_PSRN[:12]

Unnamed: 0,SectionNumber,FileNumber,SentenceNumber,S1,S2,Level 1,Level 2
0,0,3,3,sentence_1a,sentence_1b sentence_1b,3,0
1,0,3,5,sentence_5a sentence_2a,sentence_2b sentence_1b,3,1
2,0,3,6,sentence_3a sentence_3a,sentence_3b sentence_5b,1,2
3,0,3,8,sentence_6a sentence_4a,sentence_4b sentence_4b,3,1
4,0,3,13,sentence_3a sentence_5a,sentence_5b sentence_3b,3,1
5,0,3,14,sentence_3a sentence_6a,sentence_6b sentence_5b,1,2
6,0,3,22,sentence_1a sentence_7a,sentence_7b,3,0
7,0,4,24,sentence_8a,sentence_8b sentence_10b,1,2
8,0,4,25,sentence_9a sentence_9a,sentence_9b sentence_10b,3,1
9,0,4,29,sentence_8a sentence_10a,sentence_10b sentence_10b,2,1


In [27]:
# Run directNeighbors (not defined in this notebook; presumably star-imported
# from util) on the toy frame.
# NOTE(review): the recorded output keeps only rows 1, 2, 4, 5, 7, 8, 10 and 11
# and wraps sentences in <s1>/<s2> tags. The kept rows look like the ones whose
# SentenceNumber is adjacent to a neighbour's — confirm the rule against util.
test_directNeighbors = directNeighbors(example_df, n=1)
# Show up to the first 12 rows.
test_directNeighbors[:12]

Unnamed: 0,SectionNumber,FileNumber,SentenceNumber,S1,S2,Level 1,Level 2
1,0,3,5,<s1>sentence_2a</s1>,<s2>sentence_2b</s2> <s2>sentence_3a</s2>,3,1
2,0,3,6,<s1>sentence_2b</s1> <s1>sentence_3a</s1>,<s2>sentence_3b</s2>,1,2
4,0,3,13,<s1>sentence_5a</s1>,<s2>sentence_5b</s2> <s2>sentence_6a</s2>,3,1
5,0,3,14,<s1>sentence_5b</s1> <s1>sentence_6a</s1>,<s2>sentence_6b</s2>,1,2
7,0,4,24,<s1>sentence_8a</s1>,<s2>sentence_8b</s2> <s2>sentence_9a</s2>,1,2
8,0,4,25,<s1>sentence_8b</s1> <s1>sentence_9a</s1>,<s2>sentence_9b</s2>,3,1
10,0,4,5,<s1>sentence_11a</s1>,<s2>sentence_11b</s2> <s2>sentence_12a</s2>,1,2
11,0,4,6,<s1>sentence_11b</s1> <s1>sentence_12a</s1>,<s2>sentence_12b</s2>,0,3
