In [1]:
import pandas as pd
import numpy as np
import glob
import os
import json
import csv
import math
import scipy.sparse
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from bs4 import BeautifulSoup
import requests

In [2]:
pd.set_option('display.max_colwidth', -1)

## Check out results of extracting XML attributes

In [3]:
texts = pd.read_csv('data/raw/texts.csv', encoding='ISO-8859-1')
texts.iloc[[100]]

Unnamed: 0,text_id,text
100,kbh-fragment01,"The two companies concerned are trying to record the spoken word.\r\nWe do n't want you on tape.\r\nSo the thing, the thing to do is to sort of dot around in the conversation things like antidisestablishmentarianism.\r\nAnd hopefully\r\nThis will get modern usage of words.\r\nOf, one of the words that she was talking about, people have started using wicked for a normal phrase.\r\nI mean, it does n't mean what it says in the dictionary any more.\r\nThat 's a really wicked thing to say, he went, no it 's not it 's bad, it 's not wicked.\r\nHe thinks of wicked as being good, he does n't, cause Mandy says wicked.\r\nYeah.\r\nWell that 's the idea anyway.\r\nHe goes round saying wicked, it 's his favourite word.\r\nAre you going to go to sleep, eh?\r\nPlease That does n't mean much, does it?\r\nShe 's just looking over there to see which one she wants to throw up on.\r\nWho do you want to go and gurgle at?\r\nEh?\r\nOh not Uncle Brian, no no no.\r\nNo?\r\nHave you had enough grub for a little while?\r\nThere you are, go and see somebody.\r\nshe 's got a ginormous one.\r\nShe had it.\r\nWell do n't to me I 've only just you\r\nCome on, give Auntie Pauline a nice smile.\r\nGo on, you do lovely smiles.\r\nIf she smiles it will be at your hairline.\r\nPardon?\r\nShe wo n't be able to see that far, will she?\r\nShe might be able to see a blurred shape.\r\nshe 's just looked this way now.\r\nShe kept looking over there.\r\nDo n't know what she was looking at.\r\nWhat were you looking at?\r\nWell she definitely looks at her mobile.\r\nYeah?\r\nI know it drives you mad, the music but er\r\nI would n't mind if those blasted slightly longer life in terms of the music they played.\r\nBut unfortunately, you know, it 's or whatever.\r\nBut it 's over again.\r\nWell, yes.\r\nWell I mean that 's what 's nice for them, is n't it?\r\nYeah.\r\nYou see they 've got a short concentration.\r\nDo you think music in the background sort of, do all the lyrics as well.\r\nOh of course Oh well done.\r\nthe last half hour.\r\na little bit was n't it?\r\nShe 's had a lot of that today.\r\nHave you?\r\nWell do n't put it on too loud, will you?\r\nwhy babies have a suck there.\r\nYou say they 're having milk, he went, what do you mean?\r\nMilk?\r\nWhat do you mean milk comes out there?\r\nSo I said well it 's only when you 've just had a baby like, you know, like cows.\r\nLove him."


In [4]:
texts[texts['text'].isnull()]

Unnamed: 0,text_id,text


In [5]:
sentences = pd.read_csv('data/raw/sentences.csv', encoding='ISO-8859-1')
display(len(sentences))
display(sentences.sample(5))
sentences.tail(3)

16183

Unnamed: 0,sentence_id,paragraph_id,sentence
4864,4864,1437, I 'm sure she ca n't do justice to your designs.
3880,3880,1183,"fossil fuel combustion, for instance, has led to increased concentrations of sulphate in precipitation( acid rain') and many rivers, lakes and estuaries have been greatly affected by phosphates from agricultural, urban or industrial sources."
11699,11699,-1,Bye!
12164,12164,-1,That 's an old trick.
523,523,183,"Dr Runcie repeated his offer of a limited primacy, involving the limited powers exercised by the popes in the first few centuries of the Christian Church."


Unnamed: 0,sentence_id,paragraph_id,sentence
16180,16180,-1,Some else's sitting at their desk?
16181,16181,-1,Well not you know cleaning so I do n't know what kind of work they do.
16182,16182,-1,Oh well if you 're here that 's all right


In [6]:
len(sentences[sentences['sentence'].isnull()])

0

In [22]:
all_words = pd.read_csv('data/raw/words.csv', encoding='ISO--8859-1', na_filter=False)
display(len(all_words))
all_words.head()

204834

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id
0,0,Latest,late,AJS,,,0,a1e-fragment01
1,1,corporate,corporate,AJ0,,,0,a1e-fragment01
2,2,unbundler,unbundler,NN1,,,0,a1e-fragment01
3,3,reveals,reveal,VVZ,mrw,met,0,a1e-fragment01
4,4,laid-back,laid-back,AJ0,,,0,a1e-fragment01


In [23]:
all_words['text_id'].nunique()

117

In [9]:
display(all_words[all_words['word']==''])
all_words[all_words['word'].isnull()]

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id


Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id


## Partition exploration set and final evaluation set using genre tags

Get BNC-Baby genres and tags

In [10]:
html_doc = requests.get('http://www.natcorp.ox.ac.uk/corpus/baby/thebib.html').content

soup = BeautifulSoup(html_doc, 'html.parser')

In [11]:
genre_tags = {
    'newspapers': set(),
    'fiction': set(),
    'academic': set(),
    'conversation': set()
}

genre_name_map = {
    'Academic prose': 'academic',
    'Spontaneous conversation': 'conversation',
    'Prose fiction': 'fiction',
    'Newspapers': 'newspapers'
}

for p in soup.find_all('p'):
    genre = p.text
    if genre in genre_name_map.keys():
        ids = p.next_sibling.next_sibling
        assert ids.name == 'dl'
        for id in ids.findChildren("dt" , recursive=False):
            genre_tags[genre_name_map[genre]].add(id['id'].lower())

print(genre_tags)

{'newspapers': {'a3m', 'k3c', 'a8p', 'a8t', 'a97', 'k5b', 'k2b', 'cel', 'ahd', 'cbm', 'al0', 'a4d', 'a9g', 'ch3', 'a8r', 'a3e', 'k4u', 'a1e', 'a9j', 'a8n', 'e9s', 'k3d', 'al5', 'a91', 'a9y', 'a1j', 'ahb', 'k29', 'ajg', 'k5e', 'a1p', 'a5e', 'a2d', 'a7w', 'k37', 'aar', 'k3b', 'ahc', 'a84', 'k2a', 'cbe', 'a7t', 'a8l', 'a38', 'k38', 'cbd', 'a80', 'k36', 'k4y', 'a1m', 'aa3', 'k2n', 'a82', 'a8m', 'al2', 'cfc', 'a1n', 'k3a', 'aam', 'a8u', 'a1f', 'k2e', 'a9p', 'a31', 'a98', 'k2c', 'a1x', 'a3c', 'a1g', 'k58', 'k4r', 'k5c', 'k5k', 'aa6', 'a1u', 'a36', 'ahl', 'ajf', 'aj1', 'k4s', 'a1l', 'ahe', 'a8s', 'a7s', 'ajw', 'a39', 'a7x', 'a1h', 'k39', 'ahh', 'ahf', 'a3k', 'bm4', 'a9x', 'a3p', 'a7y', 'a1k'}, 'fiction': {'g0y', 'gvl', 'fpb', 'h9d', 'bpa', 'j54', 'g0s', 'cb5', 'guu', 'g01', 'ccw', 'bmw', 'k8v', 'h9c', 'h85', 'j10', 'ac2', 'cfy', 'cdb', 'fet', 'c8t', 'g0l', 'ab9', 'faj', 'hr9'}, 'academic': {'clp', 'fc1', 'f9v', 'hxh', 'as6', 'fss', 'ft1', 'b2k', 'ecv', 'fef', 'j57', 'crs', 'b1g', 'b17', 'hwv'

Add genre and tag info to DataFrame.

In [24]:
all_words['text_tag'] = all_words['text_id'].apply(lambda x: x.split('-')[0])
all_words.sample(5)

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_tag
106846,106846,chair,chair,NN1,,,5592,ccw-fragment03,ccw
69971,69971,ON,on,PRP,mrw,met,3592,as6-fragment02,as6
150266,150266,instructor,instructor,NN1,,,8124,fpb-fragment01,fpb
121564,121564,forces,force,NN2,,,6441,clw-fragment01,clw
157797,157797,the,the,AT0,,,8972,kb7-fragment10,kb7


In [63]:
all_words['genre'] = all_words['text_tag'].apply(lambda x: tag_to_genre_map[x])
all_words.sample(5)

Unnamed: 0,word_id,word,lemma,word_type,function,seg_type,sentence_id,text_id,text_tag,genre
60718,60718,of,of,PRF,,,3193,alp-fragment01,alp,academic
56426,56426,Donations,donation,NN2,,,2996,ahl-fragment02,ahl,newspapers
24290,24290,come,come,VVN,mrw,met,1228,a5e-fragment06,a5e,newspapers
186406,186406,to,to,TO0,,,13032,kbp-fragment09,kbp,conversation
798,798,a,a,AT0,,,40,a1f-fragment07,a1f,newspapers


Pick random sample of text IDs according to Klebanov 2014 test size to form test set.

In [134]:
sample_sizes = {
    'academic': 6,
    'conversation': 4,
    'fiction': 3,
    'newspapers': 14
}

test_texts = {
    'academic': [],
    'conversation': [],
    'fiction': [],
    'newspapers': []
}

for genre, group in all_words[['text_id', 'genre']].drop_duplicates().groupby('genre'):
    texts = list(group['text_id'])
    test_texts[genre] = (np.random.choice(list(texts), sample_sizes[genre], replace=False))
    
test_texts

{'academic': array(['amm-fragment02', 'b17-fragment02', 'clw-fragment01',
        'fef-fragment03', 'crs-fragment01', 'cty-fragment03'], dtype='<U14'),
 'conversation': array(['kbh-fragment02', 'kbh-fragment09', 'kbp-fragment09',
        'kbw-fragment17'], dtype='<U14'),
 'fiction': array(['ccw-fragment03', 'ccw-fragment04', 'bpa-fragment14'], dtype='<U14'),
 'newspapers': array(['a1f-fragment06', 'a3m-fragment02', 'al2-fragment23',
        'a3c-fragment05', 'ahd-fragment06', 'a1j-fragment34',
        'a1k-fragment02', 'ahf-fragment63', 'ahe-fragment03',
        'a8n-fragment19', 'ahl-fragment02', 'a7t-fragment01',
        'a4d-fragment02', 'a1x-fragment03'], dtype='<U14')}

Save datasets.

In [147]:
for genre, texts in test_texts.items():
    display(genre)
    dataset = all_words[(all_words['genre']==genre)]
    test_set = dataset[dataset['text_id'].isin(test_texts[genre])]
    train_set = dataset[~dataset['text_id'].isin(test_texts[genre])]
    
    test_set_file_name = 'data/test/{}/words.csv'.format(genre)
    test_set.to_csv(test_set_file_name, index=False)
    display('{}: {}'.format(test_set_file_name, len(test_set)))
    
    train_set_file_name = 'data/train/{}/words.csv'.format(genre)
    train_set.to_csv(train_set_file_name, index=False)
    display('{}: {}'.format(train_set_file_name, len(train_set)))

'academic'

'data/test/academic/words.csv: 17223'

'data/train/academic/words.csv: 48964'

'conversation'

'data/test/conversation/words.csv: 6983'

'data/train/conversation/words.csv: 41595'

'fiction'

'data/test/fiction/words.csv: 6914'

'data/train/fiction/words.csv: 38004'

'newspapers'

'data/test/newspapers/words.csv: 9411'

'data/train/newspapers/words.csv: 35740'