## Imports

In [None]:
from collections import Counter, defaultdict
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import utils
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models.word2vec import Word2Vec
from gensim.models import Word2Vec
import gensim
import re
import numpy as np
import pylab as pl

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve, GridSearchCV
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")


%matplotlib inline

In [None]:
# Standard-library imports used for I/O and logging.
import gzip
import logging

# Configure the root logger: INFO level and above, each record rendered as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

## Reading Data 

### Reading the BNC written data 

In [None]:
# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""Corpus reader for the XML version of the British National Corpus."""

from nltk.corpus.reader.util import concat
from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree


class BNCCorpusReader(XMLCorpusReader):
    r"""Reader for the XML edition of the British National Corpus (BNC).

    Use ``xml()`` when the complete XML structure is needed.  Plain and
    tagged token streams are available through ``words()``, ``sents()``,
    ``tagged_words()`` and ``tagged_sents()``.

    The full BNC can be obtained from http://www.ota.ox.ac.uk/desc/2554

    With the archive unpacked into a directory named `BNC`, create the
    reader like this::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    """

    def __init__(self, root, fileids, lazy=True):
        """
        :param root: Root directory of the corpus files.
        :param fileids: File ids (or a regexp matching them) below ``root``.
        :param lazy: If true, files are read through ``BNCWordView``;
            otherwise each file is parsed eagerly by ``_words``.
        """
        XMLCorpusReader.__init__(self, root, fileids)
        self._lazy = lazy

    def words(self, fileids=None, strip_space=True, stem=True):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, trailing spaces are removed from the
            word tokens; if false they are kept.
        :param stem: If true, word stems replace the word strings.
        """
        return self._views(
            fileids, sent=False, tag=None, strip_space=strip_space, stem=stem
        )

    def tagged_words(self, fileids=None, c5=True, strip_space=True, stem=True):
        """
        :return: the given file(s) as a list of ``(word,tag)`` tuples
            covering words and punctuation symbols.
        :rtype: list(tuple(str,str))

        :param c5: If true, the detailed c5 tags are used; otherwise the
            simplified tags.
        :param strip_space: If true, trailing spaces are removed from the
            word tokens; if false they are kept.
        :param stem: If true, word stems replace the word strings.
        """
        if c5:
            tagset = 'c5'
        else:
            tagset = 'pos'
        return self._views(
            fileids, sent=False, tag=tagset, strip_space=strip_space, stem=stem
        )

    def sents(self, fileids=None, strip_space=True, stem=True):
        """
        :return: the given file(s) as a list of sentences or utterances,
            each one a list of word strings.
        :rtype: list(list(str))

        :param strip_space: If true, trailing spaces are removed from the
            word tokens; if false they are kept.
        :param stem: If true, word stems replace the word strings.
        """
        return self._views(
            fileids, sent=True, tag=None, strip_space=strip_space, stem=stem
        )

    def tagged_sents(self, fileids=None, c5=True, strip_space=True, stem=True):
        """
        :return: the given file(s) as a list of sentences, each one a
            list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, the detailed c5 tags are used; otherwise the
            simplified tags.
        :param strip_space: If true, trailing spaces are removed from the
            word tokens; if false they are kept.
        :param stem: If true, word stems replace the word strings.
        """
        if c5:
            tagset = 'c5'
        else:
            tagset = 'pos'
        return self._views(fileids, True, tagset, strip_space, stem)

    def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=True):
        """Build one view (lazy) or one token list (eager) per file and concatenate them."""
        # Lazy mode streams through BNCWordView; eager mode parses the whole
        # file with _words().  Both take the same positional arguments.
        build = BNCWordView if self._lazy else self._words
        pieces = []
        for path in self.abspaths(fileids):
            pieces.append(build(path, sent, tag, strip_space, stem))
        return concat(pieces)

    def _words(self, fileid, bracket_sent, tag, strip_space, stem):
        """
        Eagerly parse one file and return its words, or its sentences,
        optionally tagged.  Backs the view methods when ``lazy`` is false.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, return ``BNCSentence`` objects instead
            of a flat token list.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        collected = []
        needs_strip = strip_space or stem

        root = ElementTree.parse(fileid).getroot()
        for sent_elt in root.findall('.//s'):
            tokens = []
            for word_elt in _all_xmlwords_in(sent_elt):
                # An element without text yields the empty string.
                token = word_elt.text or ""
                if needs_strip:
                    token = token.strip()
                if stem:
                    # Prefer the headword attribute when present.
                    token = word_elt.get('hw', token)
                if tag == 'c5':
                    token = (token, word_elt.get('c5'))
                elif tag == 'pos':
                    # Fall back to the c5 tag if no simplified tag exists.
                    token = (token, word_elt.get('pos', word_elt.get('c5')))
                tokens.append(token)

            if bracket_sent:
                collected.append(BNCSentence(sent_elt.attrib['n'], tokens))
            else:
                collected.extend(tokens)

        assert None not in collected
        return collected

In [None]:
def _all_xmlwords_in(elt, result=None):
    """Collect every ``<w>`` element below ``elt`` in document order.

    The search does not descend into a ``<w>`` element once it is found;
    every other child is searched recursively.

    :param elt: The XML element whose descendants are searched.
    :param result: Optional list to append the matches to; a new list is
        created when omitted.
    :return: ``result`` (or the newly created list) holding the matches.
    """
    if result is None:
        result = []
    for child in elt:
        # Compare with ``==``: the former ``child.tag in ('w')`` was a
        # substring test against the string 'w', which also accepted an
        # empty tag and raised TypeError for non-string tags (e.g. comment
        # nodes, whose ``tag`` is a function).
        if child.tag == 'w':
            result.append(child)
        else:
            _all_xmlwords_in(child, result)
    return result


class BNCSentence(list):
    """
    A sentence represented as a plain list of tokens that additionally
    remembers its identifier in ``num`` (the value of the ``n`` attribute
    of the XML sentence element).
    """

    def __init__(self, num, items):
        """
        :param num: The sentence identifier.
        :param items: The tokens used to fill the list.
        """
        list.__init__(self, items)
        self.num = num

In [None]:
class BNCWordView(XMLCorpusView):
    """
    A stream backed corpus view specialized for use with the BNC corpus.

    Depending on ``sent`` the view produces either one token per ``<w>``
    element or one ``BNCSentence`` per ``<s>`` element.  Tokens whose c5 tag
    is listed in ``skipped_c5_tags`` are represented by ``None``; they are
    not removed here, so callers have to filter ``None`` out themselves.
    """

    # Child elements of <s> that handle_sent() skips silently.  For their
    # description refer to the technical documentation, for example,
    # http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html
    tags_to_ignore = set(
        ['stext','pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align','c']
    )

    # c5 tags (the BNC punctuation classes) for which handle_word() returns
    # None instead of a token.
    skipped_c5_tags = frozenset(['PUL', 'PUN', 'PUQ', 'PUR'])

    def __init__(self, fileid, sent, tag, strip_space, stem):
        """
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        """
        if sent:
            tagspec = '.*/s'
        else:
            tagspec = '.*/s/(.*/)?(w)'
        self._sent = sent
        self._tag = tag
        self._strip_space = strip_space
        self._stem = stem

        self.title = None  #: Title of the document.
        self.author = None  #: Author of the document.
        self.editor = None  #: Editor
        self.resps = None  #: Statement of responsibility

        XMLCorpusView.__init__(self, fileid, tagspec)

        # Read the TEI header once up front so the metadata attributes above
        # are populated, then close the stream again.
        self._open()
        self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
        self.close()

        # Reset tag context after the header pass.
        self._tag_context = {0: ()}

    @staticmethod
    def _stripped_text(elt):
        """Return ``elt.text`` without surrounding whitespace ('' if it has no text)."""
        # ``elt.text`` is None for elements without text; calling .strip()
        # on it directly would raise AttributeError.
        return (elt.text or '').strip()

    def handle_header(self, elt, context):
        """Fill ``title``, ``author``, ``editor`` and ``resps`` from the header element.

        :param elt: The header element handed over by ``read_block``.
        :param context: The tag context of ``elt`` (unused).
        """
        text_of = self._stripped_text

        titles = elt.findall('titleStmt/title')
        if titles:
            self.title = '\n'.join(text_of(title) for title in titles)

        authors = elt.findall('titleStmt/author')
        if authors:
            self.author = '\n'.join(text_of(author) for author in authors)

        editors = elt.findall('titleStmt/editor')
        if editors:
            self.editor = '\n'.join(text_of(editor) for editor in editors)

        resps = elt.findall('titleStmt/respStmt')
        if resps:
            # One paragraph per respStmt, one line per child element.
            self.resps = '\n\n'.join(
                '\n'.join(text_of(resp_elt) for resp_elt in resp) for resp in resps
            )

    def handle_elt(self, elt, context):
        """Dispatch a matched element to the sentence or the word handler."""
        if self._sent:
            return self.handle_sent(elt)
        else:
            return self.handle_word(elt)

    def handle_word(self, elt):
        """Convert one word element into a token.

        :param elt: The word element.
        :return: ``None`` if the element's c5 tag is in ``skipped_c5_tags``;
            otherwise the word string, or a ``(word, tag)`` tuple when a
            tagset was requested.
        """
        if elt.get('c5') in self.skipped_c5_tags:
            # Punctuation is reported as None rather than dropped, so the
            # caller decides how to filter it.
            return None

        word = elt.text or ""  # fixes issue 337?
        if self._strip_space or self._stem:
            word = word.strip()
        if self._stem:
            # Prefer the headword attribute when present.
            word = elt.get('hw', word)
        if self._tag == 'c5':
            word = (word, elt.get('c5'))
        elif self._tag == 'pos':
            # Fall back to the c5 tag if no simplified tag exists.
            word = (word, elt.get('pos', elt.get('c5')))
        return word

    def handle_sent(self, elt):
        """Convert one sentence element into a ``BNCSentence``.

        :param elt: The sentence element; its ``n`` attribute becomes the
            sentence number.
        :return: A ``BNCSentence`` of tokens as produced by ``handle_word``
            (it may therefore contain ``None`` entries).
        :raises ValueError: If a child is neither a word, a known wrapper
            element, nor listed in ``tags_to_ignore``.
        """
        sent = []
        for child in elt:
            if child.tag in ('mw', 'hi', 'corr', 'trunc'):
                # Wrapper elements: take the tokens of their direct children.
                sent += [self.handle_word(w) for w in child]
            elif child.tag == 'w':
                # ``==`` instead of the former ``in ('w')``, which was a
                # substring test against the string 'w'.
                sent.append(self.handle_word(child))
            elif child.tag not in self.tags_to_ignore:
                raise ValueError('Unexpected element %s' % child.tag)
        return BNCSentence(elt.attrib['n'], sent)

In [None]:
# Reader over every XML file below BNC/Texts/ whose relative path matches the
# fileids pattern.  `lazy` is left at its default (True), so the files are
# read through BNCWordView.
parser = BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

In [None]:
# Pull every tagged sentence of the corpus into memory.  tagged_sents() is
# called with its defaults (c5=True, stem=True), so each token is a
# (headword, c5-tag) tuple -- or None where the view skipped punctuation.
tagged_sentences_wordforms = [sentence for sentence in parser.tagged_sents()]

In [None]:
# Preview the first ten raw sentences (may still contain None entries).
tagged_sentences_wordforms[:10]

In [None]:
# Copy each sentence into a plain list while discarding the None
# placeholders; sentences that end up empty are still kept here.
tagged_sentences = [
    [token for token in sentence if token is not None]
    for sentence in tagged_sentences_wordforms
]

In [None]:
# Preview the first ten sentences after the None entries were removed.
tagged_sentences[:10]

In [None]:
# Discard sentences that contain no tokens at all.
tagged_sentences = list(filter(None, tagged_sentences))

In [None]:
# Preview the first ten non-empty sentences.
tagged_sentences[:10]

## _VPC tagging (particle "up" only)

In [None]:
# Identify verb-particle constructions (VPCs) with the particle "up".
#
# A verb (tag starting with 'V') is rewritten to "<verb>_VPC" when the
# particle "up" (tag 'AVP') occurs among the next `vpc_window` tokens and no
# other verb comes first.  All other tagged tokens are copied unchanged;
# tokens whose tag is None are dropped.
#
# Two fixes over the earlier version of this cell:
#   * positions come from enumerate() instead of l.index(w), which returned
#     the FIRST equal (word, tag) tuple and so scanned the wrong window for
#     any repeated token;
#   * the scan stops at the first matching particle, so a verb is emitted
#     exactly once even if "up" appears twice within the window.
vpc_window = 6  # number of tokens after the verb that are inspected

word_particle = []
for sentence in tagged_sentences:
    tokens = []
    for position, (word, tag) in enumerate(sentence):
        if tag is None:
            continue
        if not tag.startswith('V'):
            tokens.append(word)
            continue

        is_vpc = False
        following = sentence[position + 1:position + 1 + vpc_window]
        for next_word, next_tag in following:
            if next_tag is None:
                continue
            if next_tag.startswith('V'):
                # Another verb intervenes: the particle cannot belong here.
                break
            if next_tag == 'AVP' and next_word == 'up':
                is_vpc = True
                break
        tokens.append(word + '_VPC' if is_vpc else word)
    word_particle.append(tokens)

word_particle[:5]

In [None]:
# Discard sentences that ended up without any token.
word_particle = list(filter(None, word_particle))

In [None]:
# Print every token that contains an underscore (e.g. the "_VPC" marks).
for sentence in word_particle:
    for token in sentence:
        if "_" in token:
            print(token)

In [None]:
# Particle features: for every verb (tag starting with 'V') record each
# particle "up" (tag 'AVP') found among the next `feature_window` tokens,
# stopping at the next verb.  Each hit is stored as
# (verb, particle, offset), where offset 0 is the token directly after the
# verb.
#
# Positions come from enumerate() instead of l.index(w): index() returns the
# first equal (word, tag) tuple, so a repeated verb token used to be scanned
# from the wrong position.
feature_window = 6  # number of tokens after the verb that are inspected

verb_particle_list = []
for sentence in tagged_sentences:
    for position, (word, tag) in enumerate(sentence):
        if tag is None or not tag.startswith('V'):
            continue
        following = sentence[position + 1:position + 1 + feature_window]
        for offset, (next_word, next_tag) in enumerate(following):
            if next_tag is None:
                continue
            if next_tag.startswith('V'):
                # Another verb intervenes: stop looking for this verb.
                break
            if next_tag == 'AVP' and next_word == 'up':
                verb_particle_list.append((word, next_word, offset))
verb_particle_list[:5]

## _VPC tagging (any particle) — variant currently commented out

In [None]:
# # identifying VPC
# word_particle = []
# # for l in word_verb:
# for l in tagged_sentences:
#     c = []
#     for w in l:
#         i = list(w)
#         start_word = i[1]
#         if start_word != None:
#             if start_word.startswith('V'):
#                 flag = False
#                 start = l.index(w)+1
#                 end = l.index(w)+7
#                 for n in range(start, end):
#                     if n < len(l):
#                         if l[n][1] != None:
#                             if l[n][1].startswith('V'):
#                                 break
#                             elif l[n][1] == 'AVP':
#                                 c.append(i[0]+'_VPC')
#                                 flag = True
#                         else:
#                             continue
#                 if not flag:
#                     c.append(i[0])
#     #                 c.append(i.split('_')[0]+'_V')

#             else:
#                 c.append(i[0])
#         else:
#             continue
#     word_particle.append(c)

# word_particle[:5]

In [None]:
# Discard sentences that ended up without any token.
word_particle = list(filter(None, word_particle))

In [None]:
# Print every token that contains an underscore (e.g. the "_VPC" marks).
for sentence in word_particle:
    for token in sentence:
        if "_" in token:
            print(token)

In [None]:
# # particle features
# verb_particle_list = []
# for l in tagged_sentences:
#     for w in l:
#         i = list(w)
#         start_word = i[1]
#         if start_word != None:
#             if start_word.startswith('V'):
#                 start = l.index(w)+1
#                 end = l.index(w)+7
#                 for n in range(start, end):
#                     if n < len(l):
#                         if l[n][1] != None:
#                             if l[n][1].startswith('V'):
#                                 break
#                             elif l[n][1] == 'AVP':
#                                 verb_particle_list.append(
#                                     (i[0], l[n][0], n-start))
#                         else:
#                             continue

#             else:
#                 continue
#         else:
#             continue
# verb_particle_list[:5]

In [None]:
# Tabulate the collected (verb, particle, position) triples, one row per hit;
# the bare `x` displays the frame in the notebook.
x=pd.DataFrame(verb_particle_list, columns=['verb','particle','position'])
x

In [None]:
# Write the table to an Excel workbook without the index column.
x.to_excel("verb_particle_list.xlsx",index=False)

In [None]:
# Number of distinct values in the `particle` column.
len(x.particle.unique())

In [None]:
# Number of distinct values in the `verb` column.
len(x.verb.unique())

In [None]:
## NOTE: feature engineering was done by hand in Excel at this point; the result is loaded from particle_features.xlsx in the next cell.

In [None]:
# Load the hand-prepared feature table.  Missing values are left as they are
# (the fillna call below is disabled).
particle_features_df=pd.read_excel('particle_features.xlsx')
# particle_features_df.fillna(0,inplace=True)

In [None]:
# Display the loaded feature table.
particle_features_df

In [None]:
# List the column names of the feature table.
particle_features_df.columns

In [None]:
# Show the numeric feature columns as one float32 matrix (one row per table
# row, one column per feature listed below).
feature_columns = [
    'about', 'along', 'around', 'back', 'by', 'down', 'in', 'off',
    'on', 'out', 'over', 'round', 'through', 'under', 'up',
    'up_0', 'up_1', 'up_2', 'up_3', 'up_4', 'up_5',
]
particle_features_df[feature_columns].to_numpy(dtype='float32')

In [None]:
# Map every verb to ITS OWN float32 feature vector.
#
# The earlier loop stored the complete feature matrix (all rows) under every
# verb, and it looked the verb up with iloc[index], which treats the index
# label as a position.  Here the matrix is built once and each row is paired
# with the verb from the same table row.  If a verb occurs more than once,
# the last row wins, exactly as repeated dict assignment would.
particle_feature_columns = [
    'about', 'along', 'around', 'back', 'by', 'down', 'in', 'off',
    'on', 'out', 'over', 'round', 'through', 'under', 'up',
    'up_0', 'up_1', 'up_2', 'up_3', 'up_4', 'up_5',
]
particle_feature_matrix = particle_features_df[particle_feature_columns].to_numpy(
    dtype='float32'
)
particle_features = dict(zip(particle_features_df['verb'], particle_feature_matrix))
particle_features['fire']

## _V 

In [None]:
# Build the "_V" corpus: every token whose tag starts with 'V' gets the
# suffix "_V", every other token is copied unchanged.  str(tag) lets tokens
# with a None tag pass through unmarked.
word_verb = [
    [word + '_V' if str(tag).startswith('V') else word for word, tag in sentence]
    for sentence in tagged_sentences
]

## Training word2vec on the BNC corpus

In [None]:
# Train 300-dimensional embeddings on the _VPC-tagged sentences with sg=1.
model_BNC_tagged_sentences_particle = Word2Vec(
    sentences=word_particle,
    size=300,
    window=5,
    min_count=5,
    workers=3,
    sg=1,
)

In [None]:
# Persist the sg=1 model trained on the _VPC-tagged sentences.
model_BNC_tagged_sentences_particle.save("model_BNC_tagged_sentences_particle_sg_v2.model")

In [None]:
# Train 300-dimensional embeddings on the _VPC-tagged sentences; `sg` is not
# passed, so gensim's default training algorithm is used.  This rebinds the
# variable that held the sg=1 model.
model_BNC_tagged_sentences_particle = Word2Vec(
    sentences=word_particle,
    size=300,
    window=5,
    min_count=5,
    workers=3,
)

In [None]:
# Persist the default-algorithm model trained on the _VPC-tagged sentences.
model_BNC_tagged_sentences_particle.save("model_BNC_tagged_sentences_particle_cw_v2.model")

In [None]:
# Train 300-dimensional embeddings on the _V-tagged sentences with sg=1.
model_BNC_tagged_sentences_verb = Word2Vec(
    sentences=word_verb,
    size=300,
    window=5,
    min_count=5,
    workers=3,
    sg=1,
)

In [None]:
# Persist the sg=1 model trained on the _V-tagged sentences.
model_BNC_tagged_sentences_verb.save("model_BNC_tagged_sentences_verb_sg_v2.model")

In [None]:
# Train 300-dimensional embeddings on the _V-tagged sentences; `sg` is not
# passed, so gensim's default training algorithm is used.  This rebinds the
# variable that held the sg=1 model.
model_BNC_tagged_sentences_verb = Word2Vec(
    sentences=word_verb,
    size=300,
    window=5,
    min_count=5,
    workers=3,
)

In [None]:
# Persist the default-algorithm model trained on the _V-tagged sentences.
model_BNC_tagged_sentences_verb.save("model_BNC_tagged_sentences_verb_cw_v2.model")

In [None]:
# Train 300-dimensional embeddings on `word_particle`; `sg` is not passed, so
# gensim's default training algorithm is used.
# NOTE(review): same input variable and hyper-parameters as the
# model_BNC_tagged_sentences_particle cell without sg -- confirm that
# `word_particle` holds the intended ("up"-only) corpus when this runs.
model_BNC_tagged_sentences_particle_up = Word2Vec(
    sentences=word_particle,
    size=300,
    window=5,
    min_count=5,
    workers=3,
)

In [None]:
# Persist the default-algorithm "up" model.
model_BNC_tagged_sentences_particle_up.save("model_BNC_tagged_sentences_particle_up_cw.model")

In [None]:
# Train 300-dimensional embeddings on `word_particle` with sg=1.  This
# rebinds the variable that held the model trained without sg.
model_BNC_tagged_sentences_particle_up = Word2Vec(
    sentences=word_particle,
    size=300,
    window=5,
    min_count=5,
    workers=3,
    sg=1,
)

In [None]:
# Persist the sg=1 "up" model.
model_BNC_tagged_sentences_particle_up.save("model_BNC_tagged_sentences_particle_up_sg.model")

In [None]:
# model_BNC_tagged_sentences_particle.save("model_BNC_tagged_sentences_particle_sg_window3.model")