In [1]:
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from `url` unless it already exists, then check its size.

  Args:
    filename: local path of the file; also appended to `url` to form the
      download address when the file is missing.
    expected_bytes: exact size in bytes the file must have.

  Returns:
    The local file name (as reported by `urlretrieve` when a download happened).

  Raises:
    Exception: if the size on disk differs from `expected_bytes`; the actual
      size is printed first.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    filename, _ = urlretrieve(url + filename, filename)
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Show the size we actually got before failing.
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified %s' % filename)
  return filename

# Download text8.zip if it is not already on disk and require it to be exactly
# 31344016 bytes; `filename` is reused by the next cell.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  """Return the whitespace-separated words of the first member of a zip archive.

  Args:
    filename: path of the zip file to open.

  Returns:
    A list of strings obtained by converting the first archive member with
    `tf.compat.as_str` and splitting it on whitespace.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_text = archive.read(first_member)
  return tf.compat.as_str(raw_text).split()
  
# Load the corpus as a flat list of words and report how many there are.
words = read_data(filename)
print('Data size %d' % len(words))

Data size 17005207


In [155]:
vocabulary_size = 50000

def build_dataset(words):
  """Encode a word sequence as integer ids over a capped vocabulary.

  The vocabulary is 'UNK' (id 0) followed by the `vocabulary_size - 1` most
  common words, numbered in that order. `vocabulary_size` is read from the
  module-level variable when the function is called.

  Args:
    words: sequence of hashable tokens; it is iterated twice.

  Returns:
    A tuple `(data, count, dictionary, reverse_dictionary)`:
      data: list of ids, one per input word (0 for words outside the vocabulary).
      count: `[['UNK', n_unknown]]` followed by `(word, frequency)` tuples.
      dictionary: mapping word -> id.
      reverse_dictionary: mapping id -> word.
  """
  frequent = collections.Counter(words).most_common(vocabulary_size - 1)
  count = [['UNK', -1]] + frequent

  # Ids are handed out in `count` order, so 'UNK' gets 0.
  dictionary = {}
  for entry in count:
    dictionary[entry[0]] = len(dictionary)

  data = []
  unk_count = 0
  for word in words:
    index = dictionary.get(word)
    if index is None:
      # Out-of-vocabulary word: map to dictionary['UNK'] and tally it.
      index = 0
      unk_count += 1
    data.append(index)

  # Replace the -1 placeholder with the real number of unknown words.
  count[0][1] = unk_count
  reverse_dictionary = {index: word for word, index in dictionary.items()}
  return data, count, dictionary, reverse_dictionary

# Encode the corpus as integer ids and show the head of the frequency table
# and of the encoded data.
data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
# NOTE: after this `del`, re-running this cell alone fails until the cell that
# defines `words` is executed again.
del words  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5242, 3083, 12, 6, 195, 2, 3136, 46, 59, 156]


In [133]:
# This cell is the first one (in notebook order) that uses NLTK, but the
# notebook's opening import cell does not import it; import it here so a fresh
# top-to-bottom run does not fail with NameError.
import nltk

# Chunk grammar with a single rule: every run of one or more consecutive
# <NNP> (proper noun) tags becomes a "Company" chunk.
# NOTE(review): the first '#' comment inside the raw string ("chunk
# determiner/possessive, adjectives and noun") does not describe this rule;
# it is part of the grammar string and is left unchanged here.
grammar = r"""
  Company:{<NNP>+}    # chunk determiner/possessive, adjectives and noun
           
                         # chunk sequences of proper nouns
"""
cp = nltk.RegexpParser(grammar)


str1 = """
RefME takes the tedium out of compiling citations - and is proving extremely popular with students. The free app works by scanning the book you want to reference to speed up the task of creating, formatting and managing bibliographies.  RefMe raised $5 million (PS3.75 million) in seed round funding in April 2015, led by GEMS Global. This Hackney-based edtech startup blends hardware and software skills for children by selling toys that kids can build and code themselves. Technology Will Save Us has kits for building your own synthesiser, speaker, games console and even a starter soldering kit, with prices starting at PS15 for the BBC micro:bit. The company was dreamed up by husband and wife Daniel Hirschmann and Bethany Koby around their kitchen table in 2012. CEO Koby said last year: "I'm not sure there has ever been a more exciting time to be building a learning-focused organisation in the UK, and we are thrilled to be leading the charge and championing edtech in this field." Technology Will Save Us raised a PS1.2 million seed funding round led by Saatchinvest in December 2015. George Burgess is a 23-year-old British entrepreneur who dropped out from Stanford University to focus on growing edtech startup Gojimo. The exam revision app is available on iOS, Android and the web and offers over 50,000 free curriculum-specific quiz questions to over 300,000 monthly active users. Premium content from the likes of McGraw-Hill Education and Oxford University Press is also available as in-app purchases.  Gojimo previously raised over $1 million (PS630,000) in a seed round led by Index Ventures, which included participation by JamJar Investments (the innocent Drinks founders). Krishan Meetoo, along with his co-founder Carl Dawson, set up Proversity to provide a learning and development environment "across the employee lifecycle, from attracting talent through to on boarding them by providing the professional and soft skills they need for the role. 
So try before you buy with candidates," Meetoo said. Proversity is working in the corporate learning space, providing an on-demand mobile platform that brings employee attraction, recruitment and retention into one place. Candidates are given a structured learning programme, with each course being designed on a bespoke basis depending on the employer needs. Content can range from text, photos, motion graphics, video walkthroughs and various forms of assessment, from essays to simple check boxes. Proversity raised $1.6 million (PS1 million) in investment from Czech Republic fund RSBC Venture Capital in November 2015 and is planning a PS5-8 million Series A round this year as it expands into the USA, Middle East and ASEAN markets. Learning a new language is really hard, and companies have been trying to make it easier as far back as those giant Rosetta Stone software boxes were being advertised on the TV. UK startup Memrise claims to have cracked the science behind learning and retaining a new language, called 'elaborate encoding.' The platform then reinforces learning by testing and scheduling reminders to keep participants engaged. Memrise has raised more than $6 million (PS4.5 million) in seed and Series A funding so far, led by London VC firm Balderton Capital. London-based Firefly aims to free up teachers' time by simplifying the processes of sifting e-mail for homework, working with a clunky virtual learning environment or copying data into multiple apps that don't communicate. The portal integrates with existing school systems to bring everything into one place. Teachers can issue homework, give feedback and track progress in one place. For students this means being able to submit work at any time, so not having to hand in work while on study leave, for example. It also gives parents an easy way to track their child's progress and timetables. 
Firefly is already used in 300 schools across the UK and typically charges between PS3,000 and PS10,000 depending on the number of pupils. The startup has so far avoided venture capital. A similar, well established startup is London-based Show My Homework. The clue is in the name, as the platform aims to give teachers their evenings back by bringing time saving reports, instant access to online resources and access to homework into one place. The app also gives parents total visibility of their child's homework and students a place to store and manage all of their homework in one place. Show My Homework is already in more than 1500 schools globally and it raised PS2.4 million in funding from venture capital firm LocalGlobe in January 2017. Wonde is a Cambridge-based startup working to help schools keep their data secure. The platform allows schools to view and manage their data as it is accessed by third party applications. Once in place Wonde enables administrators to allow or deny access to school data and manage existing applications. For developers Wonde supplies an API for access to school data without having to set up agreements with individual organisations. Knowledgemotion's primary product is the boclips platform, which allows education providers to find and embed over two million video clips into their teaching materials without having to deal with tricky licensing issues or multiple content providers. The service is available as a clips library or a white label portal, with usage charged either per stream, or at a flat rate per asset used. Founder David Bainbridge told Techworld that he created Knowledgemotion after asking himself: "In an education world that is manifestly changing, where textbooks are becoming Chromebooks, why wasn't the content experience in classrooms - from schools to university to corporate training - catching up with the tech delivery opportunity that seemed to be unfolding?" 
The company has already signed a supply deal with education content publishers Pearson. Bainbridge says this means "clips are API integrated into their publishing tool. So when a textbook is being created the author can search and pull in the relevant clip to illustrate the paragraph." Knowledgemotion has raised close to PS1.2 million from angel investors and ICG Ventures - part of major textbook publisher Ingram Content Group - and is preparing for a Series A round at the time of writing. London-based startup Pobble is a platform for children's writing, with the aim of encouraging even the most reluctant young writers by opening up a global audience for their published work. There is a school version which can be embedded, with trackable metrics. Pobble claims to be used in 100 countries and has already seen 30,000 pieces published to the platform. Authors Anthony Horowitz and Michael Morpurgo are fans too. Pobble raised PS170,000 in crowdfunding with CrowdCube in 2015, and an additional PS900,000 funding round in February 2016. Digital Assess is trying to change the way students and assessors work together on feedback relating to pieces of work. Assessors can leave contextual feedback, so comments on a single line of music, a specific paragraph or a slide for a project, regardless of file format. The mobile platform lets work be presented as a "storyboard-style digital portfolio." Digital Assess works predominantly with higher education bodies, including Goldsmiths university and Eton college, but also vocational learning and accreditation bodies. Digital Assess raised $3 million (PS2.25 million) from investors including Nesta Impact Investments in June 2015. Fluency focuses on online learning and development courses in digital employability skills, such as analytics, coding, email, CRM, SEO and social media. There are just 25 courses available on the free portal at the time of writing, with each one clearly marked for how long it will take to complete. 
The aim is that as candidates take on skills and learnings they become more visible to employers. The platform has started pulling through digital jobs so once candidates successfully pass a course they become eligible for jobs through Fluency. The platform also aims to help employers find skilled candidates by designing courses tailored to their needs. Fluency raised $110,000 (PS82,000) in angel investment from Bethnal Green Ventures and Clearly Social Angels back in 2014. London-based startup AVADO focuses on corporate e-learning, teaching employees digital skills through bespoke courses. Lisa Barrett, managing director at AVADO told Techworld that each customer starts out with a face to face meeting with AVADO, where they will "do a capability audit or assessment around digital skills at the organisation to understand their vision and strategy and where the gaps are for digital ways of working," she said. One of these solutions is Dot Native, which allows employees to do short, video-driven learning tasks. "A key challenge around e-learning is that it doesn't work, it's repetitive and people don't finish. So we have designed bite-sized pieces of learning," she said. AVADO builds its learning tools on a highly customised version of the open source Moodle learning technology. The startup has been backed by VC firm Blenheim Chalcot an undisclosed amount. Oxford University graduate Shameer Thobhani has launched Lectus, an iOS-only app which allows students to connect with expert tutors around the world via video calls. The app is still a little buggy and at the time of writing there are only a handful of tutors, charging PS0.40 a minute, but the premise of opening up elite tutors to a broader audience is extremely compelling. Current tutors include a second year biomedical science student at Imperial College London and a 22 year old history and english graduate from Oxford. 
Subjects include university admissions, languages, history, mathematics, science, economics and geography. Filippo Yacob came up with the idea of Cubetto - a coding toy designed to help children aged three years and up to write their first computer programs - after the birth of his son. Cubetto is a wooden robot which teaches kids to code using blocks instead of screens. Colour coded directional arrows can be placed on the interface board which will direct the Cubetto on a pre-defined path around maps that can be laid on the floor. This apparently teaches children the basics of algorithms, debugging, and recursions. It is a product of London-based startup Primo Toys which raised $1.6 million (PS1.3 million) for the project on Kickstarter last year. Cubetto isn't cheap, retailing at PS159. Teaching the next generation to code is the aim of many a startup, and Code Kingdoms takes a gamification approach, turning glitches into enemies that need to be destroyed. The game uses JavaScript to allow the target audience of 6-13 year olds to create fantasy worlds.  Founders Ross Targett and Hugh Collins came out of the Entrepreneur First graduate accelerator programme and the startup has raised a total of PS280,000 in seed funding from SparkLabs Global Ventures, EF (Entrepreneur First), AngelLab and Neon Adventures. Code Club is a partner of Code Kingdoms and is responsible for events and workshops for 9-11 year olds to learn about coding. Code Club is hosted by the UK charity Raspberry Pi Foundation to encourage more young people to learn coding skills and close the growing technology skills gap.  Kids that attend Code Club will be taught how to programme through fun applications like computer games, animations and websites. Courses start out using Scratch to learn the basics of programming, before graduating to basic web development using HTML and CSS and then advanced skills using established programming languages like Python. 
Sheffield-based startup Tutora was founded by ex-teacher Scott Woodley and technology analyst Mark Hughes in August 2015. It is a platform for helping parents and students find local, in-person tutors and pay for it without having to handle cash (using Stripe). There are currently around 5,000 tutors across 11 UK cities to choose from, starting at PS15 per hour. Tutors are vetted using their government ID and can be searched by subject, price, customer feedback (a star rating) and proximity. Tutora crowdfunded PS150,000 from 75 investors in April for a 10 percent share in the business. It is currently preparing for a Series A round. Blackbullion seeks to help students and young people take better care of their finances, which in turn helps improve employability. The Blackbullion school edition takes the form of an eight part story and the university version comes in short learning modules based around making informed financial decisions in a non-patronising way. now>press>play is an "immersive educational resource" that looks to help primary school children engage with the curriculum through immersive storytelling. In a now>press>play experience each child is given a pair of headphones which will tell an audio story that incorporates learnings, discovery and problem solving. Pricing is bespoke according to the school. Edalytics is based out of the TechHub at Google Campus and is currently in private beta mode with schools. The mysterious startup says that it is: "Testing technologies such as gamification, cognitive studies, artificial intelligence and data mining to create a unique learning experience for the students." 
"""
# Split the sample text into sentences, POS-tag each one, run the chunk
# grammar over it and print every chunk that was found.
sentences = nltk.sent_tokenize(str1)

for sentence in sentences:
    tokens = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(tokens)
    # Despite its name, `parser` holds the chunked result for this sentence
    # (the return value of cp.parse), not a parser object.
    parser = cp.parse(pos)
    for par in parser:
        # Chunks are Tree nodes; tokens outside any chunk are plain
        # (word, tag) tuples and are skipped. isinstance is the idiomatic
        # type test and also accepts Tree subclasses.
        if isinstance(par, nltk.tree.Tree):
            print(par)
    
    

(Company RefME/NNP)
(Company RefMe/NNP)
(Company PS3.75/NNP)
(Company April/NNP)
(Company GEMS/NNP Global/NNP)
(Company Us/NNP)
(Company PS15/NNP)
(Company BBC/NNP)
(Company Daniel/NNP Hirschmann/NNP)
(Company Bethany/NNP Koby/NNP)
(Company CEO/NNP Koby/NNP)
(Company UK/NNP)
(Company Us/NNP)
(Company PS1.2/NNP)
(Company Saatchinvest/NNP)
(Company December/NNP)
(Company George/NNP Burgess/NNP)
(Company Stanford/NNP University/NNP)
(Company Gojimo/NNP)
(Company Android/NNP)
(Company McGraw-Hill/NNP Education/NNP)
(Company Oxford/NNP University/NNP Press/NNP)
(Company Gojimo/NNP)
(Company PS630,000/NNP)
(Company Index/NNP Ventures/NNP)
(Company JamJar/NNP Investments/NNP)
(Company Drinks/NNP)
(Company Krishan/NNP Meetoo/NNP)
(Company Carl/NNP Dawson/NNP)
(Company Proversity/NNP)
(Company Meetoo/NNP)
(Company Proversity/NNP)
(Company PS1/NNP)
(Company Czech/NNP Republic/NNP)
(Company RSBC/NNP Venture/NNP Capital/NNP)
(Company November/NNP)
(Company Series/NNP A/NNP)
(Company USA/NNP)
(Comp

In [157]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
  """Build one batch of (center word, context word) id pairs from `data`.

  Reads the module-level `data` sequence starting at the module-level
  `data_index` cursor, wrapping around at the end, and advances the cursor.

  Args:
    batch_size: number of pairs to produce; must be a multiple of `num_skips`.
    num_skips: number of context words sampled for each center word; must not
      exceed `2 * skip_window`.
    skip_window: number of words considered on each side of the center word.

  Returns:
    `(batch, labels)`: int32 arrays of shape `(batch_size,)` and
    `(batch_size, 1)` holding center ids and the sampled context ids.
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

  # Sliding window over `data`; the center word sits at index `skip_window`.
  window = collections.deque(maxlen=span)
  for _ in range(span):
    window.append(data[data_index])
    data_index = (data_index + 1) % len(data)

  for group in range(batch_size // num_skips):
    first_row = group * num_skips
    # Positions already used for this center word (the center itself included).
    used_positions = [skip_window]
    position = skip_window
    for offset in range(num_skips):
      # Draw until we hit a window position not used yet for this center.
      while position in used_positions:
        position = random.randint(0, span - 1)
      used_positions.append(position)
      row = first_row + offset
      batch[row] = window[skip_window]
      labels[row, 0] = window[position]
    # Slide the window one word forward (the deque drops the oldest entry).
    window.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

# Show the first eight corpus words, then a sample batch for two
# (num_skips, skip_window) settings so the pairing can be checked by eye.
print('data:', [reverse_dictionary[di] for di in data[:8]])

for num_skips, skip_window in [(2, 1), (4, 2)]:
    # Rewind the module-level cursor so each setting starts at the first word.
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])

data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']

with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['anarchism', 'as', 'a', 'originated', 'as', 'term', 'a', 'of']

with num_skips = 4 and skip_window = 2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['originated', 'a', 'anarchism', 'term', 'of', 'term', 'as', 'originated']


In [6]:
import nltk
import re
import json

# Flatten every article in data/data.json into one list of words, keeping each
# run of proper nouns together as a single multi-word entry, and save the list
# to text.json.
words_list = []

# `with` closes the file even if json.load raises.
with open("data/data.json", "r") as f:
    sites = json.load(f)

# Single-rule chunk grammar: one or more consecutive <NNP> tags form a
# "Company" chunk.
grammar = r"""
  Company:{<NNP>+}    # chunk determiner/possessive, adjectives and noun
"""
cp = nltk.RegexpParser(grammar)
# Used with .match(), so only the first character is tested: tokens that
# start with a non-word character are dropped.
pattern = re.compile(r'\W')

for key in sites.keys():
    articles = sites[key]
    for article in articles:
        content = article["content"]

        sentences = nltk.sent_tokenize(content)
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            pos = nltk.pos_tag(tokens)
            parsers = cp.parse(pos)

            for par in parsers:
                if isinstance(par, nltk.tree.Tree):
                    # Chunk: join its tokens with single spaces, case preserved.
                    words_list.append(' '.join(leaf[0] for leaf in par))
                elif not pattern.match(par[0]):
                    # Ordinary (word, tag) token: keep it lower-cased.
                    words_list.append(par[0].lower())

with open('text.json', 'w') as f:
    json.dump(words_list, f, indent=2)

In [9]:
# Display items 300-499 of the extracted word list as a spot check.
words_list[300:500]

['a',
 'virtual',
 '3d',
 'model',
 'unique',
 'to',
 'their',
 'anatomy',
 'using',
 'the',
 'AR HoloLens',
 'or',
 'VR Oculus Rift',
 'users',
 'can',
 'see',
 'a',
 'color-coded',
 'representation',
 'of',
 'a',
 'patient',
 'bones',
 'organs',
 'and',
 'nerves',
 'the',
 'software',
 'is',
 'especially',
 'useful',
 'to',
 'help',
 'clinicians',
 'visualize',
 'the',
 'shape',
 'and',
 'structure',
 'of',
 'a',
 'growth',
 'or',
 'tumor',
 'although',
 'various',
 'medical',
 'institutions',
 'have',
 'expressed',
 'interest',
 'in',
 'Bosc',
 'implementing',
 'this',
 'technology',
 'into',
 'a',
 'doctor',
 'daily',
 'clinical',
 'workflow',
 'will',
 'be',
 'no',
 'small',
 'undertaking',
 'VR',
 'and',
 'AR',
 'headsets',
 'are',
 'still',
 'in',
 'their',
 'early',
 'stages',
 'of',
 'development',
 'and',
 'are',
 'prohibitively',
 'expensive',
 'for',
 'some',
 'institutions',
 'to',
 'use',
 'on',
 'a',
 'large',
 'scale',
 'i',
 'think',
 'there',
 'a',
 'lot',
 'of',
 'hy

In [3]:
import nltk
import re
import json

# Same extraction as the full-corpus cell, restricted to the first 'Geekwire'
# article.
words_list = []

# `with` closes the file even if json.load raises.
with open("data/data.json", "r") as f:
    sites = json.load(f)

# Single-rule chunk grammar: one or more consecutive <NNP> tags form a
# "Company" chunk.
grammar = r"""
  Company:{<NNP>+}    # chunk determiner/possessive, adjectives and noun
"""
cp = nltk.RegexpParser(grammar)
# Compiled once here; the original recompiled the same pattern for every
# sentence. Used with .match(), so only the first character is tested.
pattern = re.compile(r'\W')

content = sites['Geekwire'][0]["content"]
sentences = nltk.sent_tokenize(content)
for sentence in sentences:
    tokens = nltk.word_tokenize(sentence)
    pos = nltk.pos_tag(tokens)
    # `parsers` (and `pattern`) stay module-level: the next cell reads the
    # value left over from the last sentence.
    parsers = cp.parse(pos)
    for par in parsers:
        if isinstance(par, nltk.tree.Tree):
            # Chunk: join its tokens with single spaces, case preserved.
            words_list.append(' '.join(leaf[0] for leaf in par))
        elif not pattern.match(par[0]):
            # Ordinary (word, tag) token: keep it lower-cased.
            words_list.append(par[0].lower())


In [247]:
# Debug view of a single chunked sentence: print every node, then show the
# words extracted from it.
# NOTE: `parsers` and `pattern` are not defined in this cell; it relies on the
# values left behind by the preceding cell (the last sentence it processed).
words_list = []
for par in parsers:
    print(par)
    if isinstance(par, nltk.tree.Tree):
        # Chunk: join its tokens with single spaces, case preserved.
        words_list.append(' '.join(leaf[0] for leaf in par))
    elif not pattern.match(par[0]):
        # Ordinary (word, tag) token whose first character is a word
        # character: keep it lower-cased.
        words_list.append(par[0].lower())
words_list

(Company Ryan/NNP James/NNP)
('(', '(')
('left', 'VBN')
(')', ')')
('and', 'CC')
(Company Mark/NNP Laughery/NNP)
('(', '(')
('right', 'RB')
(')', ')')
('sit', 'NN')
('in', 'IN')
('their', 'PRP$')
('office', 'NN')
('space', 'NN')
('at', 'IN')
('the', 'DT')
(Company UW/NNP CoMotion/NNP Labs/NNP)
('.', '.')


['Ryan James',
 'left',
 'and',
 'Mark Laughery',
 'right',
 'sit',
 'in',
 'their',
 'office',
 'space',
 'at',
 'the',
 'UW CoMotion Labs']