In [1]:
import json
import os
from pprint import pprint

from bookworm import api

In [2]:
# Prefer the BOOKWORM_API_KEY environment variable so the real key never has
# to be typed into (and saved with) the notebook. The original placeholder is
# kept as the fallback, so editing the string by hand still works.
API_KEY = os.environ.get('BOOKWORM_API_KEY', '--SET ME--')
LANG = 'sv'  # Just Swedish during beta

In [3]:
# Create the API client from the key and language configured above;
# every example in the rest of the notebook goes through this `bw` object.
bw = api.Bookworm(API_KEY, LANG)

## Anomalies
Compare two groups of texts

In [4]:
# A few sample short texts with user comments on two Facebook pages.
# Each file is opened in a `with` block so its handle is closed as soon as the
# JSON is parsed, and the encoding is explicit (JSON is UTF-8 by specification)
# so the non-ASCII comments load the same way on every platform.
with open('texts_telia.json', 'r', encoding='utf-8') as telia_file:
    texts_telia = json.load(telia_file)
with open('texts_tele2.json', 'r', encoding='utf-8') as tele2_file:
    texts_tele2 = json.load(tele2_file)

In [5]:
# Compare the Telia comments (first group) with the Tele2 comments (second group).
# NOTE(review): max_words presumably caps the length of each returned word list
# and min_score presumably filters low-scoring words — confirm against the API docs.
res_anomalies = bw.anomalies(texts_telia, texts_tele2, max_words=10, min_score=1)

In [6]:
# The result has two lists, 'more' and 'less', each holding {'score', 'word'} entries
# (presumably words used more / less in the first group than in the second — TODO confirm).
pprint(res_anomalies)

{'less': [{'score': 99999991726, 'word': 'tele2'},
          {'score': 99999991726, 'word': 'tack'},
          {'score': 99999991726, 'word': 'respekt'},
          {'score': 99999991726, 'word': 'spendera'},
          {'score': 99999991726, 'word': 'snäll'},
          {'score': 99999991726, 'word': 'köp'},
          {'score': 99999991726, 'word': 'butik'},
          {'score': 10000, 'word': 'sälja'},
          {'score': 769, 'word': 'reklam'},
          {'score': 769, 'word': 'beställa'}],
 'more': [{'score': 99999991726, 'word': 'Telia'},
          {'score': 99999991726, 'word': 'okej'},
          {'score': 99999991726, 'word': 'nej'},
          {'score': 99999991726, 'word': 'ni'},
          {'score': 99999991726, 'word': 'hat'},
          {'score': 99999991726, 'word': 'hata'},
          {'score': 99999991726, 'word': 'fungera'},
          {'score': 99999991726, 'word': 'använda'},
          {'score': 3333, 'word': 'internet'},
          {'score': 2500, 'word': 'tv'}]}


## Categories
See how well a set of documents matches a set of categories.
As a rule of thumb, a score >= 0.4 means a weak match. Anything below 0.4 is noise.

Soon there will be an additional endpoint that can be trained.

In [7]:
# Every inner list holds the keywords describing one category. Judging by the
# output, a category is reported under the first keyword of its list.
categories = [
    ['ekonomi', 'valuta'],
    ['mat', 'matvaror', 'frukt', 'grönsaker'],
    ['djur', 'hund'],
]

# Documents to score against each of the categories above.
category_texts = [
    'Denna text handlar om pengar och dollar och ekonomi',
    'Denna text handlar om hamburgare, vitlök och grönsaker',
    'Här är en till text om mat och tomater',
]

# One list of per-category scores comes back for every document.
res_cat = bw.categories(category_texts, categories)
pprint(res_cat)

[[{'category': 'ekonomi', 'score': 0.7887064834414715},
  {'category': 'mat', 'score': 0.31192957070995536},
  {'category': 'djur', 'score': 0.22434491657897268}],
 [{'category': 'ekonomi', 'score': 0.26923399226273675},
  {'category': 'mat', 'score': 0.8601434681766795},
  {'category': 'djur', 'score': 0.30387611097728867}],
 [{'category': 'ekonomi', 'score': 0.2637664920010135},
  {'category': 'mat', 'score': 0.7474414414198356},
  {'category': 'djur', 'score': 0.3212467123306468}]]


## Clusters

In [8]:
# Cluster the Tele2 comments. The result's 'clusters' list holds integer labels;
# most entries in the output are -1 (presumably "not assigned to any cluster" —
# TODO confirm against the API docs).
res_clusters = bw.clusters(texts_tele2, min_cluster_size=10)
pprint(res_clusters)

{'clusters': [-1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              13,
              -1,
              -1,
              -1,
              9,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
              -1,
           

In [9]:
# Show the description of the first cluster, followed by each document in it.
print(res_clusters['descriptions'][0])

# Pair every text with its cluster label and keep the ones labelled 0.
first_cluster = []
for text, label in zip(texts_tele2, res_clusters['clusters']):
    if label == 0:
        first_cluster.append(text)

for text in first_cluster:
    print(text)

{'words': ['Lucas', 'شوف', 'Lars', 'bashar', 'yasar', 'Tony', 'w', 'wow', 'sinan', 'حجي', 'rami', 'Ivan', 'Anders', 'Eric', 'Nancy', 'Madeleine', 'mohamed', 'maryam', 'Nicklas', 'Michael', 'Patrik', 'mena', 'Emanuel', 'Christopher', 'waleed', '3', 'o', 'said', 'Gabriella', 'Gabriel', 'الموسوي', 'zahra', 'والله', 'Oscar', 'Jacob', 'jocke', 'Viktor', 'Paul', 'Angelica']}
Emmy Jensen
Angelica Famulak
Camila Rossi tväla
Paulina Chodkiewicz  i ty nie idziesz? :O
Josefina Holmgren
Libbis Sujessy
Andjela Milosevic Mirta Mikolčević jeeeeee 😂😂😂😂
Marta Svankvist
Casandra W Norelli😍
Ferial Tan 30 ????
Hazel Can O.M.G
Omg😍😍
Alicia Patron
TjasaSaje 😂😂😂😂😂😂😂
Klara Elvira Pravdic
Ilhan Acaralp alabili Emre Turksson
Maher Awad
Paul Ciobanu
Michaela Ragnarsson
Plats 4325👍🏻
Benjamin Akbari aw yiss
Mentor Kastrati
Adhari Algailani
De säger varje gång att det är den bästa ifånen och alla idioter blir helt shockade😂
Vill också förbeställa 🤘😁
När kan man förbeställa?😁
När kommer man att kunna förhandsboka?
L

## Counts

In [10]:
# Count words in the Tele2 comments. Per the output, each entry has a 'word',
# its total 'count', and the 'variants' (surface forms) with their own counts.
res_counts = bw.counts(texts_tele2)
pprint(res_counts)

[{'count': 403,
  'variants': [{'count': 205, 'word': 'Tele2'},
               {'count': 65, 'word': 'TELE2'},
               {'count': 133, 'word': 'tele2'}],
  'word': 'tele2'},
 {'count': 324,
  'variants': [{'count': 53, 'word': 'Ni'},
               {'count': 5, 'word': 'NI'},
               {'count': 266, 'word': 'ni'}],
  'word': 'ni'},
 {'count': 303,
  'variants': [{'count': 93, 'word': 'tack'},
               {'count': 9, 'word': 'TACK'},
               {'count': 201, 'word': 'Tack'}],
  'word': 'tack'},
 {'count': 248,
  'variants': [{'count': 2, 'word': 'mobilerna'},
               {'count': 7, 'word': 'mobilt'},
               {'count': 85, 'word': 'mobiler'},
               {'count': 122, 'word': 'mobil'},
               {'count': 1, 'word': 'mobils'},
               {'count': 1, 'word': 'MOBIL'},
               {'count': 24, 'word': 'mobilen'},
               {'count': 1, 'word': 'Mobile'},
               {'count': 4, 'word': 'mobila'},
               {'count': 1, 'word'

## Entities
Extract named entities from docs. You will get a list of entities for each doc.

Entities can be 'geo', 'person', 'org', or 'heuristics'.

The `heuristics` type means that an entity we have seen before is returned,
even though we do not know what kind of entity it is.

In [11]:
# Three short Swedish example sentences to run entity extraction on.
docs = ['De två största städerna i Sverige är Stockholm och Göteborg',
        'Donald Trump är Barack Obamas efterträdare',
        'En stad i Frankrike är Lyon']

In [12]:
# Left as the cell's last expression so the notebook displays the result:
# one list of {'entity', 'type', 'variants'} dicts per input document.
bw.entities(docs)

[[{'entity': 'Stockholm', 'type': 'geo', 'variants': ['Stockholm']},
  {'entity': 'Sverige', 'type': 'geo', 'variants': ['Sverige']},
  {'entity': 'Göteborg', 'type': 'geo', 'variants': ['Göteborg']}],
 [{'entity': 'Barack Obama', 'type': 'person', 'variants': ['Barack Obamas']},
  {'entity': 'Donald Trump', 'type': 'person', 'variants': ['Donald Trump']}],
 [{'entity': 'Frankrike', 'type': 'geo', 'variants': ['Frankrike']},
  {'entity': 'Lyon', 'type': 'heuristics', 'variants': ['Lyon']}]]

## Sentiment

In [13]:
# Two example sentences; the result holds one sentiment score per document
# (in the output, a negative value for the first and 1 for the second).
docs_sentiment = [
    'Nej, det här var inte alls bra',
    'Detta är en bra mening, som visar på en helt fantastisk fantasi hos författaren'
]
res_sentiment = bw.sentiment(docs_sentiment)
pprint(res_sentiment)

[-0.3562071871080222, 1]


## Topics

In [14]:
# Example documents for topic extraction; the result holds one list of
# {'score', 'topic'} entries per document.
docs_topics = [
    'De två största städerna i Sverige är Stockholm och Göteborg, men Rävlanda är mest creddig',
    'Donald Trump är Barack Obamas efterträdare',
    'Jag gillar äpplen och päron och jag kan jämföra dem']
res_topics = bw.topics(docs_topics)
pprint(res_topics)

[[{'score': 3.0, 'topic': 'Stad'},
  {'score': 3.0, 'topic': 'Göteborg'},
  {'score': 2, 'topic': 'Städer'}],
 [{'score': 2, 'topic': 'Trump'},
  {'score': 1.0, 'topic': 'Byggnadstyper'},
  {'score': 1.0, 'topic': 'Amerikanska släkter'}],
 [{'score': 3.0, 'topic': 'Äpplen'},
  {'score': 2, 'topic': 'Päron'},
  {'score': 2.0, 'topic': 'Rosväxter'}]]


## Wordcloud

In [15]:
# The word cloud is built from word counts, so compute the counts for the
# Tele2 comments first and pass them on.
tele2_counts = bw.counts(texts_tele2)
cloud = bw.wordcloud(tele2_counts)

In [16]:
# Display the response: a dict whose 'url' entry points at the generated word cloud.
cloud

{'url': 'https://bookworm.crawlica.com/v1/sv/wordcloud/8469197d-ce6d-49d6-9f05-e43cb704da8f'}

In [17]:
# Import both display helpers from the public `IPython.display` module rather
# than the internal `IPython.core.display` path. HTML is not used in this cell;
# it stays imported in case other cells rely on it.
from IPython.display import HTML, Image

# Render the word cloud inline by pointing an Image at the returned URL.
Image(url=cloud['url'])