## 3ml Story Analysis

In [1]:
import turicreate as tc
import turicreate.aggregate as agg

In [2]:
stories = tc.SFrame('stories.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [3]:
stories

id,title,content
1244,Ship shape,"The Titanic was made from over 50,000 tonnes of ..."
1252,Ecosystem science,"It’s a lot easier to rebuild a town, after a ..."
950,Perfect place,Chilbolton would be the perfect place to make a ...
78,Viruses,\n\n\nViruses are tiny organisms at the edge of ...
383,Electric vehicle,"“I’ve had my Nissan Leaf for 3 years now,“ says ..."
1370,Evaporation (part 2),If you could zoom right in to the surface of ...
442,Cloth from plants,The cotton plant has been used to make fabric for ...
398,Rock art,Do you think this looks like art? It is a piece ...
358,It lifts boats,The parts of the Falkirk Wheel were made at a ...
584,Daybreak at Baikonur,This is the world's first and largest launch site ...


In [4]:
import string

def clean_text(s):
    s = s.replace('&quot;', '')
    translator = s.maketrans('', '', string.punctuation + '“’”')
    return s.translate(translator).lower()

In [5]:
stories['text'] = stories['content'].apply(clean_text)

In [6]:
stories

id,title,content,text
1244,Ship shape,"The Titanic was made from over 50,000 tonnes of ...",the titanic was made from over 50000 tonnes of ...
1252,Ecosystem science,"It’s a lot easier to rebuild a town, after a ...",its a lot easier to rebuild a town after a ...
950,Perfect place,Chilbolton would be the perfect place to make a ...,chilbolton would be the perfect place to make a ...
78,Viruses,\n\n\nViruses are tiny organisms at the edge of ...,\n\n\nviruses are tiny organisms at the edge of ...
383,Electric vehicle,"“I’ve had my Nissan Leaf for 3 years now,“ says ...",ive had my nissan leaf for 3 years now says ...
1370,Evaporation (part 2),If you could zoom right in to the surface of ...,if you could zoom right in to the surface of ...
442,Cloth from plants,The cotton plant has been used to make fabric for ...,the cotton plant has been used to make fabric for ...
398,Rock art,Do you think this looks like art? It is a piece ...,do you think this looks like art it is a piec ...
358,It lifts boats,The parts of the Falkirk Wheel were made at a ...,the parts of the falkirk wheel were made at a ...
584,Daybreak at Baikonur,This is the world's first and largest launch site ...,this is the worlds first and largest launch site ...


In [7]:
stories['tfidf'] = tc.text_analytics.tf_idf(stories['text']).dict_trim_by_keys(tc.text_analytics.stop_words(), True)


In [8]:
knn_model = tc.nearest_neighbors.create(stories,features=['tfidf'],label='id')

In [9]:
viruses = stories[stories['id'] == 78]

In [10]:
knn_model.query(viruses, k=10)

query_label,reference_label,distance,rank
0,78,0.0,1
0,1198,0.8842105263157894,2
0,67,0.927710843373494,3
0,864,0.9320388349514565,4
0,292,0.9333333333333332,5
0,514,0.935064935064935,6
0,143,0.9375,7
0,77,0.9381443298969072,8
0,471,0.9423076923076924,9
0,860,0.9428571428571428,10


In [11]:
knn_model.query(stories[stories['id'] == 950], k=10)['reference_label']

dtype: int
Rows: 10
[950, 964, 985, 963, 1045, 945, 967, 968, 966, 986]

In [12]:
stories[stories['title'] == 'Do sick plants sneeze?']['tfidf']

dtype: dict
Rows: ?
[{'bless': 6.53451531816871, 'springing': 7.227662498728654, 'observed': 6.53451531816871, 'virginia': 5.8413681376087645, 'caused': 4.092168282799505, 'wheat': 5.281752349673342, 'rust': 5.4359030295006, 'spores': 7.227662498728654, 'energy': 5.820348770384689, 'wet': 4.829767225930285, 'fungus': 13.06903063633742, 'leaf': 15.091313764177306, 'waterproof': 7.227662498728654, 'blow': 5.8413681376087645, 'plant': 10.770229017006807, 'gust': 7.227662498728654, 'sneezes': 11.682736275217529, 'carried': 4.008786673860454, 'released': 5.4359030295006, '2019': 4.519612297626445, 'scientists': 2.0918640616783932, 'viruses': 5.4359030295006, 'kind': 3.0380077567022292, 'droplets': 15.845257049020026, '20': 3.7936752942435086, 'work': 1.8431674359395658, 'noses': 5.4359030295006, 'form': 3.133317936506554, 'diseases': 4.519612297626445, 'inside': 2.6224923127405635, 'reason': 3.9318256327243257, 'june': 5.281752349673342, 'disease': 8.566447039124428, 'dont': 3.7706564935276

In [13]:
knn = knn_model.query(stories, k=10).groupby(key_column_names='query_label', operations={'nn': agg.CONCAT('reference_label')})

In [14]:
knn['id']=knn['nn'].apply(lambda x: x[0])

In [15]:
knn['nn']=knn['nn'].apply(lambda x: x[1:])

In [16]:
knn=knn.remove_column('query_label')

In [17]:
knn.export_json('stories_knn.json')

In [18]:
sim_graph = knn_model.similarity_graph(k=10)