## Session 3 - Introduction to spaCy

In [2]:
import spacy

In [None]:
# Run once in a terminal: python -m spacy download en_core_web_md
# This downloads the spaCy model trained on English data.

In [3]:
# Load the English model; calling `nlp` on a string returns an annotated Doc
nlp = spacy.load("en_core_web_md")

In [4]:
# Check which class the loaded pipeline object is
type(nlp)

spacy.lang.en.English

In [15]:
# First, make a sample text (two short sentences)
text = "My name is Sofie. I am a CogSci Student"

In [16]:
# Run the pipeline on the text to get an annotated spaCy Doc
doc = nlp(text)

In [17]:
# Check the type of the object the pipeline returned
type(doc)

spacy.tokens.doc.Doc

In [18]:
# Printing a Doc shows its original text
print(doc)

My name is Sofie. I am a CogSci Student


In [19]:
# Iterating over a Doc yields its tokens one at a time; punctuation is a token too
for token in doc:
    print(token.text)

My
name
is
Sofie
.
I
am
a
CogSci
Student


In [20]:
# Show the annotations spaCy attached to every token, one token per line.
for token in doc:
    print(
        token.i,      # index of the token within the doc
        token.text,   # the token's text
        token.pos_,   # part-of-speech label (token.pos gives the label's number)
        token.dep_,   # dependency relation label
        token.morph,  # morphological features
    )

# See https://spacy.io/api/attributes#_title for all the attributes we can get

0 My PRON poss Number=Sing|Person=1|Poss=Yes|PronType=Prs
1 name NOUN nsubj Number=Sing
2 is AUX ROOT Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
3 Sofie PROPN attr Number=Sing
4 . PUNCT punct PunctType=Peri
5 I PRON nsubj Case=Nom|Number=Sing|Person=1|PronType=Prs
6 am AUX ROOT Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
7 a DET det Definite=Ind|PronType=Art
8 CogSci PROPN compound Number=Sing
9 Student NOUN attr Number=Sing


In [21]:
# Named entity recognition (NER): doc.ents holds the entities found in the doc
for entity in doc.ents:
    # .text is the entity's text, .label_ is its entity type as a string
    print(entity.text, entity.label_)

Sofie PERSON
CogSci ORG


### Pandas

In [22]:
import pandas as pd

In [23]:
# One [text, POS label, dependency label] row per token, ready for a DataFrame
annotations = [[token.text, token.pos_, token.dep_] for token in doc]

In [24]:
# Inspect the collected rows
annotations

[['My', 'PRON', 'poss'],
 ['name', 'NOUN', 'nsubj'],
 ['is', 'AUX', 'ROOT'],
 ['Sofie', 'PROPN', 'attr'],
 ['.', 'PUNCT', 'punct'],
 ['I', 'PRON', 'nsubj'],
 ['am', 'AUX', 'ROOT'],
 ['a', 'DET', 'det'],
 ['CogSci', 'PROPN', 'compound'],
 ['Student', 'NOUN', 'attr']]

In [25]:
# Turn the rows into a table with one named column per annotation
df = pd.DataFrame(
    annotations,
    columns=["Text", "POS", "dep"],
)

In [26]:
# Display the full table (only ten rows, so no need to truncate)
df

Unnamed: 0,Text,POS,dep
0,My,PRON,poss
1,name,NOUN,nsubj
2,is,AUX,ROOT
3,Sofie,PROPN,attr
4,.,PUNCT,punct
5,I,PRON,nsubj
6,am,AUX,ROOT
7,a,DET,det
8,CogSci,PROPN,compound
9,Student,NOUN,attr


In [27]:
# Select a single column; the result is a pandas Series
df["POS"]

0     PRON
1     NOUN
2      AUX
3    PROPN
4    PUNCT
5     PRON
6      AUX
7      DET
8    PROPN
9     NOUN
Name: POS, dtype: object

In [28]:
# Count how often each POS label occurs, most frequent first
df["POS"].value_counts()

POS
PRON     2
NOUN     2
AUX      2
PROPN    2
PUNCT    1
DET      1
Name: count, dtype: int64

In [29]:
# Save the table; index=False leaves the row index out of the file
df.to_csv("annotations.csv", index=False)

In [30]:
# Read the saved file back in to check the round trip
input_df = pd.read_csv("annotations.csv")

In [31]:
# Display the re-loaded table to compare it with df
input_df

Unnamed: 0,Text,POS,dep
0,My,PRON,poss
1,name,NOUN,nsubj
2,is,AUX,ROOT
3,Sofie,PROPN,attr
4,.,PUNCT,punct
5,I,PRON,nsubj
6,am,AUX,ROOT
7,a,DET,det
8,CogSci,PROPN,compound
9,Student,NOUN,attr
