## `spaCy` and `pandas`

In [1]:
import os
import spacy

# Load the "en_core_web_sm" spaCy pipeline once; `nlp` is the callable used below to process text.

nlp = spacy.load("en_core_web_sm")

In [2]:
# Inspect the class of the loaded pipeline object.
type(nlp)

spacy.lang.en.English

In [3]:
# Run the pipeline on one sentence and inspect the type of the object it returns.
example = "This is a sentence written in English."
doc = nlp(example)
type(doc)

spacy.tokens.doc.Doc

In [4]:
# One output line per token: text, coarse POS (pos_), fine-grained tag (tag_), lemma.
for token in doc:
    print(" ".join((token.text, token.pos_, token.tag_, token.lemma_)))

This DET DT this
is AUX VBZ be
a DET DT a
sentence NOUN NN sentence
written VERB VBN write
in ADP IN in
English PROPN NNP English
. PUNCT . .


__Reading data with `pandas`__

In [5]:
import pandas as pd

In [6]:
# Path to the news dataset, relative to the notebook's working directory.
infile = os.path.join('..', 'data', 'fake_or_real_news.csv')

# Load the CSV and drop the 'Unnamed: 0' column in a single expression, so the
# cell builds `data` from scratch on every run instead of mutating it in place.
data = pd.read_csv(infile).drop(columns=['Unnamed: 0'])

# 5 random records. Only the last expression of a cell is rendered; a bare
# expression in the middle is evaluated and thrown away, so show it explicitly.
display(data.sample(5))

# count the number of records for every unique value in the "label" column
data["label"].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

__Filter rows by column value__

In [7]:
# Comparing a column with a value returns a boolean Series (True where
# "label" == "FAKE"), not a new DataFrame. Keep it in a variable and preview it
# explicitly, because a bare expression in the middle of a cell is never shown.
is_fake = data["label"] == "FAKE"
display(is_fake.head())

# Filter rows: indexing the frame with a boolean condition keeps the rows where it is True
fake_news_df = data[is_fake]
real_news_df = data[data["label"] == "REAL"]

# Only fake news left
fake_news_df["label"].value_counts()

FAKE    3164
Name: label, dtype: int64

__Counting features in data__

In [9]:
# Tally every token tagged "JJ" across the fake-news titles.
adj_count = 0

# nlp.pipe processes the titles in batches of 500 rather than one call per title
for doc in nlp.pipe(fake_news_df["title"], batch_size=500):
    for token in doc:
        if token.tag_ != "JJ":
            continue
        adj_count += 1

adj_count

1794

## Sentiment with `spaCy`

In [None]:
import os
import pandas as pd
import spacy
from spacytextblob import
nlp