In [1]:
!pip install spacy




In [2]:
import spacy
import pandas as pd

In [3]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 4.8 MB/s eta 0:00:03
     ---- ----------------------------------- 1.3/12.8 MB 4.3 MB/s eta 0:00:03
     ------- -------------------------------- 2.4/12.8 MB 4.4 MB/s eta 0:00:03
     --------- ------------------------------ 3.1/12.8 MB 4.1 MB/s eta 0:00:03
     ------------ --------------------------- 3.9/12.8 MB 3.9 MB/s eta 0:00:03
     -------------- ------------------------- 4.7/12.8 MB 4.0 MB/s eta 0:00:03
     ------------------ --------------------- 6.0/12.8 MB 4.2 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 4.1 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 4.1 MB/s eta 0:00:02
     -------------------------- ---------

In [4]:
nlp = spacy.load("en_core_web_sm")


In [5]:
text = "Although Nithish was exhausted after completing 110 sections of his AI course in a single day, he still managed to stay focused and study NLP concepts late into the night with remarkable dedication."
print(text)

Although Nithish was exhausted after completing 110 sections of his AI course in a single day, he still managed to stay focused and study NLP concepts late into the night with remarkable dedication.


In [6]:
doc = nlp(text)
pos_data = pd.DataFrame(columns=['word','pos_tag'])

In [7]:
for X in doc:
    pos_data = pd.concat([pos_data, pd.DataFrame.from_records([{'word':X.text,'pos_tag':X.pos_}])])

In [8]:
pos_data.head(15)

Unnamed: 0,word,pos_tag
0,Although,SCONJ
0,Nithish,PROPN
0,was,AUX
0,exhausted,ADJ
0,after,ADP
0,completing,VERB
0,110,NUM
0,sections,NOUN
0,of,ADP
0,his,PRON


In [9]:
pos_data_count = pos_data.groupby(['pos_tag'])['word'].count().sort_values(ascending=True)

In [10]:
pos_data_count.head(10)

pos_tag
AUX      1
CCONJ    1
NUM      1
PART     1
SCONJ    1
PUNCT    2
DET      2
ADV      2
PRON     2
PROPN    3
Name: word, dtype: int64

In [11]:
nouns_data = pos_data[pos_data['pos_tag'] == 'NOUN']
nouns_data


Unnamed: 0,word,pos_tag
0,sections,NOUN
0,course,NOUN
0,day,NOUN
0,concepts,NOUN
0,night,NOUN
0,dedication,NOUN


In [12]:
Adj_data = pos_data[pos_data['pos_tag'] == 'ADJ']
Adj_data


Unnamed: 0,word,pos_tag
0,exhausted,ADJ
0,single,ADJ
0,focused,ADJ
0,remarkable,ADJ


Named Entity Recognition(NER)

In [13]:
from spacy import displacy
from spacy import tokenizer
import re

In [14]:
text_1 = "On April 27, 2023, Elon Musk announced at a Tesla event in Berlin that SpaceX would partner with NASA to launch a new satellite from Cape Canaveral, while also expanding operations in Bengaluru, India."
print(text_1)

On April 27, 2023, Elon Musk announced at a Tesla event in Berlin that SpaceX would partner with NASA to launch a new satellite from Cape Canaveral, while also expanding operations in Bengaluru, India.


In [15]:
doc_1 = nlp(text_1)

In [16]:
for X in doc_1.ents:
    print(X.text,X.label_)
    

April 27, 2023 DATE
Elon Musk PERSON
Tesla NORP
Berlin GPE
SpaceX PERSON
NASA ORG
Cape Canaveral GPE
Bengaluru GPE
India GPE


In [17]:
displacy.render(doc_1,style = "dep")

In [18]:
displacy.render(doc_1,style = "ent")

In [19]:
text_1_clean = re.sub(r"[^\w\s]",'',text_1).lower()
print(text_1_clean)

on april 27 2023 elon musk announced at a tesla event in berlin that spacex would partner with nasa to launch a new satellite from cape canaveral while also expanding operations in bengaluru india


In [20]:
text_1_lower = nlp(text_1_clean)

In [None]:
displacy.render(text_1_lower,style="ent",jupyter= True)

In [24]:
bbc_news = pd.read_csv(r"C:\Users\hp\Desktop\Tech Mango\bbc_news.csv")

In [27]:
bbc_news.head()

Unnamed: 0,title,pubDate,guid,link,description
0,Ukraine: Angry Zelensky vows to punish Russian...,"Mon, 07 Mar 2022 08:01:56 GMT",https://www.bbc.co.uk/news/world-europe-60638042,https://www.bbc.co.uk/news/world-europe-606380...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Sun, 06 Mar 2022 22:49:58 GMT",https://www.bbc.co.uk/news/world-europe-60641873,https://www.bbc.co.uk/news/world-europe-606418...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',"Mon, 07 Mar 2022 00:14:42 GMT",https://www.bbc.co.uk/news/business-60623941,https://www.bbc.co.uk/news/business-60623941?a...,One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,"Mon, 07 Mar 2022 00:05:40 GMT",https://www.bbc.co.uk/news/uk-60579079,https://www.bbc.co.uk/news/uk-60579079?at_medi...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,"Mon, 07 Mar 2022 08:15:53 GMT",https://www.bbc.co.uk/news/business-60642786,https://www.bbc.co.uk/news/business-60642786?a...,Consumers are feeling the impact of higher ene...


In [29]:
bbc_title = pd.DataFrame(bbc_news['title'])

In [31]:
bbc_title.head()

Unnamed: 0,title
0,Ukraine: Angry Zelensky vows to punish Russian...
1,War in Ukraine: Taking cover in a town under a...
2,Ukraine war 'catastrophic for global food'
3,Manchester Arena bombing: Saffie Roussos's par...
4,Ukraine conflict: Oil price soars to highest l...


In [33]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re

In [35]:
bbc_title['lowercase'] = bbc_title['title'].str.lower() 

In [37]:
bbc_title.head()

Unnamed: 0,title,lowercase
0,Ukraine: Angry Zelensky vows to punish Russian...,ukraine: angry zelensky vows to punish russian...
1,War in Ukraine: Taking cover in a town under a...,war in ukraine: taking cover in a town under a...
2,Ukraine war 'catastrophic for global food',ukraine war 'catastrophic for global food'
3,Manchester Arena bombing: Saffie Roussos's par...,manchester arena bombing: saffie roussos's par...
4,Ukraine conflict: Oil price soars to highest l...,ukraine conflict: oil price soars to highest l...


In [41]:
bbc_title['no_puc_words'] = bbc_title['lowercase'].apply(lambda x: re.sub(r"[^\w\s]", "", str(x)))


In [43]:
bbc_title.head()

Unnamed: 0,title,lowercase,no_puc_words
0,Ukraine: Angry Zelensky vows to punish Russian...,ukraine: angry zelensky vows to punish russian...,ukraine angry zelensky vows to punish russian ...
1,War in Ukraine: Taking cover in a town under a...,war in ukraine: taking cover in a town under a...,war in ukraine taking cover in a town under at...
2,Ukraine war 'catastrophic for global food',ukraine war 'catastrophic for global food',ukraine war catastrophic for global food
3,Manchester Arena bombing: Saffie Roussos's par...,manchester arena bombing: saffie roussos's par...,manchester arena bombing saffie roussoss paren...
4,Ukraine conflict: Oil price soars to highest l...,ukraine conflict: oil price soars to highest l...,ukraine conflict oil price soars to highest le...


In [49]:
bbc_title['tokenize'] = bbc_title['no_puc_words'].apply(lambda X : word_tokenize(X))

In [52]:
bbc_title.head()

Unnamed: 0,title,lowercase,no_puc_words,tokenize
0,Ukraine: Angry Zelensky vows to punish Russian...,ukraine: angry zelensky vows to punish russian...,ukraine angry zelensky vows to punish russian ...,"[ukraine, angry, zelensky, vows, to, punish, r..."
1,War in Ukraine: Taking cover in a town under a...,war in ukraine: taking cover in a town under a...,war in ukraine taking cover in a town under at...,"[war, in, ukraine, taking, cover, in, a, town,..."
2,Ukraine war 'catastrophic for global food',ukraine war 'catastrophic for global food',ukraine war catastrophic for global food,"[ukraine, war, catastrophic, for, global, food]"
3,Manchester Arena bombing: Saffie Roussos's par...,manchester arena bombing: saffie roussos's par...,manchester arena bombing saffie roussoss paren...,"[manchester, arena, bombing, saffie, roussoss,..."
4,Ukraine conflict: Oil price soars to highest l...,ukraine conflict: oil price soars to highest l...,ukraine conflict oil price soars to highest le...,"[ukraine, conflict, oil, price, soars, to, hig..."


WORKING POS & NER WITH BBC DATASETS

In [63]:
token_raw = sum(bbc_title['tokenize'],[])

In [65]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")


In [74]:
# Ensure token_raw contains only valid strings
token_raw = []

for tokens in bbc_title['tokenize']:
    if isinstance(tokens, list):
        for token in tokens:
            if isinstance(token, str) and token.strip() != "":
                token_raw.append(token)


In [77]:
# Join and truncate first 100,000 characters (safe for spaCy)
text_data = ' '.join(token_raw)[:100000]
spacy_doc = nlp(text_data)

In [78]:
pos_def = pd.DataFrame(columns = ['token','pos_tags'])

In [81]:
for X in spacy_doc:
    pos_def = pd.concat([pos_def, pd.DataFrame.from_records([{'token':X.text,'pos_tag':X.pos_}])])


In [82]:
pos_def_count = pos_def.groupby(['token','pos_tag']).size().reset_index(name = 'counts')
pos_def_count.head(10)

Unnamed: 0,token,pos_tag,counts
0,01,NUM,1
1,02,NUM,2
2,1,NUM,4
3,10,NUM,11
4,100,NUM,3
5,100000,NUM,1
6,1000yearold,NOUN,1
7,100k,NUM,1
8,100yearold,PROPN,1
9,10bn,ADJ,1


In [86]:
Nouns = pos_def_count[pos_def_count.pos_tag=="NOUN"]
Nouns

Unnamed: 0,token,pos_tag,counts
6,1000yearold,NOUN,1
10,10man,NOUN,1
19,12day,NOUN,1
26,15inch,NOUN,1
33,18th,NOUN,1
...,...,...,...
5343,youngsters,NOUN,1
5347,youtube,NOUN,1
5349,z,NOUN,1
5359,zelensky,NOUN,1


In [88]:
Verbs = pos_def_count[pos_def_count.pos_tag=="VERB"]
Verbs

Unnamed: 0,token,pos_tag,counts
119,abandoned,VERB,3
122,abduct,VERB,1
123,abducted,VERB,1
133,abramovich,VERB,2
139,academies,VERB,1
...,...,...,...
5310,wounded,VERB,2
5312,wows,VERB,1
5315,write,VERB,2
5316,wrote,VERB,1


In [89]:
Ner_df = pd.DataFrame(columns=['token','NER_tag'])

In [91]:
for X in spacy_doc.ents:
    print(X.text,X.label_)

ukraine GPE
zelensky PERSON
russian NORP
2008 DATE
50 CARDINAL
netflix GPE
russia GPE
fourth ORDINAL
russia GPE
thousands CARDINAL
11 CARDINAL
russian NORP
ukraine GPE
ukraine GPE
putin PERSON
prorussian trolls ukraine maps new ORG
south five DATE
ukraine russian NORP
european NORP
ukraine russians NORP
boycotts NORP
kherson PERSON
russian NORP
belfast GPE
russian NORP
russia GPE
ukraine GPE
russia GPE
bbc ukrainian ORG
ukraine russian NORP
nato ORG
indian NORP
first ORDINAL
the week DATE
joe perry PERSON
gb ski pairings PERSON
two CARDINAL
uk GPE
five CARDINAL
fourth ORDINAL
england GPE
uk GPE
england GPE
2008 DATE
one CARDINAL
scottish NORP
capsizes ORG
norway ukraine irish medical FAC
one CARDINAL
scottish NORP
capsizes ORG
norway ukraine FAC
one CARDINAL
scottish NORP
australian NORP
new zealand GPE
one CARDINAL
scottish NORP
australian NORP
russian NORP
ivan kuliak PERSON
thousands CARDINAL
ukraine GPE
ukraine irish medical ORG
sumy ukraine india doctor ORG
all hours TIME
82 CARDI

In [93]:
displacy.render(spacy_doc,style="ent")