In [1]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import spacy
from spacy import displacy

In [4]:
# Article to scrape
url = "https://straitsresearch.com/article/japan-s-leadership-in-the-global-photovoltaic-market"

# Bound the wait so a stalled server cannot hang the kernel, and fail fast on
# HTTP errors instead of parsing an error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [5]:
# Parse the raw response bytes with Python's built-in HTML parser
soup=BeautifulSoup(response.content, "html.parser")

In [6]:
# Preview only the beginning of the parsed page; printing the entire document
# floods the notebook output and bloats the saved file.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible">
   <script async="" crossorigin="anonymous" src="https://pagead2.googlesyndication.com/pagead/js/adsbygoogle.js?client=ca-pub-6621186845735499">
   </script>
   <title>
    Japan's solar innovation &amp; growth, trends and future plans
   </title>
   <meta content="Japan's solar innovation &amp; growth, trends and future plans" name="title"/>
   <meta content="With a 9.2% CAGR, Japan aims for 117.6 GW PV capacity by 2030, backed by robust government support and projects like the Setouchi Kirei Mega Solar Power Plant." name="description"/>
   <meta content="photovoltaic market, japan, solar panels, solar power plant, solar energy" name="keywords"/>
   <link href="https://straitsresearch.com/article/japan-s-leadersh

In [7]:
# Extract main content
main_content = soup.find('div', {'id': 'contents'})
if main_content is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # rather than a "'NoneType' object is not callable" error in the loop below.
    raise ValueError("Could not find <div id='contents'> in the page; the site layout may have changed")

# Remove links, emphasis and span elements. decompose() discards the text
# inside each element as well, so sentences that contained a link lose those words.
for element in main_content.find_all(['a', 'em', 'span']):
    element.decompose()

In [8]:
# Text of every <p> under the main content: each text fragment is stripped
# and the fragments of one paragraph are joined with single spaces.
text_list = [
    p_tag.get_text(separator=' ', strip=True)
    for p_tag in main_content.find_all('p')
]

In [9]:
# One row per scraped paragraph, held in a single text column
df = pd.DataFrame(data=text_list, columns=["Paragraph Text"])

# Save as csv file (without the row index)
df.to_csv('16_paragraphs.csv', index=False)

In [10]:
# Show the scraped paragraphs (one row per <p> element collected above)
df

Unnamed: 0,Paragraph Text
0,Japan is a world leader in the photovoltaic (P...
1,Japan is a world leader in the photovoltaic (P...
2,The Japanese solar energy market is expected t...
3,Japan's photovoltaic industry has been growing...
4,Space-Based Solar Power and Perovskite Solar C...
5,Japan's photovoltaic market is one of the larg...
6,The plan clarifies the government's policy of ...
7,"In conclusion, Japan is leading the global pho..."
8,Japan's success in the photovoltaic market ser...


In [11]:
# Load the en_core_web_sm pipeline; the cells below use it for named entities,
# lemmas, part-of-speech tags and dependency parses.
nlp = spacy.load("en_core_web_sm")

In [12]:
# Lemmas that mark a sentence (or token) as describing a relationship.
# They are compared against spaCy's token.lemma_ in the cells below.
relationship_keywords = {
    # working together
    'collaboration',
    'cooperation',
    'partner',
    'engage',
    'agreement',
    'effort',
    # money and backing
    'support',
    'fund',
    'finance',
    'invest',
    'interest',
    'share',
    # trade and supply
    'supply',
    'export',
    'import',
    # growth and change
    'develop',
    'promote',
    'transition',
    'enhance',
}

In [13]:
def extract_entities_and_relationships(text):
    """Parse ``text`` with spaCy and collect named entities and keyword triples.

    Parameters
    ----------
    text : str
        Text to run through the module-level ``nlp`` pipeline.

    Returns
    -------
    tuple
        ``(entities, relationships)``:

        * ``entities`` is a list of ``(entity_text, label)`` pairs, limited to
          entities labelled ORG, GPE or FAC.
        * ``relationships`` is a list of ``(subject, keyword, object)`` string
          triples. A triple is emitted only for a token whose lemma is in
          ``relationship_keywords`` and which has both a direct ``nsubj`` child
          and a direct ``dobj`` child; the first child of each kind is used.
    """
    parsed = nlp(text)

    wanted_labels = ['ORG', 'GPE', 'FAC']
    entities = []
    for ent in parsed.ents:
        if ent.label_ in wanted_labels:
            entities.append((ent.text, ent.label_))

    relationships = []
    for token in parsed:
        # Only tokens whose lemma is a relationship keyword can anchor a triple
        if token.lemma_ not in relationship_keywords:
            continue

        subjects = [child for child in token.children if child.dep_ == "nsubj"]
        objects = [child for child in token.children if child.dep_ == "dobj"]
        if not (subjects and objects):
            continue

        # (subject, relationship, object)
        relationships.append((subjects[0].text, token.text, objects[0].text))

    return entities, relationships

In [14]:
# Run the extractor once per paragraph; each result is an (entities, relationships) pair.
extraction_results = df['Paragraph Text'].apply(extract_entities_and_relationships)

# Split the pairs into two columns element-wise. Unlike unpacking zip(*...),
# this also works when df has no rows (zip(*[]) yields nothing to unpack).
df['Entities'] = extraction_results.map(lambda pair: pair[0])
df['Relationships'] = extraction_results.map(lambda pair: pair[1])

In [15]:
# Show each paragraph next to the entities and relationships extracted from it
df[['Paragraph Text', 'Entities', 'Relationships']]

Unnamed: 0,Paragraph Text,Entities,Relationships
0,Japan is a world leader in the photovoltaic (P...,"[(Japan, GPE), (PV, FAC), (Japan, GPE), (Japan...",[]
1,Japan is a world leader in the photovoltaic (P...,"[(Japan, GPE), (PV, FAC), (Japan, GPE)]",[]
2,The Japanese solar energy market is expected t...,"[(Japan, GPE)]",[]
3,Japan's photovoltaic industry has been growing...,"[(Japan, GPE), (Japan, GPE)]",[]
4,Space-Based Solar Power and Perovskite Solar C...,"[(Japan, GPE), (US, GPE), (China, GPE), (Japan...",[]
5,Japan's photovoltaic market is one of the larg...,"[(Japan, GPE), (FIT, ORG), (Japan, GPE), (FIT,...",[]
6,The plan clarifies the government's policy of ...,"[(Japan, GPE)]",[]
7,"In conclusion, Japan is leading the global pho...","[(Japan, GPE), (Japan, GPE)]",[]
8,Japan's success in the photovoltaic market ser...,"[(Japan, GPE), (Japan, GPE), (Japan, GPE)]",[]


In [16]:
# Walk the paragraphs in order: print the text, render spaCy's dependency and
# entity visualisations sentence by sentence, then print what was stored in df.
paragraph_columns = zip(df.index, df['Paragraph Text'], df['Entities'], df['Relationships'])

for index, paragraph, found_entities, found_relationships in paragraph_columns:
    print(f"\nParagraph {index + 1}:")
    print(paragraph)

    # Re-parse the paragraph so displacy has sentence spans to draw
    doc = nlp(paragraph)
    sentence_spans = list(doc.sents)

    # Dependency parse first, in compact layout
    displacy.render(sentence_spans, style='dep', jupyter=True, options={'compact': True})

    # Then the named-entity highlighting
    displacy.render(sentence_spans, style="ent", jupyter=True)

    # Finally the values extracted earlier
    print("Entities:", found_entities)
    print("Relationships:", found_relationships)


Paragraph 1:
Japan is a world leader in the photovoltaic (PV) market, with a significant share of the global market since about 45% of photovoltaic cells are manufactured in Japan. The country has been at the forefront of solar energy innovation and has been investing heavily in the development of solar PV technology. The Japanese solar energy market is expected to witness more than a 9.2% CAGR during the forecast period (2023-2028). Factors such as solar PV projects under construction in the pipeline and planning stages are expected to boost the cumulative installed solar energy capacity during the forecast period. Let's dive deeper into trends, projects, and strategies with which Japan is leading the . Recent Trends in Japan's solar PV industry Japan's photovoltaic industry has been growing steadily over the years, and there have been several recent developments in the industry. Here are some of the recent developments in Japan's solar PV industry: Japan's photovoltaic market has be



Entities: [('Japan', 'GPE'), ('PV', 'FAC'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('US', 'GPE'), ('China', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('US', 'GPE'), ('China', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('GW', 'ORG'), ("the Ministry of the Environment's", 'ORG'), ('FIT', 'ORG'), ('FIP', 'ORG'), ('Japan', 'GPE'), ('AC', 'ORG'), ('Japan', 'GPE'), ('Photovoltaics Space-Based Solar Power', 'ORG'), ('Japan', 'GPE'), ('US', 'GPE'), ('China', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('FIT', 'ORG'), ('Japan', 'GPE'), ('FIT', 'ORG'), ('PV', 'FAC'), ('Japan', 'GPE'), ('Japan', 'GPE'), ('Setouchi', 'GPE'), ('Okayama', 'GPE'), ('Japan', 'GPE'), ('MW', 'ORG'), ('MW', 'ORG'), ('Chiba', 'ORG'), ('MW', 'ORG'), ('Fukushima', 'GPE'), ('MW', 'ORG'), ('Oita', 'ORG'), ('MW', 'ORG'), ('Hokkaido', 'GPE'), ('MW', 'ORG'), ('Kumamoto', 'GPE'), ('Kumamoto', 'GPE'), ('MW'

Entities: [('Japan', 'GPE'), ('PV', 'FAC'), ('Japan', 'GPE')]
Relationships: []

Paragraph 3:
The Japanese solar energy market is expected to witness more than a 9.2% CAGR during the forecast period (2023-2028). Factors such as solar PV projects under construction in the pipeline and planning stages are expected to boost the cumulative installed solar energy capacity during the forecast period. Let's dive deeper into trends, projects, and strategies with which Japan is leading the .


Entities: [('Japan', 'GPE')]
Relationships: []

Paragraph 4:
Japan's photovoltaic industry has been growing steadily over the years, and there have been several recent developments in the industry. Here are some of the recent developments in Japan's solar PV industry:


Entities: [('Japan', 'GPE'), ('Japan', 'GPE')]
Relationships: []

Paragraph 5:
Space-Based Solar Power and Perovskite Solar Cells: Japan is making progress in solar, offshore wind, storage, and hydrogen technology. The country is a leader in solar PV innovation and is now looking to grow its industry further amid US-China tensions and a shift to renewables. Japan Targets Adoption of Flexible Solar Panels by 2030: Japan aims to popularize the use of flexible solar cells by 2030, with the government planning to support mass production by domestic companies and introduce them at public buildings, train stations, schools, and elsewhere.


Entities: [('Japan', 'GPE'), ('US', 'GPE'), ('China', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE')]
Relationships: []

Paragraph 6:
Japan's photovoltaic market is one of the largest in the world, with a cumulative installed capacity of over 70 GW as of 2023. The country has been investing heavily in solar PV technology, with the government providing incentives for the installation of solar panels. The government's feed-in-tariff (FIT) scheme has been instrumental in driving the growth of the solar PV market in Japan. The FIT scheme provides a guaranteed price for solar energy generated by residential and commercial PV systems, making it an attractive investment for homeowners and businesses.


Entities: [('Japan', 'GPE'), ('FIT', 'ORG'), ('Japan', 'GPE'), ('FIT', 'ORG'), ('PV', 'FAC')]
Relationships: []

Paragraph 7:
The plan clarifies the government's policy of working with companies and others to build supply networks and encourage widespread use of the panels. Japan is also investing in other innovative solar PV technologies, such as space-based solar power and flexible perovskite solar cells.


Entities: [('Japan', 'GPE')]
Relationships: []

Paragraph 8:
In conclusion, Japan is leading the global photovoltaic market with its innovative solar PV technologies, major photovoltaic projects, and companies that manufacture photovoltaics. The country's focus and efforts in renewable energy generation and government incentives for solar energy have been instrumental in driving the growth of the photovoltaic market in Japan.


Entities: [('Japan', 'GPE'), ('Japan', 'GPE')]
Relationships: []

Paragraph 9:
Japan's success in the photovoltaic market serves as an example for other countries to follow in promoting photovoltaic technology. Japan is making tremendous progress in solar PV innovation and is now aiming to be at the top. With continued investment and innovation, Japan's photovoltaic industry is poised for unprecedented growth in the coming years.


Entities: [('Japan', 'GPE'), ('Japan', 'GPE'), ('Japan', 'GPE')]
Relationships: []


In [17]:
# Parallel accumulators: the same position across the five lists describes
# one extracted relationship row.
entity1_list, relationship_list, entity2_list, date_list, money_list = (
    [] for _ in range(5)
)

In [18]:
# Iterate through each paragraph
for index, row in df.iterrows():
    doc = nlp(row['Paragraph Text'])  # Process the paragraph with spaCy

    for sent in doc.sents:
        # Extract named entities (countries, organizations)
        entities = []
        money_in_sentence = []

        for ent in sent.ents:
            if 'Inititative' in ent.text.lower(): 
                ent = ent._replace(label_='POL')
                entities.append(ent)
            elif ent.label_ in ['GPE', 'ORG', 'POL', 'MONEY']:
                entities.append(ent)

            # Collect monetary amounts
            if ent.label_ == "MONEY":
                money_in_sentence.append(ent.text)
        
        # Look for verbs or specific relationship nouns in the sentence
        verbs = [token for token in sent if token.pos_ == "VERB"]
        keywords_in_sentence = [token.text for token in sent if token.lemma_ in relationship_keywords]
        
        # Extract dates
        dates = [ent.text for ent in sent.ents if ent.label_ == "DATE"]

        # If we find two entities and a relationship keyword, we assume a relationship
        if len(entities) >= 2 and keywords_in_sentence:
            entity1_list.append(entities[0].text)

            # Concatenate the relationship noun and verb (if available) into one string
            relationship_info = keywords_in_sentence[0]
            if verbs:
                relationship_info += " (verb: " + verbs[0].lemma_ + ")"

            relationship_list.append(relationship_info)  # Store concatenated info
            entity2_list.append(entities[1].text)
            date_list.append(dates[0] if dates else "n/a")
            money_list.append(money_in_sentence[0] if money_in_sentence else "n/a")

In [19]:
# DataFrame
relationships_df = pd.DataFrame({
    "Entity1": entity1_list,
    "Relationship": relationship_list,
    "Entity2": entity2_list,
    "Date": date_list,
    "Money": money_list
})

In [20]:
# Show the extracted relationship rows
relationships_df

Unnamed: 0,Entity1,Relationship,Entity2,Date,Money
0,Japan,share (verb: manufacture),Japan,,
1,Japan,share (verb: grow),Japan,the years,
2,Japan,supported (verb: estimate),GW,2022,
3,Japan,supply (verb: expect),AC,fiscal year 2030,
4,Japan,share (verb: manufacture),Japan,,


In [21]:
# Export to csv
# index=False keeps the DataFrame's row index out of the file
relationships_df.to_csv('16_relationships.csv', index=False)