<a href="https://colab.research.google.com/github/Sambhaji6529/Web-Scraping/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import feedparser
from transformers import pipeline
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Step 1: Web scrape headlines from CNBC
def scrape_cnbc_headlines(url=None, timeout=10):
    """Scrape headline text from a CNBC search-results page.

    Args:
        url: Page to fetch. Defaults to the CNBC search for "green hydrogen".
        timeout: Seconds to wait for the server, so a stalled connection
            cannot hang the notebook.

    Returns:
        list[str]: Stripped text of every non-empty ``<h3>`` element in the
        fetched HTML, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    if url is None:
        url = "https://www.cnbc.com/search/?query=green%20hydrogen&qsearchterm=green%20hydrogen"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently parsing it for <h3> tags.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): the recorded run of this notebook printed no CNBC headlines;
    # the results may not be present in the static HTML - confirm.
    stripped = (heading.text.strip() for heading in soup.find_all('h3'))
    # Drop headings that contain only whitespace.
    return [text for text in stripped if text]


In [None]:
# Call the function to scrape headlines.
# The result is a plain list, so it is kept under its own name only;
# `df` is reserved for DataFrames in the rest of the notebook.
cnbc_headlines = scrape_cnbc_headlines()

# Print the scraped headlines
print("CNBC Headlines:")
for headline in cnbc_headlines:
    print(headline)


CNBC Headlines:


In [None]:
# Step 2: Fetch news headlines from Google News
def fetch_google_news_headlines():
    """Return the title of every entry in the Google News RSS search feed.

    The feed URL searches for "green hydrogen" with the hl=en-IN, gl=IN and
    ceid=IN:en query parameters.
    """
    rss_url = "https://news.google.com/rss/search?q=green%20hydrogen&hl=en-IN&gl=IN&ceid=IN:en"
    parsed_feed = feedparser.parse(rss_url)
    titles = []
    for item in parsed_feed.entries:
        titles.append(item.title)
    return titles

# Call the function to fetch headlines from Google News.
# The result is a list holding the title of each feed entry.
google_news_headlines = fetch_google_news_headlines()



In [None]:
# Sanity check: how many Google News headlines were fetched.
print(f"Length of google_news_headlines: {len(google_news_headlines)}")

Length of google_news_headlines: 98


In [None]:
# Debug check of the kernel namespace. create_dataframe is only defined in
# later cells, so on a fresh top-to-bottom run this prints 'no'.
print(f"create_dataframe function defined: {'yes' if 'create_dataframe' in globals() else 'no'}")

create_dataframe function defined: yes


In [None]:
def create_dataframe(headlines, source):
    """Assemble a DataFrame of headlines with sentiment and organization columns.

    Columns, in order: 'Headline', 'Source', 'Sentiment Score' (from
    calculate_sentiment_scores) and 'Organization Names' (from
    identify_organization_names).

    Note: later cells define other functions with this same name; whichever
    definition ran last is the one in effect.
    """
    base_columns = {'Headline': headlines, 'Source': source}
    frame = pd.DataFrame(base_columns)

    sentiment = calculate_sentiment_scores(headlines)
    frame['Sentiment Score'] = sentiment

    organizations = identify_organization_names(headlines)
    frame['Organization Names'] = organizations

    return frame

In [None]:
# Specify the sentiment analysis model
# NOTE: calculate_sentiment_scores() (defined in a later cell) creates its own
# pipeline and does not read this variable.
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


In [None]:
# Debug check of the kernel namespace. identify_organization_names is only
# defined in later cells, so on a fresh top-to-bottom run this prints 'no'.
print(f"identify_organization_names function defined: {'yes' if 'identify_organization_names' in globals() else 'no'}")

identify_organization_names function defined: yes


In [None]:
# Install spaCy and download the "en_core_web_sm" model that is loaded with
# spacy.load("en_core_web_sm") further down.
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span

In [None]:
# Lazily-built spaCy model and matcher, shared by every call. Loading the
# model is expensive, so it is done once rather than on each invocation.
_PROPN_PAIR_PIPELINE = {}


def _get_propn_pair_pipeline():
    """Load the spaCy model and build the proper-noun-pair matcher once."""
    if not _PROPN_PAIR_PIPELINE:
        nlp = spacy.load("en_core_web_sm")
        matcher = Matcher(nlp.vocab)
        # Pattern used as a stand-in for organization names: two consecutive
        # tokens tagged as proper nouns.
        matcher.add("ORG", [[{"POS": "PROPN"}, {"POS": "PROPN"}]])
        _PROPN_PAIR_PIPELINE["nlp"] = nlp
        _PROPN_PAIR_PIPELINE["matcher"] = matcher
    return _PROPN_PAIR_PIPELINE["nlp"], _PROPN_PAIR_PIPELINE["matcher"]


def identify_organization_names(text):
    """Return every pair of adjacent proper nouns found in ``text``.

    Args:
        text: A single string (for example one headline).

    Returns:
        list[str]: The text of each two-token proper-noun match, in the order
        the matcher reports them. Empty for empty or missing input.
    """
    # Nothing to analyse; also avoids loading the model needlessly.
    if not text:
        return []

    nlp, matcher = _get_propn_pair_pipeline()
    doc = nlp(text)
    return [doc[start:end].text for _match_id, start, end in matcher(doc)]

In [None]:
# Debug check: confirm the container type of the scraped CNBC headlines.
print(type(cnbc_headlines))

<class 'list'>


In [None]:
# Start from an empty frame that only declares its two columns; the
# following cell assigns the 'Organization Names' column.
df = pd.DataFrame(columns=["Headline", "Organization Names"])

In [None]:
# Collect the proper-noun matches from every CNBC headline into one flat list.
org_names = []
for headline in cnbc_headlines:
    # Pass the single headline string: identify_organization_names() runs
    # spaCy on its argument, which must be text rather than the whole list.
    org_names.extend(identify_organization_names(headline))

# NOTE(review): org_names is a flat list of matches, so its entries are not
# aligned one-to-one with the headlines.
df['Organization Names'] = org_names

In [None]:
import pandas as pd

def create_dataframe(headlines, sources):
    """Build a two-column DataFrame of headlines and where they came from.

    Args:
        headlines: Iterable of headline strings.
        sources: Either one source label per headline, or a single string that
            applies to every headline.

    Returns:
        pandas.DataFrame: Columns 'Headline' and 'Source', one row per headline.

    Raises:
        ValueError: If ``sources`` is a sequence whose length differs from the
            number of headlines.
    """
    headlines = list(headlines)

    # A lone string is one label for all rows; comparing its character count
    # with the number of headlines would be meaningless.
    if isinstance(sources, str):
        sources = [sources] * len(headlines)
    else:
        sources = list(sources)

    # Check if headlines and sources have the same length
    if len(headlines) != len(sources):
        raise ValueError("Length of headlines and sources must be the same.")

    return pd.DataFrame({'Headline': headlines, 'Source': sources})

# Build the combined frame when at least one source returned headlines.
# Concatenating with an empty list is a no-op, so one call covers the
# "both", "CNBC only" and "Google News only" cases.
if cnbc_headlines or google_news_headlines:
    df = create_dataframe(
        cnbc_headlines + google_news_headlines,
        ['CNBC'] * len(cnbc_headlines) + ['Google News'] * len(google_news_headlines),
    )
else:
    print("No headlines available from either source.")


In [None]:
# Step 3: Create Pandas DataFrame
# Handle the "nothing fetched" case first; otherwise a single call builds the
# frame, because adding an empty list leaves the other source's rows unchanged.
if not (cnbc_headlines or google_news_headlines):
    print("No headlines available from either source.")
else:
    df = create_dataframe(
        cnbc_headlines + google_news_headlines,
        ['CNBC'] * len(cnbc_headlines) + ['Google News'] * len(google_news_headlines),
    )

In [None]:
# Step 4: Use sentiment analysis model to compute sentiment scores

# Analyzer instances keyed by model name, so each model is loaded only once
# instead of on every call.
_SENTIMENT_ANALYZERS = {}


def calculate_sentiment_scores(headlines, model="distilbert-base-uncased-finetuned-sst-2-english"):
    """Return one signed sentiment score per headline.

    The classifier reports a label and the confidence of that label. The
    confidence alone is high for strongly negative text as well, so the score
    is negated unless the label is POSITIVE. This keeps averages meaningful.

    Args:
        headlines: Iterable of headline strings.
        model: Name of the sentiment model to load. It is assumed to emit
            POSITIVE/NEGATIVE labels; any other label is treated as negative.

    Returns:
        list[float]: Scores in headline order; positive for POSITIVE labels,
        negative otherwise. Empty when there are no headlines.
    """
    headlines = list(headlines)
    if not headlines:
        return []

    if model not in _SENTIMENT_ANALYZERS:
        _SENTIMENT_ANALYZERS[model] = pipeline("sentiment-analysis", model=model)

    # One call for the whole list; the result holds one dict per headline.
    results = _SENTIMENT_ANALYZERS[model](headlines)
    return [
        result['score'] if result['label'].upper() == 'POSITIVE' else -result['score']
        for result in results
    ]

In [None]:
# Step 5: Use NER model to identify organization names

# Cache for the loaded spaCy model so repeated calls do not reload it.
_NER_MODEL = {}


def identify_organization_names(headlines):
    """Find organization names in each headline with spaCy's NER.

    Args:
        headlines: Iterable of headline strings. A single string is treated
            as one headline.

    Returns:
        list[str]: One entry per headline, in order, holding the entities
        labelled ORG joined by ", " (an empty string when none are found).
        The one-entry-per-headline shape lets the result be assigned directly
        as a DataFrame column.
    """
    if isinstance(headlines, str):
        headlines = [headlines]
    headlines = list(headlines)
    if not headlines:
        return []

    if "nlp" not in _NER_MODEL:
        _NER_MODEL["nlp"] = spacy.load("en_core_web_sm")
    nlp = _NER_MODEL["nlp"]

    # nlp.pipe processes the texts as a stream and yields one Doc per input.
    return [
        ", ".join(ent.text for ent in doc.ents if ent.label_ == "ORG")
        for doc in nlp.pipe(headlines)
    ]

In [None]:
# Step 6: Create Pandas DataFrame with headlines, news date, source, sentiment score, and organization names
import pandas as pd

# Step 6: Create Pandas DataFrame with headlines, news date, source, sentiment score, and organization names
def create_dataframe(headlines, source, news_date=None):
    """Build the full headline table.

    Args:
        headlines: Iterable of headline strings.
        source: One source label per headline, or a single label for all rows.
        news_date: One date per headline, or a single date for all rows. When
            omitted or empty, the 'News Date' column is filled with NaT, so a
            caller without dates does not trigger a column-length mismatch.

    Returns:
        pandas.DataFrame: Columns 'Headline', 'Source', 'News Date',
        'Sentiment Score' and 'Organization Names'.
    """
    headlines = list(headlines)

    dates_missing = news_date is None or (
        not isinstance(news_date, str)
        and hasattr(news_date, '__len__')
        and len(news_date) == 0
    )
    if dates_missing:
        news_date = [pd.NaT] * len(headlines)

    df = pd.DataFrame({'Headline': headlines, 'Source': source, 'News Date': news_date})
    df['Sentiment Score'] = calculate_sentiment_scores(headlines)
    df['Organization Names'] = identify_organization_names(headlines)
    return df


In [None]:
# Step 7: Generate CSV table
def generate_csv(df, path='green_hydrogen_news.csv'):
    """Write ``df`` to a CSV file without the index column.

    Args:
        df: DataFrame to export.
        path: Destination file. Defaults to 'green_hydrogen_news.csv' in the
            current working directory.
    """
    df.to_csv(path, index=False)


In [None]:
# Step 8: Transfer CSV data to Google Sheet using Google Sheets Python API
# You'll need to follow Google Sheets API documentation for this step

In [None]:
# Install or upgrade the Google API client package for the Google Sheets step
# (Step 8).
pip install --upgrade google-api-python-client


In [None]:
# Preview the first rows of the headlines frame.
df.head()

Unnamed: 0,Headline,Source
0,Govt allocates Rs 455 crore for green hydrogen...,Google News
1,Govt shares guidelines for Green Hydrogen Pilo...,Google News
2,MNRE Launches Pilot Projects to Implement Gree...,Google News
3,MNRE releases guidelines for pilot projects on...,Google News
4,India Releases Guidelines for Green Hydrogen i...,Google News


In [None]:
# Show the frame's columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Headline  98 non-null     object
 1   Source    98 non-null     object
dtypes: object(2)
memory usage: 1.7+ KB


In [None]:
# Dates (month/day/year) to attach to the frame. They must be strings:
# written bare, 1/29/2024 is the arithmetic expression (1 / 29) / 2024 and
# evaluates to a float, not a date.
news_dates = ['1/29/2024', '1/17/2024', '1/15/2024', '1/31/2024', '8/12/2023', '11/9/2023', '7/4/2023']
parsed_dates = pd.to_datetime(news_dates, format='%m/%d/%Y')

# Add the 'News Date' column (the name the later cells look for).
# There may be fewer dates than rows, so only the first rows receive a date;
# assigning an index-aligned Series leaves the remaining rows as NaT rather
# than failing on a length mismatch.
# NOTE(review): the dates are applied to the first rows in order - confirm
# this is the intended headline-to-date mapping.
dated_rows = min(len(parsed_dates), len(df))
df['News Date'] = pd.Series(parsed_dates[:dated_rows], index=df.index[:dated_rows])


In [None]:
# List the columns currently present in the frame.
print(df.columns)

# Report the missing column first; otherwise preview its leading values.
if 'News Date' not in df.columns:
    print("Column 'News Date' not found in DataFrame.")
else:
    print(df['News Date'].head())


Index(['Headline', 'Source'], dtype='object')
Column 'News Date' not found in DataFrame.


In [None]:
# Step 9: Generate graph for week-wise trend of average sentiment scores
def _weekly_average_sentiment(df):
    """Return the mean 'Sentiment Score' per calendar week, keyed by week start."""
    news_dates = pd.to_datetime(df['News Date'], format='%Y-%m-%d')
    # Key on the week's start date rather than a bare week number, so the same
    # week number in different years is not merged and the result stays in
    # chronological order. (Series.dt.week no longer exists in pandas 2.x.)
    week_start = news_dates.dt.to_period('W').dt.start_time
    # Rows without a date get a NaT key and are left out by groupby.
    return df['Sentiment Score'].groupby(week_start).mean().rename_axis('Week')


def generate_sentiment_trend_graph(df):
    """Plot the weekly average of 'Sentiment Score' over time.

    Args:
        df: DataFrame with a 'News Date' column (datetimes or 'YYYY-MM-DD'
            strings; missing values allowed) and a numeric 'Sentiment Score'
            column. The frame is not modified.

    Returns:
        pandas.Series: The plotted weekly averages, indexed by week start date.
    """
    weekly_sentiment = _weekly_average_sentiment(df)

    # Plot the graph
    plt.plot(weekly_sentiment.index, weekly_sentiment.values)
    plt.xlabel('Week')
    plt.ylabel('Average Sentiment Score')
    plt.title('Week-wise Trend of Average Sentiment Score')
    plt.show()
    return weekly_sentiment

In [None]:
# Step 10: Generate word cloud map with organization names
def generate_wordcloud(organizations):
    """Render a word cloud from organization names.

    Args:
        organizations: Iterable of name strings. Entries that are not strings
            or are blank are ignored.

    Returns:
        None. When no usable names remain, a message is printed and nothing is
        drawn, because the word cloud cannot be generated from empty text.
    """
    names = [name.strip() for name in organizations if isinstance(name, str) and name.strip()]
    if not names:
        print("No organization names available; skipping word cloud.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(names))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# Main function to orchestrate the workflow
def main():
    """Run the pipeline end to end: fetch, tabulate, export and plot.

    Prints a message and stops early when neither source returned headlines.
    """
    # Step 1: Scrape CNBC headlines
    cnbc_headlines = scrape_cnbc_headlines()

    # Step 2: Fetch Google News headlines
    google_news_headlines = fetch_google_news_headlines()

    headlines = cnbc_headlines + google_news_headlines
    if not headlines:
        print("No headlines available from either source.")
        return

    sources = ['CNBC'] * len(cnbc_headlines) + ['Google News'] * len(google_news_headlines)
    # Neither fetcher returns publication dates, so pass one empty placeholder
    # per headline; an empty list would not match the other columns' length.
    news_dates = [None] * len(headlines)

    # Step 3: Create Pandas DataFrame
    df = create_dataframe(headlines, sources, news_dates)

    # Step 7: Generate CSV
    generate_csv(df)

    # Step 8: Transfer data to Google Sheet

    # Step 9: Generate sentiment trend graph
    generate_sentiment_trend_graph(df)

    # Step 10: Generate word cloud
    #generate_wordcloud(df['Organization Names'])

# Entry point: run the whole workflow when this code is executed directly.
if __name__ == "__main__":
    main()