In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [20]:
import warnings
warnings.filterwarnings('ignore')
pip install googlesearch-python

In [21]:
# Step 1: Search Queries and Get a List of Query Links
from googlesearch import search

queries =["Canoo industry size and growth rate",
           "Canoo competitors market share and pricing strategies",
           "Key trends in Canoo's market",
           "Canoo financial performance revenue and profit margins"]
for query in queries:
    search_results = search(query,num_results=5)
    links =[]
    for result in search_results:
            if result not in links:
                links.append(result)

links

['https://investors.canoo.com/financial-information/income-statement',
 'https://simplywall.st/stocks/us/automobiles/nasdaq-goev/canoo/past',
 'https://www.tipranks.com/stocks/goev/financials',
 'https://www.wsj.com/market-data/quotes/GOEV/financials',
 'https://ycharts.com/companies/GOEV/gross_profit_margin',
 'https://in.investing.com/equities/hennessy-capital-acquisition-corp-financial-summary',
 'https://www.macrotrends.net/stocks/charts/GOEV/canoo/profit-margins']

In [23]:
# Step 2: Scrape Data from Web Links
data = []
queries = ["Canoo industry size and growth rate",
           "Canoo competitors market share and pricing strategies",
           "Key trends in Canoo's market",
           "Canoo financial performance revenue and profit margins"]
for query in queries:
    url = f"https://google.com/search?q={query}"  
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title         # Extract relevant information from the page
    text = soup.get_text()
    data.append({"Title": title, "Text": text})
data  

[{'Title': <title>Canoo industry size and growth rate - Google Search</title>,
  'Text': "Canoo industry size and growth rate - Google SearchGoogle×Please click here if you are not redirected within a few seconds.    AllNewsImagesVideos Maps Shopping Books Search tools    Any timeAny timePast hourPast 24 hoursPast weekPast monthPast yearAll resultsAll resultsVerbatimIncluding results for Canoe industry size and growth rateSearch only for Canoo industry size and growth rateCanoe Market Size, Growth & Trends Report [2024-2030] - LinkedInwww.linkedin.com › pulse › canoe-market-size-growth-trends-report-2024...1 Dec 2023 · By 2030, the global Canoe market size is projected to reach multimillion figures, displaying an unexpected compound annual growth rate between\xa0...People also askHow big is the canoeing market?What is the definition of a canoo?Canoo Inc. (GOEV) valuation measures & financial ... - Yahoo Financesg.finance.yahoo.com › quote › GOEV › key-statisticsReturn on equity (ttm), 

In [24]:
# we can also use link and scrape the data from links 
data = []
for link in links:
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser')
   
    title = soup.title         # Extract relevant information from the page
    text = soup.get_text()
    data.append({"Title": title, "Text": text})
data     

In [25]:
#store the scraped data in a CSV file
df = pd.DataFrame(data)
df.to_csv('canoo_data.csv', index=False)

In [10]:
# importing NLTK library
import nltk

In [26]:
# TF-IDF vectorization of text
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Load CSV data into a DataFrame
db = pd.read_csv('canoo_data.csv')
corpus = db['Text'].values

# Convert text data into numerical vectors
vectorizer = TfidfVectorizer(stop_words='english')

X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names_out())

In [27]:
# Visualizing the Document Term Matrix using TF-IDF
import pandas as pd
VectorizedText=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
VectorizedText['originalText']=pd.Series(corpus)
VectorizedText

In [28]:
pip install spacy

In [29]:
import spacy

# Load the English language model
nlp = spacy.load('en_core_web_sm')

# Function to generate summary
def generate_summary(text, max_length=100):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    summary = " ".join(sentences[:min(len(sentences), max_length)])
    return summary

# Generate summary
corpus = db['Text']
summary = generate_summary(corpus.iloc[0])
print(summary)

Canoo industry size and growth rate - Google SearchGoogle×Please click here if you are not redirected within a few seconds.     AllNewsImagesVideos Maps Shopping Books Search tools    Any timeAny timePast hourPast 24 hoursPast weekPast monthPast yearAll resultsAll resultsVerbatimIncluding results for Canoe industry size and growth rateSearch only for Canoo industry size and growth rateCanoe Market Size, Growth & Trends Report [2024-2030] - LinkedInwww.linkedin.com › pulse › canoe-market-size-growth-trends-report-2024...1 Dec 2023 · By 2030, the global Canoe market size is projected to reach multimillion figures, displaying an unexpected compound annual growth rate between ...People also askHow big is the canoeing market?What is the definition of a canoo?Canoo Inc. (GOEV) valuation measures & financial ... - Yahoo Financesg.finance.yahoo.com › quote › GOEV › key-statisticsReturn on equity (ttm), -170.77%. Income statement. Revenue (ttm), N/A. Revenue per share (ttm), N/A. Quarterly reve

In [30]:
corpus = db['Text']
summary = generate_summary(corpus.iloc[1])
print(summary)

Canoo competitors market share and pricing strategies - Google SearchGoogle×Please click here if you are not redirected within a few seconds.    AllNewsImagesBooks Maps Videos Shopping Search tools    Any timeAny timePast hourPast 24 hoursPast weekPast monthPast yearAll resultsAll resultsVerbatimCanoe Market Share 2024 Revenue and Price Trends, Size, Growth ...www.linkedin.com › pulse › canoe-market-share-2024-revenue-price-trend...24 Jan 2024 · The report provides an in-depth study of the major players in the market, their competitive landscape, product portfolios, strategies, and ...Canoo Inc Competitors 2023 | Stocks: GOEV - Macroaxiswww.macroaxis.com › competition › GOEVCanoo Inc competes with Goodyear Tire, Quantumscape Corp, Visteon Corp, Dorman Products, and Volcon; as well as few others. The company conducts business under ...GOEV's vs. Market share relative to its competitors, as of Q3 2023csimarket.com › stocks › competitionSEG2Canoo Inc's Q3 2023 quarter and 12 months market

In [None]:
# to see all query we have  to change iloc[0:4]

In [18]:
# We can also search by company name  and get a List of Query Links

In [15]:
# Search Queries and Get a List of Query Links
from googlesearch import search

query = "Canoo"
search_results = search(query, num_results=10)

links =[]
for result in search_results:
    if result not in links:
        links.append(result)
        
links

['https://www.canoo.com/',
 'https://www.linkedin.com/company/canoo',
 'https://en.wikipedia.org/wiki/Canoo',
 'https://www.instagram.com/canoo/?hl=en',
 'https://twitter.com/canoo?lang=en',
 'https://www.youtube.com/channel/UCjnvEVgMdkcQY980TZPVqkg',
 'https://www.usatoday.com/story/money/cars/2023/11/22/a-2024-canoo-lifestyle-vehicle-first-drive-review/71672558007/',
 'https://www.facebook.com/Canoo/',
 'https://www.theverge.com/2023/7/12/23792450/canoo-ev-nasa-artemis-defense-government']

In [31]:
##### Code For text cleaning Purpose###

import re

# Check if data is a string or bytes-like object
if not isinstance(data, str):
    data = str(data)

# Remove unwanted characters and whitespace
cleaned_data= re.sub(r'[^a-zA-Z0-9\s]', '', data)

# Remove extra whitespace
cleaned_data= re.sub(r'\s+', ' ', cleaned_data).strip()

# Print the cleaned data
print(cleaned_data)

In [17]:
# To extract from wikipedia 

In [79]:
import requests

def scrape_data_from_wikipedia(page_title):
    # URL for the Wikipedia API
    url = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {"action": "query","format": "json",
              "prop": "extracts","exintro": True,
              "titles": page_title}

    # Send a GET request to the Wikipedia API
    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = response.json()
        page_id = list(data["query"]["pages"].keys())[0]
        content = data["query"]["pages"][page_id]["extract"]
        return content
    else:
        print(f"Failed to fetch data from Wikipedia. Status code: {response.status_code}")
        return None

page_title = "Canoo"
content = scrape_data_from_wikipedia(page_title)
if content:
    print(content)


<p class="mw-empty-elt">

</p>
<p><b>Canoo Inc.</b> is an American automotive company based in Torrance, California, that develops and manufactures electric vehicles. Canoo's research &amp; development team is based in Michigan, in the Detroit region (Auburn Hills, Livonia), and production operations are in Justin, Texas. The company also plans to produce commercial electric vehicles such as vans for fleet, vehicle rental and ride sharing services.</p>
