In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from transformers import pipeline

# Read the scraped pages and discard every row whose 'information'
# cell is missing (NaN), so only rows with a value there remain.
data = pd.read_csv('Scrape data from web links.csv').dropna(
    subset=['information']
)

# Fit a TF-IDF model (English stop words removed) on the remaining text
# and encode each document as one row of the resulting matrix.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data['information'])

# Function to perform text summarization
def summarize_text(text, sentences_count=2):
    """Return an extractive LSA summary of ``text``.

    Args:
        text: Plain-text document to summarize.
        sentences_count: Number of sentences requested from the
            summarizer. Defaults to 2, the value that used to be
            hard-coded, so existing one-argument callers are unaffected.

    Returns:
        The sentences chosen by the summarizer, joined by single spaces.
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentences_count=sentences_count)
    return " ".join(str(sentence) for sentence in summary)

# Function to perform query and summarization
def perform_query(query, tfidf_matrix, vectorizer, data):
    """Summarize the document in ``data`` that best matches ``query``.

    Args:
        query: Free-text query string.
        tfidf_matrix: Document-term matrix compared against the query
            vector. Row ``i`` must describe the ``i``-th row of ``data``.
        vectorizer: Fitted vectorizer; its ``transform`` encodes the query.
        data: DataFrame with an ``'information'`` column holding the text.

    Returns:
        The string ``summarize_text`` produces for the best-matching
        document.
    """
    query_vector = vectorizer.transform([query])
    # One row of scores: similarity of the query to every matrix row.
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    # Only the top score is needed, so argmax replaces a full sort.
    best_position = int(similarities[0].argmax())
    # best_position is a row *position* in tfidf_matrix, not an index
    # label. When ``data`` has had rows removed (e.g. by dropna) its
    # labels have gaps, so a label lookup with .loc would return the
    # wrong row or raise KeyError. .iloc selects by position instead.
    relevant_information = data['information'].iloc[best_position]
    return summarize_text(relevant_information)

# Sample market-research questions used to exercise the retrieval and
# summarization pipeline.
example_queries = [
    "Identify the industry in which Canoo operates, along with its size, growth rate, trends, and key players.",
    "Analyze Canoo's main competitors, including their market share, products or services offered, pricing strategies, and marketing efforts.",
    "Identify key trends in the market, including changes in consumer behavior, technological advancements, and shifts in the competitive landscape.",
    "Gather information on Canoo's financial performance, including its revenue, profit margins, return on investment, and expense structure.",
]

# Run each query through the pipeline and print the query, its summary,
# and a 50-dash separator line.
for query in example_queries:
    summarized_result = perform_query(query, tfidf_matrix, vectorizer, data)
    print(f"Query: {query}")
    print(f"Summarized Result: {summarized_result}")
    print("-" * 50)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Query: Identify the industry in which Canoo operates, along with its size, growth rate, trends, and key players.
Summarized Result: Publisher: AnythingResearch                  Market Size                 Growth Rate                 5-Year Market Forecast                 Average Company Size & Growth                 Salary & Compensation Benchmarks
--------------------------------------------------
Query: Analyze Canoo's main competitors, including their market share, products or services offered, pricing strategies, and marketing efforts.
Summarized Result: Further reading: Building a profile for each of your main competitors allows you to benchmark important details, including market share and audience demographics. So, it makes more sense that he sells his products in Sephora stores rather than affordable retail chains like Walmart or CVS.
--------------------------------------------------
Query: Identify key trends in the market, including changes in consumer behavior, technologica