In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

In [None]:

!pip install langchain langchain_openai openai tiktoken chromadb pandas nltk bs4 requests python-dotenv

# Import necessary libraries
import os
import requests
import pandas as pd
import nltk
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import numpy as np
from datetime import datetime

# LangChain and OpenAI imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Download NLTK resources for text preprocessing
nltk.download('punkt')

# Load environment variables (API keys) from a local .env file, if one exists
load_dotenv()

# Set OpenAI API key  #MY_new_OPENAI_key
# Do not hardcode the secret here and do not overwrite a key that the
# environment / .env file already supplied: prompt for it only when missing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

Collecting langchain_openai
  Downloading langchain_openai-0.3.12-py3-none-any.whl.metadata (2.3 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.23.0-py2.py3-none-any.whl.met

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# 1. Data Collection

def scrape_crypto_articles(num_articles=10):
    """
    Return a DataFrame of built-in sample cryptocurrency news articles.

    Despite the name, nothing is scraped: the rows come from a hard-coded list
    and serve as offline demo / fallback data.
    In a production system, you'd use more reliable sources with proper API access.

    Args:
        num_articles: Maximum number of sample articles to return. ``None``
            returns every sample; values larger than the number of samples
            return all of them; values <= 0 return an empty DataFrame.

    Returns:
        pandas.DataFrame with one row per article and the columns
        title, content, date, source, url, tickers (a list of symbols).
    """
    print("Fetching cryptocurrency articles...")

    sample_articles = [
        {
            "title": "Bitcoin Hits New All-Time High Amid Institutional Adoption",
            "content": "Bitcoin has reached a new all-time high as institutional investors continue to adopt the cryptocurrency. Major corporations have added Bitcoin to their balance sheets, viewing it as a hedge against inflation. The cryptocurrency has shown significant volatility in the past, but has demonstrated strong recovery patterns. Analysts suggest this trend could continue as mainstream adoption increases.",
            "date": "2025-03-15",
            "source": "CryptoNews",
            "url": "https://example.com/bitcoin-ath",
            "tickers": ["BTC"]
        },
        {
            "title": "Ethereum's Shift to Proof-of-Stake Reduces Energy Consumption by 99%",
            "content": "Ethereum has successfully completed its transition to a Proof-of-Stake consensus mechanism, reducing its energy consumption by approximately 99%. This eco-friendly shift has attracted more environmentally conscious investors. The upgrade also introduces enhanced scalability features and reduced gas fees for transactions, making the network more accessible for smaller investors and developers. These improvements could position Ethereum for long-term growth in the smart contract platform market.",
            "date": "2025-03-10",
            "source": "BlockchainInsider",
            "url": "https://example.com/ethereum-pos",
            "tickers": ["ETH"]
        },
        {
            "title": "Cardano Launches New DeFi Platform, ADA Price Surges",
            "content": "Cardano has launched a new decentralized finance platform, causing its native token ADA to surge in price. The platform aims to provide financial services to unbanked populations. Cardano's methodical, research-driven approach to development has been criticized for slow progress but praised for security and reliability. Its focus on academic rigor and formal verification methods makes it attractive for risk-averse investors looking for sustainable growth rather than quick gains.",
            "date": "2025-03-12",
            "source": "DeFiDaily",
            "url": "https://example.com/cardano-defi",
            "tickers": ["ADA"]
        },
        {
            "title": "Solana Network Experiences Growth in NFT Market Share",
            "content": "Solana's blockchain has been gaining significant traction in the NFT marketplace, challenging Ethereum's dominance. The network's high throughput and low transaction fees have made it attractive to NFT creators and collectors. Despite past network outages affecting confidence, recent protocol upgrades have improved stability. Solana's ecosystem continues to expand with new projects and applications, potentially offering high growth opportunities alongside higher technical risks.",
            "date": "2025-03-16",
            "source": "NFTWorld",
            "url": "https://example.com/solana-nfts",
            "tickers": ["SOL"]
        },
        {
            "title": "Ripple Wins Regulatory Clarity in Major Markets",
            "content": "Ripple has secured regulatory clarity in several major markets, boosting confidence in XRP. The company continues to expand its cross-border payment solutions with financial institutions globally. Regulatory developments have had significant impact on XRP's price volatility. Despite legal challenges in some jurisdictions, Ripple's underlying technology for international transfers remains compelling for financial institutions seeking efficiency improvements.",
            "date": "2025-03-05",
            "source": "CryptoRegulation",
            "url": "https://example.com/ripple-regulations",
            "tickers": ["XRP"]
        },
        {
            "title": "Binance Coin Grows as Exchange Expands Services",
            "content": "Binance Coin (BNB) has seen steady growth as the Binance exchange continues to expand its service offerings. The token's utility within the Binance ecosystem provides it with practical use cases beyond speculation. Binance's regular token burns reduce supply, potentially supporting price appreciation. However, regulatory scrutiny of centralized exchanges presents ongoing compliance challenges and risks for the associated token.",
            "date": "2025-03-20",
            "source": "ExchangeNews",
            "url": "https://example.com/bnb-growth",
            "tickers": ["BNB"]
        },
        {
            "title": "Polkadot Parachains Show Promise for Interoperability Solutions",
            "content": "Polkadot's parachain ecosystem is demonstrating promising results for blockchain interoperability. The network's ability to connect different blockchains could solve major fragmentation issues in the crypto space. Polkadot's unique architecture allows specialized blockchains to operate with shared security, creating opportunities for niche applications with cross-chain capabilities. This technical approach could provide significant long-term value if wider blockchain adoption continues across industries.",
            "date": "2025-03-08",
            "source": "InteropTech",
            "url": "https://example.com/polkadot-parachains",
            "tickers": ["DOT"]
        },
        {
            "title": "Algorand Partners with Central Banks for CBDC Development",
            "content": "Algorand has announced partnerships with multiple central banks for central bank digital currency (CBDC) development projects. These high-profile collaborations highlight the platform's institutional-grade capabilities. Algorand's pure proof-of-stake protocol offers deterministic finality and carbon-negative operations, making it suitable for regulated financial applications. Such partnerships could establish Algorand as a key player in the institutional blockchain space, though retail adoption remains more limited than some competitors.",
            "date": "2025-03-18",
            "source": "CBDCInsights",
            "url": "https://example.com/algorand-cbdc",
            "tickers": ["ALGO"]
        },
        {
            "title": "Chainlink Expands Data Oracle Services Beyond Crypto",
            "content": "Chainlink has expanded its oracle services beyond the cryptocurrency sector, now providing data feeds to traditional financial markets. This broadening use case strengthens Chainlink's position as the leading decentralized oracle network. The project's focus on reliable data provision has made it essential infrastructure for many DeFi applications. As more industries require tamper-proof data for smart contracts, Chainlink's utility and potential market continue to grow, offering a unique investment profile tied to the broader blockchain ecosystem rather than any single platform.",
            "date": "2025-03-14",
            "source": "DeFiInsider",
            "url": "https://example.com/chainlink-expansion",
            "tickers": ["LINK"]
        },
        {
            "title": "Avalanche Subnet Technology Enables Custom Blockchain Deployment",
            "content": "Avalanche's subnet technology is enabling enterprises to deploy custom blockchains tailored to specific requirements. This flexibility has attracted projects from gaming to finance seeking scalable blockchain solutions. Avalanche's architecture balances decentralization with high performance, processing thousands of transactions per second with sub-second finality. The platform's ability to host application-specific blockchains positions it as a versatile foundation for next-generation Web3 services, potentially capturing market share across multiple verticals simultaneously.",
            "date": "2025-03-09",
            "source": "EnterpriseChain",
            "url": "https://example.com/avalanche-subnets",
            "tickers": ["AVAX"]
        }
    ]

    # Honor the requested article count (previously the argument was ignored).
    # max(..., 0) keeps a negative count from turning into a "drop from the end" slice.
    if num_articles is not None:
        sample_articles = sample_articles[:max(num_articles, 0)]

    # Create a DataFrame for easier manipulation
    df = pd.DataFrame(sample_articles)
    print(f"Collected {len(df)} articles")
    return df

# Option 1: Using News APIs


In [None]:
!pip install requests beautifulsoup4 feedparser

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=9caf123eb75539ed504b108a0858165aafd7a0c82a49bac9bd24f3f8a61c45c3
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [None]:
def scrape_crypto_articles_api(num_articles=10):
    """Fetch recent cryptocurrency news from NewsAPI.org as a DataFrame.

    The API key is read from the NEWS_API_KEY environment variable. Whenever
    the request fails, the API reports an error, or no articles come back,
    the function falls back to the built-in sample data from
    ``scrape_crypto_articles`` so the rest of the pipeline can still run.

    Args:
        num_articles: Number of articles to request (sent as ``pageSize``)
            and the size passed to the sample-data fallback.

    Returns:
        pandas.DataFrame with the columns title, content, date, source, url
        and tickers (a list of symbols, ``["CRYPTO"]`` when none is detected).
    """
    import os
    import re
    import requests
    from datetime import datetime, timedelta

    # Option 1: NewsAPI.org
    # Sign up at https://newsapi.org for an API key (free tier available)
    api_key = os.getenv("NEWS_API_KEY", "")

    # Set parameters
    days_back = 7
    request_timeout_seconds = 10  # never let a stalled connection hang the notebook
    from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

    url = "https://newsapi.org/v2/everything"
    params = {
        "q": "cryptocurrency OR bitcoin OR ethereum OR blockchain",
        "language": "en",
        "sortBy": "publishedAt",
        "from": from_date,
        "apiKey": api_key,
        "pageSize": num_articles
    }

    # Ticker symbol -> project name used to recognise a coin in headline text.
    ticker_names = {
        "BTC": "bitcoin",
        "ETH": "ethereum",
        "ADA": "cardano",
        "SOL": "solana",
        "XRP": "ripple",
        "BNB": "binance coin",
        "DOT": "polkadot",
        "AVAX": "avalanche",
        "LINK": "chainlink",
        "ALGO": "algorand",
    }

    def _find_tickers(text):
        """Return the symbols mentioned in ``text``, in ``ticker_names`` order."""
        found = []
        for symbol, name in ticker_names.items():
            # Symbols are matched case-sensitively and as whole words so that
            # ordinary words ("solution", "link", "dot") are not tagged;
            # project names are matched case-insensitively.
            symbol_hit = re.search(rf"\b{re.escape(symbol)}\b", text)
            name_hit = re.search(rf"\b{re.escape(name)}\b", text, re.IGNORECASE)
            if symbol_hit or name_hit:
                found.append(symbol)
        return found

    print(f"Fetching cryptocurrency news from the past {days_back} days...")

    try:
        response = requests.get(url, params=params, timeout=request_timeout_seconds)
        data = response.json()

        if response.status_code == 200 and data.get("status") == "ok":
            articles = []

            for article in data.get("articles") or []:
                # Fields can be present with a JSON null value, so
                # ``.get(key, "")`` is not enough: coerce None to "".
                title = article.get("title") or ""
                description = article.get("description") or ""
                body = article.get("content") or ""

                # Extract cryptocurrency tickers mentioned in the title/description;
                # default to general crypto if no specific ticker found
                tickers = _find_tickers(f"{title} {description}") or ["CRYPTO"]

                articles.append({
                    "title": title,
                    "content": f"{description} {body}".strip(),
                    "date": (article.get("publishedAt") or "")[:10],
                    "source": (article.get("source") or {}).get("name") or "",
                    "url": article.get("url") or "",
                    "tickers": tickers
                })

            if not articles:
                # An empty frame would break the later embedding step, so use
                # the sample data instead.
                print("News API returned no articles")
                return scrape_crypto_articles(num_articles)

            # Create DataFrame
            df = pd.DataFrame(articles)
            print(f"Collected {len(df)} articles from News API")
            return df
        else:
            print(f"Error fetching news: {data.get('message', 'Unknown error')}")
            # Fall back to sample data
            return scrape_crypto_articles(num_articles)

    except Exception as e:
        # Deliberately broad: any network / decoding problem degrades to the
        # sample data rather than aborting the demo. The cause is printed.
        print(f"Exception when fetching news: {e}")
        # Fall back to sample data
        return scrape_crypto_articles(num_articles)

# Option 2: Direct Web Scraping (not implemented in this notebook)

In [None]:
# 2. Document Processing

def process_articles_to_documents(articles_df):
    """Wrap every row of the articles DataFrame in a LangChain Document.

    Each row must provide title, content, date, source, url and tickers
    (an iterable of strings). The title and content form the page text;
    the other fields are stored as metadata, with the tickers joined into
    one comma-separated string.

    Returns:
        list of Document objects, one per row, in row order.
    """
    def _to_document(article):
        # Metadata travels with every chunk to give retrieval more context.
        details = {
            "title": article["title"],
            "date": article["date"],
            "source": article["source"],
            "url": article["url"],
            "tickers": ",".join(article["tickers"])
        }
        text = f"Title: {article['title']}\n\nContent: {article['content']}"
        return Document(page_content=text, metadata=details)

    documents = [_to_document(article) for _, article in articles_df.iterrows()]

    print(f"Processed {len(documents)} documents with metadata")
    return documents

# Function to split documents into chunks for embedding
def split_documents(documents):
    """Break documents into smaller chunks suitable for embedding.

    Uses a RecursiveCharacterTextSplitter configured with chunk_size=500 and
    chunk_overlap=50. The separators are listed from coarsest to finest:
    blank line, newline, sentence end, space, then the empty string.

    Returns:
        The chunks produced by the splitter's ``split_documents`` call.
    """
    splitter_settings = {
        "chunk_size": 500,
        "chunk_overlap": 50,
        "separators": ["\n\n", "\n", ". ", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_settings)

    pieces = splitter.split_documents(documents)
    print(f"Split documents into {len(pieces)} chunks")
    return pieces


In [None]:
# 3. Vector Store Creation

def create_vector_store(chunks):
    """Create a Chroma vector store from document chunks using OpenAI embeddings.

    Args:
        chunks: Non-empty list of LangChain Document chunks to embed and index.

    Returns:
        The Chroma vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty (there would be nothing to retrieve).
    """
    from uuid import uuid4

    if not chunks:
        raise ValueError("Cannot create a vector store from an empty list of chunks")

    # Initialize OpenAI embeddings
    embeddings = OpenAIEmbeddings()

    # Create vector store - using Chroma for in-memory storage
    # In production, you might want to use Supabase, Pinecone, or another persistent vector store
    #
    # A fresh collection name per call keeps each run isolated.
    # NOTE(review): without it, in-memory Chroma appears to reuse one default
    # collection inside a process, so re-running the pipeline in the same
    # kernel would stack duplicate chunks - confirm for the installed version.
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=f"crypto_articles_{uuid4().hex}"
    )

    print("Vector store created successfully")
    return vector_store

In [None]:
# 4. User Profile Processing

def generate_search_queries(risk_profile):
    """
    Map a risk profile label to the list of retrieval queries used for it.

    The label is compared case-insensitively. "conservative"/"low",
    "moderate"/"medium" and "aggressive"/"high" each select their own set of
    four queries; any other label gets a general-purpose set.
    """
    conservative_queries = [
        "stable cryptocurrency with low volatility",
        "established cryptocurrency with institutional adoption",
        "cryptocurrency with strong regulatory compliance",
        "blue-chip cryptocurrency long-term investment"
    ]
    moderate_queries = [
        "balanced risk-reward cryptocurrency",
        "cryptocurrency with growing adoption",
        "established altcoins with utility",
        "mid-cap cryptocurrency with potential"
    ]
    aggressive_queries = [
        "high potential growth cryptocurrency",
        "emerging cryptocurrency projects",
        "innovative blockchain technology",
        "new cryptocurrency with unique features"
    ]
    # Used when the label matches none of the known profiles
    general_queries = [
        "reliable cryptocurrency investment",
        "cryptocurrency market trends",
        "promising cryptocurrency projects",
        "cryptocurrency adoption and growth"
    ]

    queries_by_label = {
        "conservative": conservative_queries,
        "low": conservative_queries,
        "moderate": moderate_queries,
        "medium": moderate_queries,
        "aggressive": aggressive_queries,
        "high": aggressive_queries,
    }
    queries = queries_by_label.get(risk_profile.lower(), general_queries)

    print(f"Generated {len(queries)} search queries for {risk_profile} risk profile")
    return queries


In [None]:
# 5. Retrieval System

def retrieve_relevant_chunks(vector_store, queries, top_k=3):
    """
    Run every query against the vector store and merge the hits.

    Each query asks ``vector_store.similarity_search`` for ``top_k`` results.
    Hits whose ``page_content`` was already seen are dropped, keeping the
    first occurrence, so the returned list preserves retrieval order.
    """
    # Gather the hits of all queries, in query order
    hits = [
        doc
        for query in queries
        for doc in vector_store.similarity_search(query, k=top_k)
    ]

    # A dict keeps insertion order; setdefault keeps the first doc per text
    first_by_content = {}
    for doc in hits:
        first_by_content.setdefault(doc.page_content, doc)
    unique_docs = list(first_by_content.values())

    print(f"Retrieved {len(unique_docs)} unique relevant chunks")
    return unique_docs


In [None]:
# 6. LLM Response Generation

def create_rag_chain(risk_profile, model="gpt-3.5-turbo", temperature=0.2):
    """Create a RAG chain that turns retrieved chunks into investment advice.

    Args:
        risk_profile: Label of the user's risk profile; inserted into the prompt.
        model: Name of the OpenAI chat model to use.
        temperature: Sampling temperature passed to the chat model.

    Returns:
        A runnable chain. Invoke it with the retrieved chunks (a list of
        Document objects, or an already formatted string); it returns the
        model's answer as a string.
    """

    # Initialize LLM
    llm = ChatOpenAI(model=model, temperature=temperature)

    # Create the prompt template
    prompt = ChatPromptTemplate.from_template("""
    You are a cryptocurrency investment advisor specializing in providing personalized advice based on current market information.

    User Risk Profile: {risk_profile}
    Current Date: {current_date}

    Based on the following recent cryptocurrency articles and information, provide investment recommendations tailored to the user's risk profile:

    {context}

    Instructions:
    1. Analyze the provided information and identify cryptocurrencies that match the user's {risk_profile} risk profile.
    2. Provide 2-4 specific cryptocurrency recommendations with clear reasoning.
    3. For each recommendation, include:
       - Name and ticker symbol
       - Why it fits the user's risk profile
       - Key strengths and potential risks
       - Suggested allocation percentage (approximate)
    4. Add a brief market overview based on the recent articles.
    5. Include appropriate disclaimers about cryptocurrency investments.

    Your recommendations should be balanced, evidence-based, and clearly connected to the user's risk tolerance.
    """)

    def _format_context(docs):
        """Render retrieved chunks as readable text for the prompt.

        Passing the list through unchanged would put the Python repr of the
        Document list into the prompt; here each chunk becomes its text,
        preceded by a short header built from its metadata when available.
        """
        if isinstance(docs, str):
            return docs

        sections = []
        for doc in docs:
            meta = getattr(doc, "metadata", None) or {}
            header = " | ".join(
                str(meta[key]) for key in ("title", "source", "date") if meta.get(key)
            )
            body = getattr(doc, "page_content", str(doc))
            sections.append(f"[{header}]\n{body}" if header else body)
        return "\n\n---\n\n".join(sections)

    # Create the RAG chain
    # Every value in the mapping receives the chain input (the retrieved
    # chunks); risk profile and date ignore it and supply fixed values.
    chain = (
        {"context": _format_context, "risk_profile": lambda _: risk_profile, "current_date": lambda _: datetime.now().strftime("%Y-%m-%d")}
        | prompt
        | llm
        | StrOutputParser()
    )

    return chain

In [None]:
# 7. Main RAG System Function
# ---------------------------

def crypto_advisor_rag(risk_profile):
    """
    Run the whole advisor pipeline for one risk profile and return the advice.

    Stages, in order:
    1. Fetch articles, wrap them as documents and split them into chunks
    2. Index the chunks in a vector store
    3. Derive search queries from the risk profile
    4. Retrieve the chunks relevant to those queries
    5. Ask the LLM chain for personalized investment advice
    """
    print(f"Starting cryptocurrency advisor for {risk_profile} risk profile...")

    # Ingestion: each stage consumes the previous stage's output
    # (articles DataFrame -> documents -> chunks -> vector store)
    artifact = scrape_crypto_articles_api()
    for stage in (process_articles_to_documents, split_documents, create_vector_store):
        artifact = stage(artifact)
    vector_store = artifact

    # Retrieval driven by the risk profile
    search_queries = generate_search_queries(risk_profile)
    context_chunks = retrieve_relevant_chunks(vector_store, search_queries)

    # Generation
    advice = create_rag_chain(risk_profile).invoke(context_chunks)

    print("Investment advice generated successfully!")
    return advice


In [None]:
# 8. User Interface Function
# --------------------------

def get_investment_advice():
    """
    Console front end: ask for a risk tolerance, then print the advice.

    The user picks 1, 2 or 3; any other entry is treated as "Moderate".
    In a real application, this would be a web or mobile interface.
    """
    divider = "=" * 50

    print("Welcome to the Cryptocurrency Investment Advisor!")
    print(divider)

    # Show the menu of risk profiles
    print("\nPlease specify your risk tolerance:")
    for menu_line in (
        "1: Conservative (Low risk)",
        "2: Moderate (Medium risk)",
        "3: Aggressive (High risk)",
    ):
        print(menu_line)

    risk_choice = input("\nEnter your choice (1-3): ")

    # Menu key -> profile label; unknown keys fall back to "Moderate"
    profiles_by_choice = dict(zip("123", ("Conservative", "Moderate", "Aggressive")))
    risk_profile = profiles_by_choice.get(risk_choice, "Moderate")

    print(f"\nGenerating investment advice for {risk_profile} risk profile...")
    print("This may take a moment while we analyze the latest cryptocurrency information.")

    # Run the full pipeline
    advice = crypto_advisor_rag(risk_profile)

    print(f"\n{divider}")
    print("Your Personalized Cryptocurrency Investment Advice:")
    print(divider)
    print(advice)


In [None]:
# Demo run: generate advice for a "Moderate" profile and print it under a banner.
sample_advice = crypto_advisor_rag("Moderate")
print(
    "\nSample Investment Advice for Moderate Risk Profile:",
    "=" * 60,
    sample_advice,
    sep="\n",
)



Starting cryptocurrency advisor for Moderate risk profile...
Fetching cryptocurrency news from the past 7 days...
Exception when fetching news: can only concatenate str (not "NoneType") to str
Fetching cryptocurrency articles...
Collected 10 articles
Processed 10 documents with metadata
Split documents into 23 chunks
Vector store created successfully
Generated 4 search queries for Moderate risk profile
Retrieved 7 unique relevant chunks
Investment advice generated successfully!

Sample Investment Advice for Moderate Risk Profile:
### Market Overview:
Based on recent articles, the cryptocurrency market is experiencing significant developments in terms of institutional adoption, interoperability solutions, and partnerships with central banks. Bitcoin (BTC) has reached new all-time highs driven by institutional interest, while projects like Polkadot (DOT), Algorand (ALGO), and Chainlink (LINK) are making strides in their respective niches within the blockchain ecosystem.

### Cryptocurren