In [None]:
# 📦 Install required packages
!pip install newsapi-python nltk spacy rake-nltk wordcloud supabase
!pip install newsapi-python nltk spacy rake-nltk wordcloud supabase fuzzywuzzy
!pip install gnews
!pip install requests
# ✅ Download NLTK and spaCy resources
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
!python -m spacy download en_core_web_sm

# 🔧 Imports
import re
import pandas as pd
import spacy
from datetime import datetime, timedelta
from newsapi import NewsApiClient
from gnews import GNews
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from rake_nltk import Rake
from wordcloud import WordCloud
from fuzzywuzzy import fuzz
from supabase import create_client, Client
import requests

GNEWS_API_KEY = "7ae58b911c154d14aae1c9e779b46f58"

# 🔑 Your API Keys
#newsapi = NewsApiClient(api_key='21386679f7a84bdbbdb3487d926ecadb')
#news = GNews()

url = 'https://cakchguemrmvpizmqdka.supabase.co'
key = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNha2NoZ3VlbXJtdnBpem1xZGthIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDQ1NDE1NDYsImV4cCI6MjA2MDExNzU0Nn0.aT7Ssu9b--fW39VPhl4fwE2cxSvlw7teZnoFRix4qCE'
supabase: Client = create_client(url, key)

# 🏢 Companies to track
companies = [
    "Razorpay", "CRED", "Nykaa", "Zepto", "Freshworks",
    "Chargebee", "Rapido", "Porter", "Zinnov", "Fractal Analytics", "BlackRock"
]

# Time Range
from_date = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')
to_date = datetime.now().strftime('%Y-%m-%d')

#  NLP Setup
nlp = spacy.load("en_core_web_sm")
sia = SentimentIntensityAnalyzer()
rake = Rake()

# Clean Text Function
def clean_text(text):
    text = re.sub(r"http\S+", "", text)                     # Remove URLs
    text = re.sub(r"#", "", text)                           # Remove hashtags
    text = re.sub(r"[^A-Za-z0-9\s.,!?'-]", "", text)        # Remove unwanted characters
    text = re.sub(r"\s+", " ", text).strip()                # Remove extra whitespace
    return text

# Fetch + Analyze + Upload News
all_data = []



def fetch_gnews_articles(company, from_date, to_date, language='en', sort_by='relevance', max_results=10):
    url = f"https://gnews.io/api/v4/search"
    api_key = "7ae58b911c154d14aae1c9e779b46f58"
    params = {
        'q': company,
        'lang': language,
        'from': from_date,
        'to': to_date,
        'sortby': sort_by,
        'max': max_results,
        'token': api_key
    }

    response = requests.get(url, params=params)

    if response.status_code == 200:
        return response.json().get('articles', [])
    else:
        print(f"GNews API error: {response.status_code} - {response.text}")
        return []

for company in companies:
    print(f"Fetching news for: {company}")
    articles = fetch_gnews_articles(company, from_date, to_date)

    for article in articles:
        # Cleaned and combined text
        text = clean_text(article['title'] + " " + (article['description'] or ""))
        sentiment = sia.polarity_scores(text)['compound']
        rake.extract_keywords_from_text(text)
        keywords = rake.get_ranked_phrases()

        doc = nlp(text)
        entities = [ent.text for ent in doc.ents if ent.label_ in ['ORG', 'PRODUCT', 'GPE']]

        # Match company names
        text_lower = text.lower()
        keyword_matches = [comp for comp in companies if comp.lower() in text_lower]

        fuzzy_matches = []
        for ent in entities:
            for comp in companies:
                if fuzz.ratio(ent.lower(), comp.lower()) > 85:
                    fuzzy_matches.append(comp)

        all_company_mentions = list(set(keyword_matches + fuzzy_matches))

        # if company == "CRED" and "CRED" not in all_company_mentions:
        #     print(f"⏭ Skipping likely irrelevant CRED article: {article['title']}")
        #     continue


        # Duplicate check using URL
        existing = supabase.table("news_insights").select("id").eq("url", article['url']).execute()
        if existing.data:
            print(f"⏭ Skipping duplicate article for {company}")
            continue



        # Store in DataFrame
        all_data.append({
            "Company": company,
            "matched_companies": ", ".join(all_company_mentions),
            "title": article['title'],
            "published_at": article['publishedAt'],
            "source": article['source']['name'],
            "sentiment_score": sentiment,
            "top_keywords": ", ".join(keywords[:5]),
            "entities": ", ".join(entities),
            "url": article['url']
        })

        # Upload to Supabase
        supabase.table("news_insights").insert({
            "Company": company,
            "matched_companies": ", ".join(all_company_mentions),
            "title": article['title'],
            "published_at": article['publishedAt'],
            "source": article['source']['name'],
            "sentiment_score": sentiment,
            "top_keywords": ", ".join(keywords[:5]),
            "entities": ", ".join(entities),
            "url": article['url']
        }).execute()

# 🧾 Final DataFrame
df = pd.DataFrame(all_data)
if not df.empty and 'published_at' in df.columns:
    df['published_at'] = pd.to_datetime(df['published_at'], errors='coerce')
    df.sort_values(by='published_at', ascending=False, inplace=True)
    display(df.head(10))
else:
    print("No new articles were fetched or valid 'published_at' data not found.")

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Collecting supabase
  Downloading supabase-2.15.0-py3-none-any.whl.metadata (11 kB)
Collecting gotrue<3.0.0,>=2.11.0 (from supabase)
  Downloading gotrue-2.12.0-py3-none-any.whl.metadata (6.1 kB)
Collecting postgrest<1.1,>0.19 (from supabase)
  Downloading postgrest-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Collecting realtime<2.5.0,>=2.4.0 (from supabase)
  Downloading realtime-2.4.2-py3-none-any.whl.metadata (6.6 kB)
Collecting storage3<0.12,>=0.10 (from supabase)
  Downloading storage3-0.11.3-py3-none-any.whl.metadata (1.8 kB)
Collecting supafunc<0.10,>=0.9 (from supabase)
  Downloading supafunc-0.9.4-py3-none-any.whl.metadata (1.2 kB)
Collecting pytest-mock<4.0.0,>=3.14.0 (from gotrue<3.0.0,>=2.11.0->supabase)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting deprecation<

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




Fetching news for: Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
⏭ Skipping duplicate article for Razorpay
Fetching news for: CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
⏭ Skipping duplicate article for CRED
Fetching news for: Nykaa
⏭ Skipping duplicate article for Nykaa
⏭ Skipping duplicate article for Nykaa
⏭ Skipping duplicate article for Nykaa
⏭ Skip

In [None]:
from supabase import create_client, Client
import pandas as pd

# Your Supabase credentials
SUPABASE_URL = "https://cakchguemrmvpizmqdka.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNha2NoZ3VlbXJtdnBpem1xZGthIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDQ1NDE1NDYsImV4cCI6MjA2MDExNzU0Nn0.aT7Ssu9b--fW39VPhl4fwE2cxSvlw7teZnoFRix4qCE"
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

def fetch_data():
    response = supabase.table("news_insights").select("*").execute()
    data = response.data
    if data:
        df = pd.DataFrame(data)
        df["published_at"] = pd.to_datetime(df["published_at"])
        return df
    else:
        return pd.DataFrame()

In [59]:
# app.py
code = '''
import streamlit as st
import pandas as pd
from supabase import create_client, Client

# 🔐 Supabase Connection
url = "https://cakchguemrmvpizmqdka.supabase.co"
key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNha2NoZ3VlbXJtdnBpem1xZGthIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDQ1NDE1NDYsImV4cCI6MjA2MDExNzU0Nn0.aT7Ssu9b--fW39VPhl4fwE2cxSvlw7teZnoFRix4qCE"
supabase: Client = create_client(url, key)

# 📦 Fetch Data
@st.cache_data
def fetch_data():
    response = supabase.table("news_insights").select("*").execute()
    data = response.data
    df = pd.DataFrame(data)

    if not df.empty:
        df['published_at'] = pd.to_datetime(df['published_at'])
        df.sort_values(by='published_at', ascending=False, inplace=True)
    return df

df = fetch_data()

# 🧠 Title
st.title("🧠 AI-Driven Competitive Intelligence Tracker")

# 📊 Sidebar Filters
companies = df['Company'].unique().tolist()
selected_companies = st.sidebar.multiselect("Select Companies", companies, default=companies)

# 🔍 Filter Data
filtered_df = df[df["Company"].isin(selected_companies)]

# 📈 Trend Section
st.subheader("📈 Sentiment Trend Over Time (Coming Soon)")

# 📋 News Section
st.subheader("📰 Latest News")

for idx, row in filtered_df.iterrows():
    st.markdown(f"""
    #### {row['title']}
    - 🏢 **Company:** {row['Company']}
    - 📰 **Source:** {row['source']}
    - 📅 **Published At:** {row['published_at'].strftime('%Y-%m-%d %H:%M')}
    - 😊 **Sentiment Score:** `{row['sentiment_score']}`
    - 🧠 **Top Keywords:** {row['top_keywords']}
    - 🌐 [Read More]({row['url']})
    ---
    """)'''

In [60]:
with open("/content/temp_repo/app.py", "w") as file:
    file.write(code)

print("✅ Streamlit app.py saved successfully!")

✅ Streamlit app.py saved successfully!


In [None]:
from google.colab import files
files.download("app.py")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install GitPython



In [None]:
!rm -rf /content/temp_repo


In [36]:
from git import Repo
import os

# 🛠️ Setup repo
GITHUB_USERNAME = "Vashitva20"
GITHUB_REPO = "AI-news-dashboard"
GITHUB_TOKEN = "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"  # Replace this with your actual token
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# 📁 Create a temp directory
repo_dir = "/content/temp_repo"
os.makedirs(repo_dir, exist_ok=True)

# 📄 Save your Streamlit file
streamlit_code = '''
# your Streamlit code here
import streamlit as st
st.title("It works!")
'''
with open(f"{repo_dir}/app.py", "w") as f:
    f.write(streamlit_code)

# 🚀 Clone and push
repo = Repo.clone_from(REPO_URL, repo_dir)
repo.git.add(A=True)
repo.index.commit("Upload Streamlit app from Colab")
origin = repo.remote(name="origin")
origin.push()

GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://*****:*****@github.com/Vashitva20/AI-news-dashboard.git /content/temp_repo
  stderr: 'fatal: destination path '/content/temp_repo' already exists and is not an empty directory.
'

In [35]:
!streamlit run app.py &> logs.txt &

In [37]:

!pwd

/content/temp_repo


In [38]:

!ls -la

total 20
drwxr-xr-x 3 root root 4096 Apr 16 06:53 .
drwxr-xr-x 1 root root 4096 Apr 16 06:44 ..
-rw-r--r-- 1 root root   73 Apr 16 07:25 app.py
drwxr-xr-x 8 root root 4096 Apr 16 05:45 .git
-rw-r--r-- 1 root root  225 Apr 16 07:25 logs.txt


In [None]:

!streamlit run app.py &> logs.txt &

In [39]:

!find /content -name "app.py"

/content/app.py
/content/temp_repo/app.py


In [41]:

with open("app.py", "r") as file:
    print(file.read())


# your Streamlit code here
import streamlit as st
st.title("It works!")



In [42]:
!streamlit run app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8503[0m
[34m  External URL: [0m[1mhttp://34.138.242.133:8503[0m
[0m
[34m  Stopping...[0m
^C


In [43]:
Network URL: http://172.28.0.12:8501
External URL: http://34.138.242.133:8501

SyntaxError: invalid syntax (<ipython-input-43-8d9df4f05a29>, line 1)

In [45]:
!pip install ngrok

Collecting ngrok
  Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ngrok
Successfully installed ngrok-1.4.0


In [47]:

from pyngrok import ngrok

# Paste your token inside the quotes
ngrok.set_auth_token("2vndlz2NHLXGz6iLbGdqrlBzLE1_42Yj381M27RimuHuQHLy4")

In [48]:
# Install Streamlit and pyngrok
!pip install streamlit pyngrok -q

# Set your ngrok token here
from pyngrok import ngrok

# Replace with your actual token
ngrok.set_auth_token("2vndlz2NHLXGz6iLbGdqrlBzLE1_42Yj381M27RimuHuQHLy4")

In [49]:

# Run your Streamlit app and open a public URL
!streamlit run /content/app.py &

# Wait a bit and then open the tunnel
from pyngrok import ngrok
public_url = ngrok.connect(port=8501)
print(f"Your app is live at: {public_url}")


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.138.242.133:8501[0m
[0m
[34m  Stopping...[0m


KeyboardInterrupt: 

In [50]:

# Wait a few seconds to make sure Streamlit is running
import time
time.sleep(5)

# Open ngrok tunnel on port 8501 (Streamlit default)
from pyngrok import ngrok
public_url = ngrok.connect(8501)
print(f"✅ Your public Streamlit app is live at:\n👉 {public_url}")

✅ Your public Streamlit app is live at:
👉 NgrokTunnel: "https://cfcd-34-138-242-133.ngrok-free.app" -> "http://localhost:8501"


In [51]:

with open("requirements.txt", "w") as f:
    f.write("""streamlit
pandas
supabase
""")

In [61]:

!cp requirements.txt /content/temp_repo/
%cd /content/temp_repo/
!git add requirements.txt
!git commit -m "Add requirements.txt"
!git push origin main

cp: 'requirements.txt' and '/content/temp_repo/requirements.txt' are the same file
/content/temp_repo
On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   app.py[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31mlogs.txt[m

no changes added to commit (use "git add" and/or "git commit -a")
Everything up-to-date


In [62]:

%cd /content/temp_repo

!git config --global user.email "vashitvarajofficial@gmail.com"
!git config --global user.name "Vashitva20"

!git add app.py
!git commit -m "Updated full Streamlit dashboard"
!git push origin main

/content/temp_repo
[main 39f980d] Updated full Streamlit dashboard
 1 file changed, 1 insertion(+), 1 deletion(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 329 bytes | 329.00 KiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/Vashitva20/AI-news-dashboard.git
   fceedf7..39f980d  main -> main


In [53]:

with open("app.py", "r") as file:
    lines = file.readlines()

# Remove the first line
with open("app.py", "w") as file:
    file.writelines(lines[1:])