# Mounting to Google Drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# Make sure your data lives inside this directory
base_path = "/content/drive/MyDrive/auto_analyst_data"


Mounted at /content/drive


## Downloading a few requirements

In [1]:
!pip install requests beautifulsoup4 pandas



# Cloning into repository

In [2]:
!git clone https://github.com/RahulPatnaik/auto-analyst.git
%cd auto-analyst

Cloning into 'auto-analyst'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), 4.71 KiB | 4.71 MiB/s, done.
/content/auto-analyst


# Scraping from WIKIPEDIA


In [3]:
import requests
from bs4 import BeautifulSoup
import json
import os

def get_wikipedia_summary(company, save_dir):
    url = f"https://en.wikipedia.org/wiki/{company.replace(' ', '_')}"
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'html.parser')

    paragraphs = soup.select('p')
    summary = ''
    for p in paragraphs:
        if p.text.strip():
            summary = p.text.strip()
            break

    data = {
        "company": company,
        "source": url,
        "summary": summary
    }

    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, f"{company}.json"), 'w') as f:
        json.dump(data, f, indent=2)

    print(f"[✓] Saved summary for {company}")

# Example usage:
# get_wikipedia_summary("OpenAI", "/content/drive/MyDrive/auto_analyst_data/raw/wikipedia")

In [6]:

get_wikipedia_summary("OpenAI", "/content/drive/MyDrive/auto_analyst_data/raw/wikipedia")

[✓] Saved summary for OpenAI


In [7]:

get_wikipedia_summary("Meta", "/content/drive/MyDrive/auto_analyst_data/raw/wikipedia")

[✓] Saved summary for Meta


In [8]:

get_wikipedia_summary("Google", "/content/drive/MyDrive/auto_analyst_data/raw/wikipedia")

[✓] Saved summary for Google


 # Scraping from GOOGLENEWS

In [20]:
!pip install gnews

Collecting gnews
  Downloading gnews-0.4.1-py3-none-any.whl.metadata (19 kB)
Collecting feedparser~=6.0.2 (from gnews)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting dnspython (from gnews)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting sgmllib3k (from feedparser~=6.0.2->gnews)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading gnews-0.4.1-py3-none-any.whl (18 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=s

In [21]:
from gnews import GNews
import json, os

def get_news(company, save_dir):
    google_news = GNews(language='en', country='US', max_results=10)
    news = google_news.get_news(company)

    os.makedirs(save_dir, exist_ok=True)
    with open(os.path.join(save_dir, f"{company}.json"), 'w') as f:
        json.dump(news, f, indent=2)

    print(f"[✓] Saved news for {company}")

# Example:
# get_news("OpenAI", "/content/drive/MyDrive/auto_analyst_data/raw/news")


In [22]:
get_news("OpenAI", "/content/drive/MyDrive/auto_analyst_data/raw/news")

[✓] Saved news for OpenAI


In [23]:
get_news("Meta", "/content/drive/MyDrive/auto_analyst_data/raw/news")

[✓] Saved news for Meta


In [24]:
get_news("Google", "/content/drive/MyDrive/auto_analyst_data/raw/news")

[✓] Saved news for Google


# Scraping from reviews websites


In [25]:
import kagglehub

# Download Glassdoor Reviews dataset
path = kagglehub.dataset_download("davidgauthier/glassdoor-job-reviews")
print("✅ Dataset downloaded to:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/davidgauthier/glassdoor-job-reviews?dataset_version_number=11...


100%|██████████| 83.9M/83.9M [00:00<00:00, 155MB/s]

Extracting files...





✅ Dataset downloaded to: /root/.cache/kagglehub/datasets/davidgauthier/glassdoor-job-reviews/versions/11


In [26]:
import shutil
import os

drive_path = "/content/drive/MyDrive/auto_analyst_data/raw/reviews"

# Create directory if it doesn't exist
os.makedirs(drive_path, exist_ok=True)

# Copy all files from kagglehub to Drive
for file in os.listdir(path):
    full_src = os.path.join(path, file)
    full_dst = os.path.join(drive_path, file)
    shutil.copy(full_src, full_dst)

print("✅ Files copied to Google Drive:", drive_path)


✅ Files copied to Google Drive: /content/drive/MyDrive/auto_analyst_data/raw/reviews


In [30]:
df = pd.read_csv('/content/drive/MyDrive/auto_analyst_data/raw/reviews/glassdoor_reviews.csv')

In [31]:
df.head()

Unnamed: 0,firm,date_review,job_title,current,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook,headline,pros,cons
0,AFH-Wealth-Management,2015-04-05,,Current Employee,,2,4.0,3.0,,2.0,3.0,3.0,x,o,r,"Young colleagues, poor micro management",Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication."
1,AFH-Wealth-Management,2015-12-11,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",2,3.0,1.0,,2.0,1.0,4.0,x,o,r,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...
2,AFH-Wealth-Management,2016-01-28,Office Administrator,"Current Employee, less than 1 year","Bromsgrove, England, England",1,1.0,1.0,,1.0,1.0,1.0,x,o,x,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very..."
3,AFH-Wealth-Management,2016-04-16,,Current Employee,,5,2.0,3.0,,2.0,2.0,3.0,x,o,r,Over promised under delivered,Nice staff to work with,No career progression and salary is poor
4,AFH-Wealth-Management,2016-04-23,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",1,2.0,1.0,,2.0,1.0,1.0,x,o,x,client reporting admin,"Easy to get the job, Nice colleagues.","Abysmal pay, around minimum wage. No actual tr..."


In [37]:
unique_firms = df['firm'].dropna().unique()
print(f"Total unique firms: {len(unique_firms)}")
print(unique_firms[:20])  # Show first 20 firm names


Total unique firms: 428
['AFH-Wealth-Management' 'AJ-Bell' 'ALDI' 'AQA' 'ASDA' 'ASOS' 'AXA-UK'
 'Abcam' 'Abertawe-Bro-Morgannwg-University-Health-Board' 'Accenture'
 'Accor' 'Achieving-for-Children' 'ActionCOACH' 'Active-Care-Group'
 'Adecco' 'Age-UK-The-National-Charity' 'AlixPartners' 'American-Express'
 'Amey' 'Angard-Staffing']


In [32]:
import pandas as pd
import json
import os

def extract_reviews(csv_path, company_name, save_to_dir):
    df = pd.read_csv(csv_path)

    # Filter by firm name (not "Company Name" like before)
    matches = df[df['firm'].str.contains(company_name, case=False, na=False)]

    if matches.empty:
        print("❌ No reviews found for", company_name)
        return

    # Drop rows without 'pros' or 'cons'
    reviews = matches[['job_title', 'pros', 'cons']].dropna()

    # Merge pros + cons into single content
    review_data = [
        {
            "title": row['job_title'] if pd.notna(row['job_title']) else "N/A",
            "content": f"Pros: {row['pros']} | Cons: {row['cons']}"
        }
        for _, row in reviews.iterrows()
    ]

    os.makedirs(save_to_dir, exist_ok=True)
    out_path = os.path.join(save_to_dir, f"{company_name}.json")

    with open(out_path, "w") as f:
        json.dump(review_data, f, indent=2)

    print(f"[✓] Extracted {len(review_data)} reviews for '{company_name}' and saved to {out_path}")


In [33]:
extract_reviews(
    "/content/drive/MyDrive/auto_analyst_data/raw/reviews/glassdoor_reviews.csv",
    "Google",
    "/content/drive/MyDrive/auto_analyst_data/parsed/reviews"
)


[✓] Extracted 15995 reviews for 'Google' and saved to /content/drive/MyDrive/auto_analyst_data/parsed/reviews/Google.json


In [34]:
extract_reviews(
    "/content/drive/MyDrive/auto_analyst_data/raw/reviews/glassdoor_reviews.csv",
    "OpenAI",
    "/content/drive/MyDrive/auto_analyst_data/parsed/reviews"
)


❌ No reviews found for OpenAI


In [36]:
extract_reviews(
    "/content/drive/MyDrive/auto_analyst_data/raw/reviews/glassdoor_reviews.csv",
    "meta",
    "/content/drive/MyDrive/auto_analyst_data/parsed/reviews"
)


❌ No reviews found for meta


# Show INFO for any company already scraped

In [38]:
import json
import os

base_path = "/content/drive/MyDrive/auto_analyst_data"

def load_json(path):
    with open(path) as f:
        return json.load(f)

def show_company(company):
    print(f"\n📦 {company}\n")

    # Wikipedia
    wiki_path = os.path.join(base_path, "raw/wikipedia", f"{company}.json")
    if os.path.exists(wiki_path):
        wiki = load_json(wiki_path)
        print("📝 Wikipedia Summary:")
        print(wiki.get("summary", "No summary found."), "\n")

    # News
    news_path = os.path.join(base_path, "raw/news", f"{company}.json")
    if os.path.exists(news_path):
        news = load_json(news_path)
        print("🗞️ Recent News Headlines:")
        for article in news[:3]:  # show top 3 headlines
            print(f"- {article.get('title', 'No Title')}")
            print(f"  {article.get('snippet', '')}")
            print(f"  ↪ {article.get('url', '')}\n")

    # Reviews
    review_path = os.path.join(base_path, "parsed/reviews", f"{company}.json")
    if os.path.exists(review_path):
        reviews = load_json(review_path)
        print("🧠 Real User Reviews:")
        for r in reviews[:3]:  # show top 3 reviews
            print(f"- {r.get('title', 'No Title')}: {r.get('content', 'No Content')}")
    else:
        print("⚠️ No reviews found.")



In [40]:
show_company("OpenAI")


📦 OpenAI

📝 Wikipedia Summary:
OpenAI, Inc. is an American artificial intelligence (AI) research organization founded in December 2015 and headquartered in San Francisco, California. It aims to develop "safe and beneficial" artificial general intelligence (AGI), which it defines as "highly autonomous systems that outperform humans at most economically valuable work".[5] As a leading organization in the ongoing AI boom,[6] OpenAI is known for the GPT family of large language models, the DALL-E series of text-to-image models, and a text-to-video model named Sora.[7][8] Its release of ChatGPT in November 2022 has been credited with catalyzing widespread interest in generative AI. 

🗞️ Recent News Headlines:
- Exclusive: Alphabet, Nvidia invest in OpenAI co-founder Sutskever's SSI, source says - Reuters
  
  ↪ https://news.google.com/rss/articles/CBMi1AFBVV95cUxQbmdtSlVoMDF4Q2J1N2wySk9pUHdTTVhVSkdxZW5CV0YxZUdWN0tFb1MxSjBPQWU5eFZZb3VjZGU0YldkcEVOZGdTWUJqRnhwSnhUZmtoOGZSd2Q5VjNXWXdQXzNqclBq

In [39]:
show_company("Google")


📦 Google

📝 Wikipedia Summary:
Google LLC (/ˈɡuːɡəl/ ⓘ, GOO-gəl) is an American multinational corporation and technology company focusing on online advertising, search engine technology, cloud computing, computer software, quantum computing, e-commerce, consumer electronics, and artificial intelligence (AI).[9] It has been referred to as "the most powerful company in the world" by the BBC[10] and is one of the world's most valuable brands due to its market dominance, data collection, and technological advantages in the field of AI.[11][12][13] Alongside Amazon, Apple, Meta, and Microsoft, Google's parent company, Alphabet Inc. is one of the five Big Tech companies. 

🗞️ Recent News Headlines:
- Google lays off hundreds of employees in Android, Pixel teams, The Information reports - Reuters
  
  ↪ https://news.google.com/rss/articles/CBMivgFBVV95cUxQNzJhZWpCcl9Rd1VRQi1zbl9BTVJWZTE3dEtZcDdOV0ZfZ1BkMC1sZDVNZV9ub3FrUGU1aUl3cXRtMzY5REhaSWlhLVhWcFJscDZQalBidllEUzdlZnVCcEc1SzhacTNyTWJxWmszTm

In [41]:
show_company("AFH-Wealth-Management")


📦 AFH-Wealth-Management

⚠️ No reviews found.
