In [1]:
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [2]:
!pip install -q ollama

In [3]:
import subprocess
subprocess.Popen("ollama serve", shell = True)

<Popen: returncode: None args: 'ollama serve'>

In [4]:
#model_name = 'deepseek-r1:1.5b'
model_name = 'llama3.1'

In [5]:
#Runtime -> Interrupt execution when the :: type loading shows up and then run the other cells

In [6]:
#!ollama run deepseek-r1:7b
!ollama run llama3.1

[?25lpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest 
pulling 667b0c1932bc...   0% ▕▏    0 B/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   0% ▕▏    0 B/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   0% ▕▏ 8.5 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   1% ▕▏  33 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   2% ▕▏  90 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   3% ▕▏ 131 MB/4.9 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 667b0c1932bc...   3% ▕▏ 151 MB/4.9 GB                  [?25h

In [7]:
subprocess.Popen("ollama serve", shell = True)

<Popen: returncode: None args: 'ollama serve'>

In [8]:
#Test if ollama is working properly
import ollama
response = ollama.chat(model=model_name, messages=[
    {
    'role' : 'user',
    'content' : 'How many planets are in this solar system? List their names'
    },
])
print(response['message']['content'])

There are 8 planets in our solar system. Here are their names:

1. Mercury
2. Mars
3. Venus
4. Earth
5. Neptune
6. Uranus
7. Saturn
8. Jupiter

Note: Pluto was previously considered a planet, but it's now classified as a dwarf planet by the International Astronomical Union (IAU).


In [9]:
%%capture
!pip install -U crawl4ai
!pip install nest_asyncio

In [10]:
%%capture
!crawl4ai-setup

In [11]:
!crawl4ai-doctor

[36m[INIT].... → Running Crawl4AI health check...[0m
[36m[INIT].... → Crawl4AI 0.4.247[0m
[36m[TEST].... ℹ Testing crawling capabilities...[0m
[36m[EXPORT].. ℹ Exporting PDF and taking screenshot took 1.46s[0m
[32m[FETCH]... ↓ https://crawl4ai.com... | Status: [32mTrue[0m | Time: 2.74s[0m
[36m[SCRAPE].. ◆ Processed https://crawl4ai.com... | Time: 47ms[0m
[32m[COMPLETE] ● https://crawl4ai.com... | Status: [32mTrue[0m | Total: [33m2.79s[0m[0m
[32m[COMPLETE] ● ✅ Crawling test passed![0m
[0m

In [12]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [13]:
import asyncio
import json
import os
#from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig, LXMLWebScrapingStrategy, DisplayMode, MemoryAdaptiveDispatcher, CrawlerMonitor
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
import pandas as pd

output_file = "/content/extracted_company_info.xlsx"

website_url =  [
    "",
    "",
    ""
]

INSTRUCTION = """Extract the company name and their contact numbers"""

def process_result(result, llm_strategy):
        data = json.loads(result.extracted_content)
        print("Extracted items:", data)
        df = pd.DataFrame(data)
        df = df.map(lambda x: x if not isinstance(x, str) else x.encode('utf-8').decode('utf-8', errors='replace'))
                    
        if os.path.exists(output_file):
            old_df = pd.read_excel(output_file)
            combined_df = pd.concat([old_df, df], ignore_index=True)
        else:
            combined_df = df
                    
        combined_df.to_excel(output_file, index=False)
        print(f"Data has been successfully saved to {output_file}")

        llm_strategy.show_usage()


# JS code to convert the webpage of any other language to English for the model to extract the data more efficiently 
js_code = """
function googleTranslateElementInit() {
    new google.translate.TranslateElement({
        pageLanguage: 'auto',
        includedLanguages: 'en',
        layout: google.translate.TranslateElement.InlineLayout.SIMPLE
    }, 'google_translate_element');
}
"""



class CompanyInfo(BaseModel):
    company_name: str = Field(description="The name of the company")
    phone_number: str = Field(description="The contact phone number of the company")
  #  address: str = Field(description="The full address of the company")
  #  hall: str = Field(description="The exhibition hall number")
 #   stand_number: str = Field(description="The exhibition stand number")


async def main():
    llm_strategy = LLMExtractionStrategy(
        provider=f"ollama/{model_name}",  
        #provider="gemini/gemini-1.5-pro",
        api_token=None,                  #Since Im using Ollama so there is no need for an API Key
        schema=CompanyInfo.model_json_schema(),
        extraction_type="schema",
        instruction=INSTRUCTION,
        chunk_token_threshold = 1000,
        overlap_rate = 0.0,
        apply_chunking=True,
        input_format="markdown",
        extra_args={"temperature":0.0,"max_tokens":800},
    )

    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        scraping_strategy=LXMLWebScrapingStrategy(),
        cache_mode=CacheMode.BYPASS,
        process_iframes=False,
        exclude_external_links=True,
        js_code = [js_code],
      #  wait_for="css:.main-loaded"
    )

    browser_config = BrowserConfig(
        headless=True,
        verbose=True,
        text_mode=True
        )
    

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=10,
        monitor=CrawlerMonitor(
            display_mode=DisplayMode.DETAILED
        )
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            results = await crawler.arun_many(url=website_url, config=crawl_config, dispatcher=dispatcher)

            for result in results:
                if result.success:
                    await process_result(result, llm_strategy)
                else:
                    print("Error:", result.error_message)

    except Exception as e:
        print(f"An error occurred: {str(e)}")




In [None]:
asyncio.run(main())