Setting up Ollama on Google Colab with ngrok
This Jupyter notebook provides a step-by-step guide for setting up Ollama on Google Colab and accessing it from a local machine using ngrok. The tutorial covers installing Ollama, creating a script to run the Ollama server, pulling a model, and tunneling the local server to a public URL using ngrok for easy access.

NOTE: The code written here is intended to be run in a Jupyter notebook, hence the '!' is used for terminal commands.

In [None]:
# CODEBLOCK 1 --->
# Download the official Ollama install script and pipe it to sh.
# This installs the Ollama program itself (not a model); models are pulled separately.
# curl flags: -f fail on HTTP errors, -sS silent but still show errors, -L follow redirects.
!curl -fsSL https://ollama.com/install.sh | sh

>>> Downloading ollama...
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
# Creating a script to run Ollama on Colab
ollama_script = '''
#!/bin/bash
ollama serve
'''

with open('ollama_script.sh', 'w') as file:
    file.write(ollama_script)

# Making the script executable and running it in the background
!chmod +x ollama_script.sh
!nohup ./ollama_script.sh &

nohup: appending output to 'nohup.out'


In [None]:
# CODEBLOCK 2 --->
# Pull the Ollama model of your choice (e.g., llama3)
!ollama pull llama3

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest 
pulling 6a0746a1ec1a...   0% ▕▏    0 B/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 6a0746a1ec1a...   0% ▕▏    0 B/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 6a0746a1ec1a...   0% ▕▏    0 B/4.7 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 6a0746a1ec1a...   0% ▕▏    0 B/4.7 GB          

In [None]:
# CODEBLOCK 3 --->
# Installing required packages to tunnel localhost to ngrok
!pip install ngrok pyngrok

Collecting ngrok
  Downloading ngrok-1.3.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok, ngrok
Successfully installed ngrok-1.3.0 pyngrok-7.1.6


In [None]:
# CODEBLOCK 4 --->
# Installing additional dependencies
!pip install aiohttp



In [None]:
import os
import asyncio
from pyngrok import ngrok
from IPython.display import clear_output

# Expose the CUDA binaries and the NVIDIA libraries through the process environment
# so that Ollama can use the GPU of the host (Google Colab).
# PATH is extended; LD_LIBRARY_PATH is replaced outright.
os.environ.update({
    'PATH': os.environ['PATH'] + ':/usr/local/cuda/bin',
    'LD_LIBRARY_PATH': '/usr/lib64-nvidia:/usr/local/cuda/lib64',
})

async def run_process(cmd):
    """Run a subprocess and stream its output to the notebook.

    Args:
        cmd: Sequence of strings; the first item is the program to execute and
            the remaining items are its arguments.

    Returns:
        None. Every line the child writes to stdout or stderr is printed
        (stripped of surrounding whitespace) as soon as it arrives.
    """
    print('>>> Starting', ' '.join(cmd))
    process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    async def pipe_output(stream):
        """Print each line read from ``stream`` until the stream is closed."""
        async for line in stream:
            # errors='replace' keeps a single undecodable byte from raising and
            # aborting the streaming of both pipes.
            print(line.strip().decode('utf-8', errors='replace'))

    try:
        await asyncio.gather(
            pipe_output(process.stdout),
            pipe_output(process.stderr),
        )
        # Both pipes are closed at this point; wait so the child is reaped.
        await process.wait()
    finally:
        # The child is still alive here only if streaming was cancelled or failed;
        # stop it so it is not left running after the cell is interrupted.
        if process.returncode is None:
            try:
                process.terminate()
            except ProcessLookupError:
                pass  # the child exited in the meantime

def setup_ngrok(auth_token):
    """Register the ngrok auth token, then wipe the cell output.

    Args:
        auth_token: Token passed unchanged to ``ngrok.set_auth_token``.
    """
    ngrok.set_auth_token(auth_token)
    # Remove whatever the cell has displayed so far.
    clear_output()

# Ngrok authentication
ngrok_auth_token = 'your-ngrok-authtoken'
setup_ngrok(ngrok_auth_token)

# Running the Ollama subprocess and exposing it from localhost:11434 to a public URL using ngrok
await asyncio.gather(
    run_process(['ollama', 'serve']),
    run_process(['ngrok', 'http', '--log', 'stderr', '11434', '--host-header=localhost:11434']),
)



# ScrapeGraphAI

In [None]:
%%capture
!pip install scrapegraphai --upgrade
!apt install chromium-chromedriver
!pip install nest_asyncio
!pip install playwright
!playwright install

## Usage of the smartscraper

In [None]:

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

# Configuration for the smart scraper: a local Ollama model reached over HTTP.
graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        "base_url": "http://localhost:11434"
    },
    "verbose": True,
    # Google Colab has no display server, so the browser must run headless.
    "headless": True
}

# Build the graph that scrapes the page and answers the prompt with the configured model.
smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the titles",
    source="https://www.wired.com/",
    config=graph_config
)

# NOTE(review): nest_asyncio is installed by this notebook but never applied; if run()
# complains about an already running event loop, nest_asyncio.apply() may be needed - confirm.
result = smart_scraper_graph.run()
print(result)

In [None]:
# Fetch the execution info recorded by the smart scraper graph and print it
# in the prettified form produced by prettify_exec_info.
graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

## Usage of the CSV scraper

In [None]:
import os
import pandas as pd
from scrapegraphai.graphs import CSVScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info

FILE_NAME = "inputs/username.csv"
# A notebook kernel does not define __file__, so resolve the input relative to the
# working directory there; when run as a script the script's own directory is used.
curr_dir = (
    os.path.dirname(os.path.realpath(__file__))
    if "__file__" in globals()
    else os.getcwd()
)
file_path = os.path.join(curr_dir, FILE_NAME)

# Load the CSV input into a DataFrame.
text = pd.read_csv(file_path)

# Configuration for the CSV scraper: a local Ollama model reached over HTTP.
graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        "base_url": "http://localhost:11434"
    },
    "verbose": True,
}


csv_scraper_graph = CSVScraperGraph(
    prompt="List me all the last names",
    # to_string() renders every row and column, whereas str(DataFrame) abbreviates
    # large frames and would silently hide rows from the model.
    source=text.to_string(),
    config=graph_config
)

result = csv_scraper_graph.run()
print(result)

# Print the execution info recorded by the graph in prettified form.
graph_exec_info = csv_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

In [None]:
# Save the CSV scraper result as CSV and as JSON; both helpers are given the name "result".
convert_to_csv(result, "result")
convert_to_json(result, "result")

## Usage of the XML scraper


In [None]:
import os
from dotenv import load_dotenv
from scrapegraphai.graphs import XMLScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load variables from a .env file, if one is present, into the environment.
load_dotenv()

FILE_NAME = "inputs/books.xml"
# A notebook kernel does not define __file__, so resolve the input relative to the
# working directory there; when run as a script the script's own directory is used.
curr_dir = (
    os.path.dirname(os.path.realpath(__file__))
    if "__file__" in globals()
    else os.getcwd()
)
file_path = os.path.join(curr_dir, FILE_NAME)

# Read the whole XML document as text.
with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()


# Configuration for the XML scraper: a local Ollama model reached over HTTP.
graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        "base_url": "http://localhost:11434"
    },
    "verbose": True,
}

xml_scraper_graph = XMLScraperGraph(
    prompt="List me all the authors, title and genres of the books",
    source=text,
    config=graph_config
)

result = xml_scraper_graph.run()
print(result)

# Print the execution info recorded by the graph in prettified form.
graph_exec_info = xml_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

In [None]:
# Save the XML scraper result as both CSV and JSON; both helpers are given the name "result".
convert_to_csv(result, "result")
convert_to_json(result, "result")

## Usage of the JSON scraper


In [None]:
"""
Basic example of scraping pipeline using JSONScraperGraph from JSON documents
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import JSONScraperGraph
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
# Load variables from a .env file, if one is present, into the environment.
load_dotenv()

FILE_NAME = "inputs/example.json"
# A notebook kernel does not define __file__, so resolve the input relative to the
# working directory there; when run as a script the script's own directory is used.
curr_dir = (
    os.path.dirname(os.path.realpath(__file__))
    if "__file__" in globals()
    else os.getcwd()
)
file_path = os.path.join(curr_dir, FILE_NAME)

# Read the whole JSON document as text.
with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()

# Configuration for the JSON scraper: a local Ollama model reached over HTTP.
graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        "base_url": "http://localhost:11434"
    },
    "verbose": True,
}


json_scraper_graph = JSONScraperGraph(
    prompt="List me all the authors, title and genres of the books",
    source=text,  # Pass the content of the file, not the file object
    config=graph_config
)

result = json_scraper_graph.run()
print(result)

# Print the execution info recorded by the graph in prettified form.
graph_exec_info = json_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))

In [None]:
# Save the JSON scraper result as CSV and as JSON; both helpers are given the name "result".
convert_to_csv(result, "result")
convert_to_json(result, "result")