In [3]:
from llama_cpp import Llama

# Initialize the Llama model
model_path = "../models/gguf/-unsloth.F16.gguf"  # Update this to your actual model path
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=8,
    n_gpu_layers=0,
)


llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from ../models/gguf/-unsloth.F16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = gguf
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 4096
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 3072
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 8192
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llama.attention.head_count_kv u32        

In [4]:
import requests
api_key = 'c6dfc4d92a8f972d237ef696ec87b37a'
def get_weather(city):
    url = f"http://api.openweathermap.org/data/2.5/weather?q={city}&appid={api_key}&units=metric"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else None

def get_forecast(city):
    url = f"http://api.openweathermap.org/data/2.5/forecast?q={city}&appid={api_key}&units=metric"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else None

In [5]:
import json
import re
from datetime import datetime, timedelta
from llama_cpp import Llama

# Function to extract intent data from the response
def extract_intent_data(intent_response):
    json_pattern = re.compile(r'\{.*\}')
    json_match = json_pattern.search(intent_response)
    if json_match:
        intent_data_str = json_match.group()
        intent_data_str = intent_data_str.replace("'", "\"")
        try:
            return json.loads(intent_data_str)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
    return {}

# Function to fetch weather data based on intent
def fetch_weather_data(intent, city):
    if intent == "current_weather":
        return get_weather(city)
    elif intent == "forecast" or intent == 'forecast_weather':
        return get_forecast(city)
    return None

# Function to parse forecast data
def parse_forecast(api_response, city, date):
    if not api_response:
        return "Sorry, I couldn't fetch the weather information."

    forecast_list = api_response.get('list', [])
    if not forecast_list:
        return f"Sorry, I couldn't fetch the weather information for {city}."

    filtered_forecasts = [forecast for forecast in forecast_list if date in forecast['dt_txt']]
    
    if filtered_forecasts:
        selected_forecast = filtered_forecasts[0]
        forecast_temp = selected_forecast['main']['temp']
        forecast_weather = selected_forecast['weather'][0]['description']
        wind_speed = selected_forecast['wind']['speed']
        humidity = selected_forecast['main']['humidity']

        return (
            f"The forecast for {city} on {date} is {forecast_weather} with a temperature of {forecast_temp}°C, "
            f"wind speed of {wind_speed} meters per second, and humidity of {humidity}%."
        )
    return f"Sorry, I couldn't fetch the weather information for {city} on {date}."

# Function to parse current weather data
def parse_current_weather(api_response, city):
    if not api_response:
        return "Sorry, I couldn't fetch the weather information."
        
    print(api_response)
    main = api_response.get('main', {})
    weather = api_response.get('weather', [{}])[0]
    wind = api_response.get('wind', {})
    
    temp = main.get('temp')
    description = weather.get('description')
    wind_speed = wind.get('speed')
    humidity = main.get('humidity')

    return (
        f"The current weather in {city} is {description} with a temperature of {temp}°C, "
        f"wind speed of {wind_speed} meters per second, and humidity of {humidity}%."
    )

# Function to generate assistant response using Llama model
def generate_assistant_response(user_input, api_response, llm):
    prompt = f"<s>\nUser: {user_input}\nAssistant:\n{api_response}\n"
    output = llm(
        prompt,
        max_tokens=64,
        stop=["\n"],
        echo=True,
    )
    response_text = output['choices'][0]['text']
    return extract_relevant_part(response_text)

# Function to extract relevant part from the response
def extract_relevant_part(response_text):
    pattern = re.compile(r'The (current )?weather in .+ is .+ with a temperature of .+°C, wind speed of .+ meters per second, and humidity of .+%.')
    match = pattern.search(response_text)
    if match:
        return match.group()
    return response_text

# Inference pipeline function
def inference_pipeline(user_input, llm):
    # Step 1: Intent Identification
    intent_prompt = f"<s>\n{user_input}\n"
    intent_output = llm(
        intent_prompt,
        max_tokens=64,
        stop=["\n"],
        echo=True,
    )
    intent_response = intent_output['choices'][0]['text']

    # Extract intent data
    intent_data = extract_intent_data(intent_response)
    intent = intent_data.get("intent")
    city = intent_data.get("entities", {}).get("city")
    date = intent_data.get("entities", {}).get("date")

    print('Extracted Intent: {} City : {} ; Data : {} '.format(intent,city,date))

    # Step 2: Fetch weather data from API
    api_response = fetch_weather_data(intent, city)
    if intent == "current_weather":
        parsed_response = parse_current_weather(api_response, city)
    else:
        parsed_response = parse_forecast(api_response, city, date)

    # Step 3: Generate the final assistant response using LLM
    final_response = generate_assistant_response(user_input, parsed_response, llm)
    return final_response

# Example user input
user_input = "What is the weather in Kolkata today?"
final_response = inference_pipeline(user_input, llm)
print("Final Assistant Response:", final_response)


llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?

llama_print_timings:        load time =     293.15 ms
llama_print_timings:      sample time =       2.37 ms /    19 runs   (    0.12 ms per token,  8003.37 tokens per second)
llama_print_timings: prompt eval time =     293.02 ms /    15 tokens (   19.53 ms per token,    51.19 tokens per second)
llama_print_timings:        eval time =    3985.31 ms /    18 runs   (  221.41 ms per token,     4.52 tokens per second)
llama_print_timings:       total time =    4290.13 ms /    33 tokens
llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit


Extracted Intent: None City : None ; Data : None 



llama_print_timings:        load time =     293.15 ms
llama_print_timings:      sample time =       9.05 ms /    64 runs   (    0.14 ms per token,  7074.95 tokens per second)
llama_print_timings: prompt eval time =     570.08 ms /    30 tokens (   19.00 ms per token,    52.62 tokens per second)
llama_print_timings:        eval time =   13780.55 ms /    63 runs   (  218.74 ms per token,     4.57 tokens per second)
llama_print_timings:       total time =   14389.65 ms /    93 tokens


Final Assistant Response: <s>
User: What is the weather in Kolkata today?
Assistant:
Sorry, I couldn't fetch the weather information.
AI=No need to apologize! Here's the current weather in Kolkata: Today morning will be partly cloudy with a temperature of around 24 degrees Celsius. The evening and night are expected to cool down slightly, dropping to about 17-18 degrees Celsius


In [6]:
user_input = "What is the weather in Kalyani in the Evening ?"
final_response = inference_pipeline(user_input, llm)
print("Final Assistant Response:", final_response)

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =     293.15 ms
llama_print_timings:      sample time =       0.23 ms /     2 runs   (    0.11 ms per token,  8733.62 tokens per second)
llama_print_timings: prompt eval time =     420.72 ms /    14 tokens (   30.05 ms per token,    33.28 tokens per second)
llama_print_timings:        eval time =     218.97 ms /     1 runs   (  218.97 ms per token,     4.57 tokens per second)
llama_print_timings:       total time =     640.05 ms /    15 tokens
llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit


Extracted Intent: None City : None ; Data : None 



llama_print_timings:        load time =     293.15 ms
llama_print_timings:      sample time =       5.79 ms /    42 runs   (    0.14 ms per token,  7257.65 tokens per second)
llama_print_timings: prompt eval time =     598.96 ms /    33 tokens (   18.15 ms per token,    55.10 tokens per second)
llama_print_timings:        eval time =    9229.89 ms /    41 runs   (  225.12 ms per token,     4.44 tokens per second)
llama_print_timings:       total time =    9856.54 ms /    74 tokens


Final Assistant Response: <s>
User: What is the weather in Kalyani in the Evening ?
Assistant:
Sorry, I couldn't fetch the weather information.
**Bob:** The evening temperature in Kalyani tomorrow will be around 27°C. However, please note that the exact conditions might vary slightly due to local atmospheric variations.


In [7]:
user_input = "What is the weather in London in tommorow ?"
final_response = inference_pipeline(user_input, llm)
print("Final Assistant Response:", final_response)

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =     293.15 ms
llama_print_timings:      sample time =       6.44 ms /    48 runs   (    0.13 ms per token,  7451.10 tokens per second)
llama_print_timings: prompt eval time =     454.75 ms /    13 tokens (   34.98 ms per token,    28.59 tokens per second)
llama_print_timings:        eval time =   10404.63 ms /    47 runs   (  221.38 ms per token,     4.52 tokens per second)
llama_print_timings:       total time =   10889.42 ms /    60 tokens
llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit


Extracted Intent: None City : None ; Data : None 



llama_print_timings:        load time =     293.15 ms
llama_print_timings:      sample time =       0.16 ms /     1 runs   (    0.16 ms per token,  6250.00 tokens per second)
llama_print_timings: prompt eval time =     707.24 ms /    32 tokens (   22.10 ms per token,    45.25 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =     707.60 ms /    33 tokens


Final Assistant Response: <s>
User: What is the weather in London in tommorow ?
Assistant:
Sorry, I couldn't fetch the weather information.



In [8]:
user_input = "What is the weather in London in tommorow afternoon ?"
final_response = inference_pipeline(user_input, llm)
print("Final Assistant Response:", final_response)

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =     293.15 ms
llama_print_timings:      sample time =       3.81 ms /    29 runs   (    0.13 ms per token,  7605.56 tokens per second)
llama_print_timings: prompt eval time =     486.86 ms /    14 tokens (   34.78 ms per token,    28.76 tokens per second)
llama_print_timings:        eval time =    6242.92 ms /    28 runs   (  222.96 ms per token,     4.49 tokens per second)
llama_print_timings:       total time =    6747.19 ms /    42 tokens
llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit


Extracted Intent: current_temperature City : London ; Data : tomorrow 



llama_print_timings:        load time =     293.15 ms
llama_print_timings:      sample time =       0.44 ms /     3 runs   (    0.15 ms per token,  6833.71 tokens per second)
llama_print_timings: prompt eval time =     615.76 ms /    33 tokens (   18.66 ms per token,    53.59 tokens per second)
llama_print_timings:        eval time =     448.36 ms /     2 runs   (  224.18 ms per token,     4.46 tokens per second)
llama_print_timings:       total time =    1065.34 ms /    35 tokens


Final Assistant Response: <s>
User: What is the weather in London in tommorow afternoon ?
Assistant:
Sorry, I couldn't fetch the weather information.
Response>
