In [1]:
import random
import requests
from datetime import datetime, timedelta
from concurrent.futures import ProcessPoolExecutor
import json

In [2]:
api_key = '337586e7326dcb828d7a386379093040'

In [3]:
# Function to get current weather
def get_weather(city):
    url = f"http://api.openweathermap.org/data/2.5/weather?q={city}&appid={api_key}&units=metric"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else None

# Function to get weather forecast
def get_forecast(city):
    url = f"http://api.openweathermap.org/data/2.5/forecast?q={city}&appid={api_key}&units=metric"
    response = requests.get(url)
    return response.json() if response.status_code == 200 else None

# Generate random future date within the next 5 days
def generate_random_future_date():
    days = random.randint(1, 5)
    future_date = datetime.now() + timedelta(days=days)
    return future_date.strftime('%Y-%m-%d')

# Function to create a user prompt and intent extraction
def create_user_prompt_and_intent(dummy):
    cities = [
        "Kolkata", "New York", "London", "Tokyo", "Sydney", "Paris", "Berlin", "Toronto",
        "Mumbai", "Shanghai", "Los Angeles", "Chicago", "Houston", "Phoenix", "Philadelphia",
        "San Antonio", "San Diego", "Dallas", "San Jose", "Austin", "Beijing", "Moscow",
        "Bangkok", "Cairo", "Istanbul", "Buenos Aires", "Rio de Janeiro", "Lagos", "Lima",
        "Jakarta", "Karachi", "Santiago", "Seoul", "Mexico City", "São Paulo", "Dubai",
        "Singapore", "Hong Kong", "Kuala Lumpur", "Tehran", "Baghdad", "Hanoi", "Riyadh",
        "Cape Town", "Nairobi", "Casablanca", "Accra", "Addis Ababa", "Ho Chi Minh City",
        "Manila", "Dhaka", "Abu Dhabi", "Ankara", "Brisbane", "Barcelona", "Madrid",
        "Rome", "Vienna", "Zurich", "Copenhagen", "Oslo", "Stockholm", "Helsinki", "Athens",
        "Budapest", "Warsaw", "Prague", "Brussels", "Amsterdam", "Dublin", "Edinburgh",
        "Glasgow", "Lisbon", "Munich", "Frankfurt", "Hamburg", "Stuttgart", "Lyon", "Marseille",
        "Nice", "Bordeaux", "Toulouse", "Venice", "Florence", "Naples", "Milan", "Turin",
        "Bologna", "Palermo", "Athens", "Thessaloniki", "Osaka", "Nagoya", "Kyoto", "Fukuoka",
        "Sapporo", "Sendai", "Yokohama", "Kobe", "Hiroshima"
    ]
    info_requests = ["current_weather", "forecast_weather"]
    dates = ["today", "tomorrow"] + [generate_random_future_date() for _ in range(8)]

    city = random.choice(cities)
    info_request = random.choice(info_requests)
    date = random.choice(dates) if info_request == "forecast_weather" else "today"
    user_prompt = f"What is the {info_request.replace('_', ' ')} in {city} {date}?"

    intent_extraction = {
        "intent": info_request,
        "entities": {
            "city": city,
            "date": date
        }
    }

    return {
        "user_input": user_prompt,
        "intent_extraction": intent_extraction
    }

# Generate the intent identification dataset
def generate_intent_dataset(num_samples=500):
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(create_user_prompt_and_intent, range(num_samples)))
    return results


In [4]:
# Generate the intent identification dataset
intent_dataset = generate_intent_dataset()

In [9]:
intent_dataset

In [10]:
with open('data/intent_dataset.json', 'w') as f:
    json.dump(intent_dataset, f, indent=4)

In [18]:
len(intent_dataset)

500

In [14]:
# Function to create a response dataset entry based on the intent
def create_response_dataset_entry(intent_entry):
    city = intent_entry["intent_extraction"]["entities"]["city"]
    info_request = intent_entry["intent_extraction"]["intent"]
    date = intent_entry["intent_extraction"]["entities"]["date"]
    user_prompt = intent_entry["user_input"]

    if info_request == "current_weather":
        api_response = get_weather(city)
        if api_response:
            weather = api_response['weather'][0]['description']
            temp = api_response['main']['temp']
            wind_speed = api_response['wind']['speed']
            humidity = api_response['main']['humidity']

            generated_response = (
                f"The weather in {city} is currently {weather} with a temperature of {temp}°C, "
                f"wind speed of {wind_speed} meters per second, and humidity of {humidity}%."
            )

            return {
                "user_input": user_prompt,
                "api_response": {
                    "location": f"{city}, {api_response['sys']['country']}",
                    "temperature": temp,
                    "description": weather,
                    "wind_speed": wind_speed,
                    "humidity": humidity
                },
                "assistant_response": generated_response
            }

    elif info_request == "forecast_weather":
        api_response = get_forecast(city)
        if api_response:
            forecast_list = api_response['list']
            filtered_forecasts = [forecast for forecast in forecast_list if date in forecast['dt_txt']]
            if filtered_forecasts:
                selected_forecast = filtered_forecasts[0]
                forecast_temp = selected_forecast['main']['temp']
                forecast_weather = selected_forecast['weather'][0]['description']
                wind_speed = selected_forecast['wind']['speed']
                humidity = selected_forecast['main']['humidity']

                generated_response = (
                    f"The forecast for {city} on {date} is {forecast_weather} with a temperature of {forecast_temp}°C, "
                    f"wind speed of {wind_speed} meters per second, and humidity of {humidity}%."
                )

                return {
                    "user_input": user_prompt,
                    "api_response": {
                        "date": date,
                        "temperature": forecast_temp,
                        "description": forecast_weather,
                        "wind_speed": wind_speed,
                        "humidity": humidity
                    },
                    "assistant_response": generated_response
                }

# Generate the response generation dataset
def generate_response_dataset(intent_dataset):
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(create_response_dataset_entry, intent_dataset))
    return [result for result in results if result is not None]

In [15]:
response_dataset = generate_response_dataset(intent_dataset)

In [16]:
with open('data/response_dataset.json', 'w') as f:
    json.dump(response_dataset, f, indent=4)

In [17]:
len(response_dataset)

416

In [17]:
response_dataset

[{'user_input': 'What is the forecast weather in Brisbane 2024-06-15?',
  'api_response': {'date': '2024-06-15',
   'temperature': 16.91,
   'description': 'clear sky',
   'wind_speed': 2.54,
   'humidity': 39},
  'assistant_response': 'The forecast for Brisbane on 2024-06-15 is clear sky with a temperature of 16.91°C, wind speed of 2.54 meters per second, and humidity of 39%.'},
 {'user_input': 'What is the current weather in Prague today?',
  'api_response': {'location': 'Prague, CZ',
   'temperature': 23.59,
   'description': 'clear sky',
   'wind_speed': 4.12,
   'humidity': 41},
  'assistant_response': 'The weather in Prague is currently clear sky with a temperature of 23.59°C, wind speed of 4.12 meters per second, and humidity of 41%.'},
 {'user_input': 'What is the current weather in Shanghai today?',
  'api_response': {'location': 'Shanghai, CN',
   'temperature': 21.76,
   'description': 'broken clouds',
   'wind_speed': 5,
   'humidity': 91},
  'assistant_response': 'The weat

In [27]:
combined_data = []
for intent, response in zip(intent_dataset, response_dataset):
    combined_entry = {
        "user_input": intent["user_input"],
        "intent_extraction": intent["intent_extraction"],
        "api_response": response["api_response"],
        "assistant_response": response["assistant_response"]
    }
    combined_data.append(combined_entry)


In [29]:
# combined_data

In [30]:
with open('data/weather_chatbot_dataset.json', 'w') as f:
    json.dump(combined_data, f, indent=4)

In [22]:
# weather_chatbot_data

In [23]:
!ssh -T git@hf.co

Hi VatsalPatel18, welcome to Hugging Face.


In [31]:
from huggingface_hub import login

login(token='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')
# login(token="YOUR_HUGGING_FACE_TOKEN")
# Load the dataset from the JSON file
from datasets import load_dataset

dataset = load_dataset('json', data_files='data/weather_chatbot_dataset.json')

# Push the dataset to Hugging Face Hub
dataset.push_to_hub("VatsalPatel18/Open-Weather-ChatBot")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/vatsal-patel/.cache/huggingface/token
Login successful


Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/36.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/VatsalPatel18/Open-Weather-ChatBot/commit/c537f6bf1494281e0b653b4bb4d2dd0623855611', commit_message='Upload dataset', commit_description='', oid='c537f6bf1494281e0b653b4bb4d2dd0623855611', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
# hugging_face_token