## TASK FOR ZEAL.AI

#### Scraping Data from the Website

In [31]:
pip install git+https://github.com/rzagreb/eventbrite_scrapper.git

Collecting git+https://github.com/rzagreb/eventbrite_scrapper.git
  Cloning https://github.com/rzagreb/eventbrite_scrapper.git to /tmp/pip-req-build-oody2z8b
  Running command git clone --filter=blob:none --quiet https://github.com/rzagreb/eventbrite_scrapper.git /tmp/pip-req-build-oody2z8b
  Resolved https://github.com/rzagreb/eventbrite_scrapper.git to commit 6ba1a2ed476a863db6f614a5d0885209e99e0a7d
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [62]:
from eventbrite_scrapper import Eventbrite

# Create the scraper client, then pull Los Angeles listings for the chosen window.
client = Eventbrite()

search_kwargs = dict(
    region="ca--los-angeles",  # region slug, copied from the Eventbrite URL in a browser
    dt_start="2024-11-28",  # start of the date range to search
    dt_end="2024-12-25",  # end of the date range to search
    max_pages=4,  # number of result pages to check
)
events = client.search_events.get_results(**search_kwargs)

In [63]:
# Sanity-check the scrape by printing every field of the first returned event.
# Fail with a readable message (instead of a bare IndexError) when nothing came back.
assert events, "No events were scraped - widen the date range or check the region slug."
event = events[0]

print(event.id)  # (str) unique Eventbrite event identifier
print(event.name)  # (str) event name
print(event.url)  # (str) URL of the event page
print(event.is_online_event)  # (bool) True if this is an online event
print(event.short_description)  # (str)
print(event.published_datetime)  # (datetime) when the event was created, in UTC
print(event.start_datetime)  # (datetime) time the event starts, in UTC
print(event.end_datetime)  # (datetime) time the event ends, in UTC
print(event.timezone)  # (str)
print(event.hide_start_date)  # (bool) True if the start time is not displayed to the user
print(event.hide_end_date)  # (bool) True if the end time is not displayed to the user
print(event.parent_event_url)  # (str) URL of the parent event
print(event.series_id)  # (str) identifier of the series of events
print(event.primary_venue.id)  # (str) venue id
print(event.primary_venue.name)  # (str) venue name
print(event.primary_venue.url)  # (str) URL of the venue page
print(event.primary_venue.address.latitude)  # (float)
print(event.primary_venue.address.longitude)  # (float)
print(event.primary_venue.address.region)  # (str)
print(event.primary_venue.address.postal_code)  # (str)
print(event.primary_venue.address.address_1)  # (str)
print(event.tickets_url)  # (str) URL to buy tickets
print(event.checkout_flow)  # (str)
print(event.language)  # (str)
print(event.image.url)  # (str) URL of the event image
# Guard the [0] lookups in case a tag list is empty; print None rather than raising.
print(event.tags_categories[0].text if event.tags_categories else None)  # (str) Eventbrite category
print(event.tags_formats[0].text if event.tags_formats else None)  # (str) Eventbrite event format
print([tag.text for tag in event.tags_by_organizer])  # (list[str]) organizer's tags

1007905100287
Sandpipers Holiday Homes Tour 2024
https://www.eventbrite.com/e/sandpipers-holiday-homes-tour-2024-tickets-1007905100287
False
Sandpipers Annual Holiday Homes Tour showcases distinctive South Bay homes beautifully decorated for the holiday season.
2024-10-26 19:54:48+00:00
2024-12-06 18:00:00+00:00
2024-12-08 21:00:00+00:00
America/Los_Angeles
False
False
None
None
230657019
Neptunian Woman's Club
None
33.8837127
-118.4088112
CA
90266
920 Highland Avenue
https://www.eventbrite.com/checkout-external?eid=1007905100287
widget
en-us
https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F880889843%2F483824350861%2F1%2Foriginal.20241022-185050?w=400&auto=format%2Ccompress&q=75&sharp=10&s=27c6b08fa49195c2d2c5c010d52685aa
Seasonal & Holiday
Tour
['Holiday', 'Philanthropy', 'Southbay', 'Hometour', 'Hermosabeach', 'Manhattanbeach', 'holiday_events', 'redondobeach', 'holidayhomedecor', 'sandpipers1931']


### Using RAG to Ingest the Event Data and Retrieve Events per Query

In [64]:
!pip install sentence-transformers scikit-learn datasets ragas



In [65]:
import json
import os
from dataclasses import asdict, dataclass, field, replace
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

@dataclass(frozen=True)
class EventAddress:
    """Immutable location of a venue.

    Every field defaults to ``None``, so the annotations are ``Optional`` to
    match what an instance can actually hold.
    """
    latitude: Optional[float] = None  # geographic latitude
    longitude: Optional[float] = None  # geographic longitude
    region: Optional[str] = None  # region code, e.g. "CA"
    postal_code: Optional[str] = None
    address_1: Optional[str] = None  # first street-address line

@dataclass(frozen=True)
class EventVenue:
    """Immutable description of the venue hosting an event.

    The scalar fields default to ``None`` (hence ``Optional``); ``address``
    defaults to an empty ``EventAddress`` so nested attribute access such as
    ``venue.address.region`` works on a default-constructed venue.
    """
    id: Optional[str] = None
    name: Optional[str] = None
    url: Optional[str] = None
    address: EventAddress = field(default_factory=lambda: EventAddress())

@dataclass(frozen=True)
class EventImage:
    """Immutable wrapper around an event's image URL (``None`` when absent)."""
    url: Optional[str] = None

@dataclass(frozen=True)
class EventTag:
    """Immutable tag attached to an event; ``text`` is the label (``None`` when absent)."""
    text: Optional[str] = None

@dataclass(frozen=True)
class Event:
    """Immutable record describing one event.

    Fields that default to ``None`` are annotated ``Optional`` so the type
    hints match the values an instance can hold. Nested objects default to
    empty instances and the tag collections default to empty tuples, so
    attribute access and iteration are safe on a default-constructed event.
    """
    id: Optional[str] = None
    name: Optional[str] = None
    url: Optional[str] = None
    is_online_event: bool = False
    short_description: Optional[str] = None
    published_datetime: Optional[datetime] = None
    start_datetime: Optional[datetime] = None
    end_datetime: Optional[datetime] = None
    timezone: Optional[str] = None
    hide_start_date: bool = False
    hide_end_date: bool = False
    parent_event_url: Optional[str] = None
    series_id: Optional[str] = None
    primary_venue: EventVenue = field(default_factory=lambda: EventVenue())
    tickets_url: Optional[str] = None
    checkout_flow: Optional[str] = None
    language: Optional[str] = None
    image: EventImage = field(default_factory=lambda: EventImage())
    tags_categories: Tuple[EventTag, ...] = field(default_factory=tuple)
    tags_formats: Tuple[EventTag, ...] = field(default_factory=tuple)
    tags_by_organizer: Tuple[EventTag, ...] = field(default_factory=tuple)

    def __hash__(self) -> int:
        """Hash on ``id`` when it is set, otherwise on a few identifying fields.

        Only hashing is customised here; no ``__eq__`` is defined in this
        class, so equality remains whatever the dataclass decorator generates.
        """
        if self.id:
            return hash(self.id)
        # No usable id: fall back to a tuple of fields that together identify the event.
        return hash((
            self.name,
            self.is_online_event,
            self.start_datetime,
            self.primary_venue.name
        ))

class EventbriteRAGPipeline:
    """Semantic retrieval over a fixed list of events.

    Each event is flattened into a single text string and embedded once at
    construction time; queries are embedded with the same model and ranked
    against the stored event embeddings by cosine similarity.
    """

    def __init__(self, events: List[Event], embedding_model: str = 'all-MiniLM-L6-v2'):
        """
        Build the index.

        Args:
            events: Dataclass event instances to index (copied via ``dataclasses.replace``).
            embedding_model: Model name passed to ``SentenceTransformer``.
        """
        # Copy each event with its tag collections normalised to tuples.
        # ``or ()`` tolerates events whose tag fields are None instead of crashing in tuple().
        self.events = [
            replace(
                event,
                tags_categories=tuple(event.tags_categories or ()),
                tags_formats=tuple(event.tags_formats or ()),
                tags_by_organizer=tuple(event.tags_by_organizer or ()),
            )
            for event in events
        ]
        self.model = SentenceTransformer(embedding_model)
        self.event_embeddings = self._compute_embeddings()


    def _compute_embeddings(self) -> np.ndarray:
        """
        Compute embeddings for events by creating a comprehensive text representation.

        Returns:
            Whatever ``self.model.encode`` returns for the list of event texts
            (it is passed to ``cosine_similarity`` as a 2-D array in
            ``query_events``), or an empty array when there are no events.
        """
        def tag_text(tags) -> str:
            """Join the labels of ``tags``, skipping tags whose text is missing."""
            return ' '.join(tag.text for tag in tags if tag.text)

        def event_to_text(event: Event) -> str:
            """Convert event to a comprehensive text representation for embedding."""
            text_parts = [
                event.name or '',
                event.short_description or '',
                tag_text(event.tags_categories),
                tag_text(event.tags_formats),
                tag_text(event.tags_by_organizer),
                event.primary_venue.name or '',
                event.primary_venue.address.region or '',
                event.language or ''
            ]
            # Drop empty parts so missing fields do not leave runs of spaces.
            return ' '.join(filter(bool, text_parts))

        if not self.events:
            # Nothing to encode; skip the model call entirely.
            return np.empty((0, 0), dtype=np.float32)

        return self.model.encode([event_to_text(event) for event in self.events])

    def query_events(self, query: str, top_k: int = 5) -> List[Event]:
        """
        Query events based on semantic similarity.

        Args:
            query (str): Natural language query
            top_k (int): Number of top events to return

        Returns:
            List of top matching events, best match first. Empty when there are
            no indexed events or ``top_k`` is not positive.
        """
        # Guard first: with top_k == 0 the slice ``[-0:]`` below would select *every*
        # event, and an empty index cannot be compared against the query embedding.
        if top_k <= 0 or not self.events:
            return []

        query_embedding = self.model.encode(query).reshape(1, -1)
        similarities = cosine_similarity(query_embedding, self.event_embeddings)[0]
        # argsort is ascending: take the last top_k indices, then reverse to best-first.
        top_indices = similarities.argsort()[-top_k:][::-1]

        return [self.events[idx] for idx in top_indices]

class EventEvaluator:
    """Runs a set of queries through a retrieval pipeline and summarises the hits."""

    def __init__(self, pipeline, queries):
        """Keep the pipeline (an object exposing ``query_events``) and the queries to run."""
        self.queries = queries
        self.pipeline = pipeline

    def evaluate_query(self, query):
        """Return one summary dict per event the pipeline retrieves for ``query``."""
        return [
            {
                "Event Name": hit.name,
                "Online Event": hit.is_online_event,
                "Start Time": hit.start_datetime,
                "Venue Address": hit.primary_venue.address.address_1,
                "Venue Name": hit.primary_venue.name,
                "Tickets URL": hit.tickets_url,
                "Language": hit.language,
                "Description": hit.short_description,
                "Categories": [tag.text for tag in hit.tags_categories],
            }
            for hit in self.pipeline.query_events(query)
        ]

    def evaluate(self):
        """Map every stored query to the list produced by ``evaluate_query``."""
        return {query: self.evaluate_query(query) for query in self.queries}


### Pipeline for 10 Queries and Their Outcomes

In [66]:
from tabulate import tabulate
if __name__ == "__main__":
    # Index the scraped events, then run ten sample queries through the evaluator.
    sample_events = events
    rag_pipeline = EventbriteRAGPipeline(sample_events)
    test_queries = [
        "parties",
        "concerts",
        "hackathons",
        "conferences",
        "art exhibitions",
        "online webinars",
        "networking events",
        "coding workshops",
        "startup pitches",
        "tech meetups"
    ]

    evaluator = EventEvaluator(rag_pipeline, test_queries)
    evaluation_results = evaluator.evaluate()

    # The column titles are also the keys of each result dict, so a single list
    # drives both the header row and the per-row value lookup.
    columns = [
        "Event Name",
        "Online Event",
        "Start Time",
        "Venue Address",
        "Venue Name",
        "Description",
        "Tickets URL",
        "Language",
        "Categories",
    ]

    for query, matches in evaluation_results.items():
        print(f"Results for query '{query}':\n")
        if not matches:
            print("No results found.")
        else:
            rows = [[match[column] for column in columns] for match in matches]
            print(tabulate(rows, headers=columns, tablefmt="grid"))
        print("\n")

Results for query 'parties':

+------------------------------------------+----------------+---------------------------+-----------------------+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------+------------+-----------------------------+
| Event Name                               | Online Event   | Start Time                | Venue Address         | Venue Name                         | Description                                                                                                                                | Tickets URL                                                    | Language   | Categories                  |
| Friday Night Flirt                       | False          | 2024-11-30 04:00:00+00:00 | 438 South Main Street | Bar Franca                         | A Social for QTBIPO

### Saving the results

In [73]:
from tabulate import tabulate
from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn


def add_table_border(table):
    """Give ``table`` single-line black borders on all outer edges and inner gridlines.

    The borders are written directly into the table's XML as a ``w:tblBorders``
    element under ``w:tblPr``. Any ``w:tblBorders`` already present there is
    removed first, so calling this function more than once on the same table
    leaves exactly one borders element instead of stacking duplicates.

    Args:
        table: A python-docx table object (anything exposing ``_element``).
    """
    tbl = table._element  # Get the table's XML element
    tbl_pr = tbl.xpath("./w:tblPr")[0]  # Find the table properties element

    # Drop borders added by an earlier call so the element is never duplicated.
    for stale in tbl_pr.xpath("./w:tblBorders"):
        tbl_pr.remove(stale)

    tbl_borders = OxmlElement("w:tblBorders")

    # Define border styles (you can modify the color and size as needed)
    for border_name in ("top", "left", "bottom", "right", "insideH", "insideV"):
        border = OxmlElement(f"w:{border_name}")
        border.set(qn("w:val"), "single")
        border.set(qn("w:sz"), "6")  # Border size
        border.set(qn("w:space"), "0")
        border.set(qn("w:color"), "000000")  # Black color
        tbl_borders.append(border)

    tbl_pr.append(tbl_borders)


def _cell_text(value):
    """Render one result value as the plain string written into a table cell."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value)
    return str(value)


def save_results_to_word(evaluation_results, output_file):
    """Save evaluation results to a Word document in tabular format with visible borders.

    One level-2 heading is written per query, followed by a bordered table with
    one row per result, or the paragraph "No results found." when the query
    produced nothing.

    Every value is converted to text before it is assigned to a cell: ``None``
    becomes an empty string and list/tuple values (the "Categories" column) are
    joined with ", ". The original code assigned the raw list and raw ``None``
    values directly to ``cell.text``.

    Args:
        evaluation_results: Mapping of query string to a list of result dicts
            keyed by the column titles listed in ``headers`` below.
        output_file: Path the document is saved to.

    Raises:
        KeyError: If a result dict lacks one of the expected column keys.
    """
    document = Document()

    # Add a title to the document
    document.add_heading("Evaluation Results", level=1)

    # Column titles double as the keys of each result dict, so header and data
    # cells are filled from the same list and can never drift out of step.
    headers = [
        "Event Name", "Online Event", "Start Time", "Venue Address",
        "Venue Name", "Tickets URL", "Language", "Categories", "Description"
    ]

    for query, results in evaluation_results.items():
        document.add_heading(f"Results for query: {query}", level=2)

        if results:
            # Create a table sized from the header list rather than a hard-coded count
            table = document.add_table(rows=1, cols=len(headers))
            table.alignment = WD_TABLE_ALIGNMENT.CENTER

            # Add header row
            for cell, header in zip(table.rows[0].cells, headers):
                cell.text = header

            # Add data rows
            for result in results:
                row_cells = table.add_row().cells
                for cell, header in zip(row_cells, headers):
                    cell.text = _cell_text(result[header])

            # Add borders to the table
            add_table_border(table)

            # Format the table for better readability
            for row in table.rows:
                for cell in row.cells:
                    cell.width = Pt(50)  # Set column width (optional)
        else:
            document.add_paragraph("No results found.")

        # Add a blank line after each query's results
        document.add_paragraph()

    # Save the document
    document.save(output_file)
    print(f"Results saved to {output_file}")


if __name__ == "__main__":
    # The evaluation cell earlier in this notebook already built the pipeline and
    # produced `evaluation_results` for these same events and the same ten queries.
    # Reuse that mapping instead of reloading the embedding model and re-encoding
    # every event a second time just to write the report.
    save_results_to_word(evaluation_results, "Evaluation_Results_Zeal.ai.docx")


Results saved to Evaluation_Results_Zeal.ai.docx
