In [9]:

import pandas as pd
import numpy as np
import time
import dash
from pyspark.sql.functions import desc
import dash_bootstrap_components as dbc
import plotly.graph_objects as go
from dash import dcc, html, Output, Input, State
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from dash import dcc, html, Input, Output, State

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("StarMeter") \
    .config("spark.jars", "jars/mysql-connector-java-8.4.0.jar") \
    .getOrCreate()

# Function to retrieve fan counts with error handling
def get_fan_counts():
    try:
        # Fetch data using Spark
        df = spark.read.format("jdbc") \
            .option("url", "jdbc:mysql://127.0.0.1:3306/starmeter") \
            .option("dbtable", "user_dynamic_preferences") \
            .option("user", "timlinkous") \
            .option("password", "zipcode1") \
            .option("driver", "com.mysql.cj.jdbc.Driver") \
            .load()

        # Perform the aggregation using Spark
        fan_counts_df = df.groupBy("current_favorite").count().withColumnRenamed("count", "fan_count")

        # Convert to Pandas DataFrame for Dash processing
        fan_counts_pd = fan_counts_df.toPandas()

        if fan_counts_pd.empty:
            print("Warning: The retrieved DataFrame is empty.")
        return fan_counts_pd

    except Exception as e:
        print(f"Error retrieving data: {e}")
        return pd.DataFrame(columns=['current_favorite', 'fan_count'])



def fetch_and_calculate_changes():
    # Fetch data from MySQL
    df = spark.read.format("jdbc") \
        .option("url", "jdbc:mysql://localhost:3306/starmeter") \
        .option("driver", "com.mysql.cj.jdbc.Driver") \
        .option("dbtable", "event_log") \
        .option("user", "timlinkous") \
        .option("password", "zipcode1") \
        .load()

    # Convert to Pandas DataFrame and select necessary columns
    pdf = df.select("event_date", "celebrity", "event_description", "current_fan_count") \
        .orderBy(F.desc("event_date")) \
        .limit(11).toPandas()
    
    # Sort by date in descending order
    pdf = pdf.sort_values('event_date', ascending=False).reset_index(drop=True)
    
    # Calculate fan_count_change
    pdf['fan_count_change'] = pdf['current_fan_count'].diff(-1).fillna(0).astype(int)
    
    # Calculate percent change
    pdf['percent_change'] = (pdf['fan_count_change'] / pdf['current_fan_count'].shift(-1)) * 100
    
    # Detect anomalies (changes >= 15%)
    pdf['is_anomaly'] = np.abs(pdf['percent_change']) >= 15
    
    # Remove the extra row we used for calculation
    pdf = pdf.iloc[:-1]

    # Create the event log HTML using Dash components with specified font sizes
    event_log_html = html.Div([
        html.H4("Event Log", style={'fontSize': '20px', 'marginBottom': '10px'}),
        html.Table([
            html.Thead(html.Tr([
                html.Th("Date", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("Celebrity", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("Event", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("Change", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("% Change", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("Anomaly", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'})
            ])),
            html.Tbody([
                html.Tr([
                    html.Td(row['event_date'].strftime('%Y-%m-%d'), style={'fontSize': '12px', 'padding': '5px'}),
                    html.Td(row['celebrity'], style={'fontSize': '12px', 'padding': '5px'}),
                    html.Td(row['event_description'], style={'fontSize': '12px', 'padding': '5px'}),
                    html.Td(f"{row['fan_count_change']:+d}", style={'fontSize': '12px', 'padding': '5px', 'color': 'green' if row['fan_count_change'] > 0 else 'red' if row['fan_count_change'] < 0 else 'black'}),
                    html.Td(f"{row['percent_change']:.2f}%" if not pd.isna(row['percent_change']) else "N/A", style={'fontSize': '12px', 'padding': '5px'}),
                    html.Td("ALERT" if row['is_anomaly'] else "", style={'fontSize': '12px', 'padding': '5px', 'color': 'red', 'fontWeight': 'bold'})
                ]) for _, row in pdf.iterrows()
            ])
        ], style={'width': '100%', 'textAlign': 'left', 'borderCollapse': 'collapse'})
    ])
    
    return event_log_html


# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Define custom styles for KPI cards
card_styles = [
    {"backgroundColor": "#0075A4", "color": "white", "padding": "0px", "borderRadius": "10px", "width": "200px", "height": "60px", "lineHeight": "1"},
    {"backgroundColor": "#008FAD", "color": "white", "padding": "0px", "borderRadius": "10px", "width": "200px", "height": "60px", "lineHeight": "1"},
    {"backgroundColor": "#00A697", "color": "white", "padding": "0px", "borderRadius": "10px", "width": "200px", "height": "60px", "lineHeight": "1"},
    {"backgroundColor": "#1EB769", "color": "white", "padding": "0px", "borderRadius": "10px", "width": "200px", "height": "60px", "lineHeight": "1"}
]

# Initialize layout with KPI cards next to each other and graph with legend on top
app.layout = html.Div([
     dbc.Row([
        dbc.Col(html.Button('Start', id='start-stop-button', n_clicks=0, className="btn btn-primary"), width="auto"),
        dbc.Col(html.H1("StarMeter", style={"fontSize": "24px"}), width=True)
    ], align='center', className="mb-4"),
    
    dbc.Row([
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Total Fans (in thousands)", className="card-title", style={"fontSize": "12px", "margin": "0"}),
                html.H5(id='total-fans', className="card-text", style={"fontSize": "16px", "margin": "0"})
            ])
        ], style=card_styles[0], className="mb-2"), width=2),
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Most Popular Celebrity", className="card-title", style={"fontSize": "12px", "margin": "0"}),
                html.H5(id='most-popular-celebrity', className="card-text", style={"fontSize": "16px", "margin": "0"})
            ])
        ], style=card_styles[1], className="mb-2"), width=2),
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Average Fans", className="card-title", style={"fontSize": "12px", "margin": "0"}),
                html.H5(id='average-fans', className="card-text", style={"fontSize": "16px", "margin": "0"})
            ])
        ], style=card_styles[2], className="mb-2"), width=2),
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Total Events", className="card-title", style={"fontSize": "12px", "margin": "0"}),
                html.H5(id='total-events', className="card-text", style={"fontSize": "16px", "margin": "0"})
            ])
        ], style=card_styles[3], className="mb-2"), width=2),
    ], className="g-1"),
    
    dbc.Row([
        dbc.Col(dcc.Graph(id='live-update-graph'), width=8),
        dbc.Col(html.Div(id='event-log', children="Event log will be displayed here.", style={"height": "400px", "overflowY": "auto", "border": "1px solid #ddd", "padding": "10px"}), width=4)
    ]),
    
    dbc.Row([
        dbc.Col(dcc.Graph(id='bar-chart'), width=12)
    ]),
    
    dcc.Interval(
        id='interval-component',
        interval=1000,  # update interval milliseconds
        n_intervals=0,
        disabled=True  # interval initially disabled
    ),
])

# Initialize data storage
fan_counts_data = {celebrity: [] for celebrity in ['Sabrina Carpenter', 'Snoop Dogg', 'Tony Stark', 'LeBron James']}
time_data = []

@app.callback(
    [Output('live-update-graph', 'figure'),
     Output('bar-chart', 'figure'),
     Output('total-fans', 'children'),
     Output('most-popular-celebrity', 'children'),
     Output('average-fans', 'children'),
     Output('total-events', 'children'),
     Output('event-log', 'children')],  # Add this output for the event log
    Input('interval-component', 'n_intervals')
)
def update_graph_and_kpis(n):
    global time_data, fan_counts_data

    try:
        # Fetch latest fan counts and event log using Spark
        fan_counts_df = get_fan_counts()  # Fetch fan counts
        event_log_html = fetch_and_calculate_changes()  # Fetch event log data

        # Initialize KPI variables
        total_fans = 0
        most_popular_celeb = None
        max_fans = 0
        total_events = 0
        current_fan_counts = []  # To store the latest fan counts for the bar chart
        colors = ['#0075A4', '#008FAD', '#00A697', '#1EB769']

        if not fan_counts_df.empty:
            # Append current timestamp
            current_time = time.time()
            time_data.append(current_time - time_data[0] if time_data else 0)

            # Update fan counts data
            for idx, celebrity in enumerate(fan_counts_data.keys()):
                # Fetch the latest fan count for each celebrity
                fan_count = fan_counts_df[fan_counts_df['current_favorite'] == celebrity]['fan_count'].sum() if not fan_counts_df[fan_counts_df['current_favorite'] == celebrity].empty else 0
                fan_counts_data[celebrity].append(fan_count)
                current_fan_counts.append(fan_count)  # Add to current fan counts for bar chart
                total_events += len(fan_counts_data[celebrity])

                # Update total fans and most popular celebrity
                total_fans += fan_count
                if fan_count > max_fans:
                    max_fans = fan_count
                    most_popular_celeb = celebrity

            colors = [
                '#1f77b4',  # muted blue
                '#9467bd',  # muted purple
                '#d62728',  # brick red
                '#17becf'   # blue-teal
            ]
            
            # Create traces for each celebrity in the line chart
            line_fig = go.Figure()
            for idx, (celebrity, fan_counts) in enumerate(fan_counts_data.items()):
                line_fig.add_trace(go.Scatter(
                    x=time_data,
                    y=fan_counts,
                    mode='lines+markers',
                    name=celebrity,
                    marker=dict(size=5, color=colors[idx]),
                    line=dict(color=colors[idx]),
                    text=['Event happened here'] * len(fan_counts),
                    hoverinfo='text',
                    customdata=list(range(len(fan_counts))),  # Index of the markers
                ))

            # Ensure time_data has enough data points to set range
            if len(time_data) > 1:
                line_fig.update_layout(
                    xaxis=dict(range=[max(time_data) - 100, max(time_data)]),
                )

            line_fig.update_layout(
                xaxis=dict(
                    range=[max(0, max(time_data) - 100), max(time_data)],
                    fixedrange=True,  # Prevents zooming/panning from affecting range
                    showticklabels=False,  # Hides x-axis labels
                ),
                xaxis_title='Time (Days)',
                yaxis_title='Number of Fans (in thousands)',
                margin=dict(l=0, r=0, t=40, b=0),  # Adjust margins to fit the layout
                autosize=False,  # Disable autosize to fix width
                width=900,  # Set the desired width
                height=400,  # Set the desired height
                showlegend=True,
                legend=dict(
                    orientation='h',  # Horizontal orientation
                    yanchor='bottom',
                    y=1.15,
                    xanchor='right',
                    x=1
                ),
                modebar=dict(
                    remove=['zoom', 'pan', 'select', 'zoomIn', 'zoomOut', 'autoScale', 'resetScale2d', 'lasso2d', 'zoom2d', 'resetScale', 'toImage', 'plotly_logo']
                )
            )


            # Create bar chart for current fan counts
            bar_fig = go.Figure(data=[
                go.Bar(
                    x=list(fan_counts_data.keys()),
                    y=current_fan_counts,  # Use the latest counts
                    marker_color=colors,  # Match colors with the line graph
                    width=0.4  # Set the bar width; adjust as needed
                )
            ])
            # Set the y-axis range to the total current fans
            bar_fig.update_layout(
                title='Current Fans per Celebrity',
                xaxis_title='Celebrity',
                yaxis_title='Current Fans',
                yaxis=dict(range=[0, (total_fans/2)-1000]),  # Set range based on total fans
                margin=dict(l=40, r=40, t=40, b=80),  # Adjust margins to fit the layout
                autosize=True,
                width=400,  # Set the desired width
                height=400,  # Set the desired height
                showlegend=False,
                modebar=dict(
                    remove=['zoom', 'pan', 'select', 'zoomIn', 'zoomOut', 'autoScale', 'resetScale2d', 'lasso2d', 'zoom2d', 'resetScale', 'toImage', 'plotly_logo']
                )
            )
            bar_fig.update_xaxes(tickangle=-45)  # Rotate x-axis labels if needed

        else:
            line_fig = go.Figure().update_layout(
                title='No data available',
                xaxis_title='Time (seconds)',
                yaxis_title='Number of Fans'
            )
            bar_fig = go.Figure().update_layout(
                title='No data available',
                xaxis_title='Celebrity',
                yaxis_title='Current Fans'
            )

        average_fans = total_fans // len(fan_counts_data) if fan_counts_data else 0

        return line_fig, bar_fig, f'{total_fans}', most_popular_celeb or 'No data', f'{average_fans}', f'{total_events}', event_log_html

    except Exception as e:
        print(f"Error in updating graph and KPIs: {e}")
        return go.Figure().update_layout(title="Error updating graph"), go.Figure().update_layout(title="Error updating graph"), 'Error', 'Error', 'Error', 'Error', html.Div(["Error updating event log"])


# Callback control start stop functionality
@app.callback(
    [Output('interval-component', 'disabled'),
     Output('start-stop-button', 'children')],
    [Input('start-stop-button', 'n_clicks')],
    [State('interval-component', 'disabled')]
)

def toggle_interval(n_clicks, is_disabled):
    # Toggle state interval (start stop updates)
    if n_clicks % 2 == 0:
        return True, 'Start'  # disabled, show 'Start' button text
    else:
        return False, 'Stop'  # enabled, show 'Stop' button text

if __name__ == '__main__':
    app.run_server(debug=True)
 
 

In [8]:
spark.stop()

In [2]:
pip install numpy sentence-transformers pinecone-client

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.4.1-cp312-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Using cached huggingface_hub-0.24.6-py3-none-any.whl.metadata (13 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Using cached pinecone_plugin_inference-1.0.3-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Using cached pinecone_plugin_interface-0

In [3]:
import mysql.connector
from mysql.connector import Error
from sentence_transformers import SentenceTransformer
import numpy as np

def extract_event_log():
    try:
        connection = mysql.connector.connect(
            host='localhost',
            database='starmeter',
            user='timlinkous',
            password='zipcode1'
        )

        if connection.is_connected():
            cursor = connection.cursor(dictionary=True)
            query = "SELECT * FROM event_log"
            cursor.execute(query)
            event_log_data = cursor.fetchall()
            return event_log_data

    except Error as e:
        print(f"Error while connecting to MySQL: {e}")
    finally:
        if connection.is_connected():
            cursor.close()
            connection.close()

# Usage
event_data = extract_event_log()


def prepare_data_for_pinecone(event_data):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    prepared_data = []
    for event in event_data:
        # Combine relevant fields into a single text
        text = f"{event['celebrity']} {event['event_description']}"
        
        # Generate embedding
        embedding = model.encode(text)
        
        # Prepare the data structure for Pinecone
        prepared_data.append({
            'id': str(event['event_id']),
            'values': embedding.tolist(),
            'metadata': {
                'date': event['event_date'].isoformat(),
                'celebrity': event['celebrity'],
                'description': event['event_description'],
                'fan_count': event['current_fan_count']
            }
        })
    
    return prepared_data

# Usage
prepared_data = prepare_data_for_pinecone(event_data)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
import pinecone

def index_data_in_pinecone(prepared_data):
    # Initialize Pinecone
    pinecone.init(api_key="00fdf680-fdf4-458a-9d14-5271f9ab7002", environment="your_environment")
    
    # Create or connect to an existing index
    index_name = "celebrity-events"
    if index_name not in pinecone.list_indexes():
        pinecone.create_index(index_name, dimension=384)  # Dimension depends on your embedding model
    
    index = pinecone.Index(index_name)
    
    # Upsert data in batches
    batch_size = 100
    for i in range(0, len(prepared_data), batch_size):
        batch = prepared_data[i:i+batch_size]
        index.upsert(vectors=batch)

    print(f"Indexed {len(prepared_data)} events in Pinecone")

# Usage
index_data_in_pinecone(prepared_data)


def similarity_search(query, top_k=5):
    # Initialize Pinecone (if not already initialized)
    pinecone.init(api_key="00fdf680-fdf4-458a-9d14-5271f9ab7002", environment="your_environment")
    index = pinecone.Index("celebrity_events")
    
    # Generate embedding for the query
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query).tolist()
    
    # Perform the search
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
    
    return results

# Usage
query = "Celebrity X scandal during award show"
similar_events = similarity_search(query)

# Process and display results
for match in similar_events['matches']:
    print(f"Event ID: {match['id']}")
    print(f"Similarity Score: {match['score']}")
    print(f"Date: {match['metadata']['date']}")
    print(f"Celebrity: {match['metadata']['celebrity']}")
    print(f"Description: {match['metadata']['description']}")
    print(f"Fan Count: {match['metadata']['fan_count']}")
    print("---")

AttributeError: module 'pinecone' has no attribute 'init'