In [9]:

import pandas as pd
import numpy as np
import time
import dash
from pyspark.sql.functions import desc
import dash_bootstrap_components as dbc
import plotly.graph_objects as go
from dash import dcc, html, Output, Input, State
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from dash import dcc, html, Input, Output, State

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("StarMeter") \
    .config("spark.jars", "jars/mysql-connector-java-8.4.0.jar") \
    .getOrCreate()

# Function to retrieve fan counts with error handling
def get_fan_counts():
    try:
        # Fetch data using Spark
        df = spark.read.format("jdbc") \
            .option("url", "jdbc:mysql://127.0.0.1:3306/starmeter") \
            .option("dbtable", "user_dynamic_preferences") \
            .option("user", "timlinkous") \
            .option("password", "zipcode1") \
            .option("driver", "com.mysql.cj.jdbc.Driver") \
            .load()

        # Perform the aggregation using Spark
        fan_counts_df = df.groupBy("current_favorite").count().withColumnRenamed("count", "fan_count")

        # Convert to Pandas DataFrame for Dash processing
        fan_counts_pd = fan_counts_df.toPandas()

        if fan_counts_pd.empty:
            print("Warning: The retrieved DataFrame is empty.")
        return fan_counts_pd

    except Exception as e:
        print(f"Error retrieving data: {e}")
        return pd.DataFrame(columns=['current_favorite', 'fan_count'])



def fetch_and_calculate_changes():
    # Fetch data from MySQL
    df = spark.read.format("jdbc") \
        .option("url", "jdbc:mysql://localhost:3306/starmeter") \
        .option("driver", "com.mysql.cj.jdbc.Driver") \
        .option("dbtable", "event_log") \
        .option("user", "timlinkous") \
        .option("password", "zipcode1") \
        .load()

    # Convert to Pandas DataFrame and select necessary columns
    pdf = df.select("event_date", "celebrity", "event_description", "current_fan_count") \
        .orderBy(F.desc("event_date")) \
        .limit(11).toPandas()
    
    # Sort by date in descending order
    pdf = pdf.sort_values('event_date', ascending=False).reset_index(drop=True)
    
    # Calculate fan_count_change
    pdf['fan_count_change'] = pdf['current_fan_count'].diff(-1).fillna(0).astype(int)
    
    # Calculate percent change
    pdf['percent_change'] = (pdf['fan_count_change'] / pdf['current_fan_count'].shift(-1)) * 100
    
    # Detect anomalies (changes >= 15%)
    pdf['is_anomaly'] = np.abs(pdf['percent_change']) >= 15
    
    # Remove the extra row we used for calculation
    pdf = pdf.iloc[:-1]

    # Create the event log HTML using Dash components with specified font sizes
    event_log_html = html.Div([
        html.H4("Event Log", style={'fontSize': '20px', 'marginBottom': '10px'}),
        html.Table([
            html.Thead(html.Tr([
                html.Th("Date", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("Celebrity", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("Event", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("Change", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("% Change", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'}),
                html.Th("Anomaly", style={'fontSize': '12px', 'fontWeight': 'bold', 'padding': '5px'})
            ])),
            html.Tbody([
                html.Tr([
                    html.Td(row['event_date'].strftime('%Y-%m-%d'), style={'fontSize': '12px', 'padding': '5px'}),
                    html.Td(row['celebrity'], style={'fontSize': '12px', 'padding': '5px'}),
                    html.Td(row['event_description'], style={'fontSize': '12px', 'padding': '5px'}),
                    html.Td(f"{row['fan_count_change']:+d}", style={'fontSize': '12px', 'padding': '5px', 'color': 'green' if row['fan_count_change'] > 0 else 'red' if row['fan_count_change'] < 0 else 'black'}),
                    html.Td(f"{row['percent_change']:.2f}%" if not pd.isna(row['percent_change']) else "N/A", style={'fontSize': '12px', 'padding': '5px'}),
                    html.Td("ALERT" if row['is_anomaly'] else "", style={'fontSize': '12px', 'padding': '5px', 'color': 'red', 'fontWeight': 'bold'})
                ]) for _, row in pdf.iterrows()
            ])
        ], style={'width': '100%', 'textAlign': 'left', 'borderCollapse': 'collapse'})
    ])
    
    return event_log_html


# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Define custom styles for KPI cards
card_styles = [
    {"backgroundColor": "#0075A4", "color": "white", "padding": "0px", "borderRadius": "10px", "width": "200px", "height": "60px", "lineHeight": "1"},
    {"backgroundColor": "#008FAD", "color": "white", "padding": "0px", "borderRadius": "10px", "width": "200px", "height": "60px", "lineHeight": "1"},
    {"backgroundColor": "#00A697", "color": "white", "padding": "0px", "borderRadius": "10px", "width": "200px", "height": "60px", "lineHeight": "1"},
    {"backgroundColor": "#1EB769", "color": "white", "padding": "0px", "borderRadius": "10px", "width": "200px", "height": "60px", "lineHeight": "1"}
]

# Initialize layout with KPI cards next to each other and graph with legend on top
app.layout = html.Div([
     dbc.Row([
        dbc.Col(html.Button('Start', id='start-stop-button', n_clicks=0, className="btn btn-primary"), width="auto"),
        dbc.Col(html.H1("StarMeter", style={"fontSize": "24px"}), width=True)
    ], align='center', className="mb-4"),
    
    dbc.Row([
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Total Fans (in thousands)", className="card-title", style={"fontSize": "12px", "margin": "0"}),
                html.H5(id='total-fans', className="card-text", style={"fontSize": "16px", "margin": "0"})
            ])
        ], style=card_styles[0], className="mb-2"), width=2),
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Most Popular Celebrity", className="card-title", style={"fontSize": "12px", "margin": "0"}),
                html.H5(id='most-popular-celebrity', className="card-text", style={"fontSize": "16px", "margin": "0"})
            ])
        ], style=card_styles[1], className="mb-2"), width=2),
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Average Fans", className="card-title", style={"fontSize": "12px", "margin": "0"}),
                html.H5(id='average-fans', className="card-text", style={"fontSize": "16px", "margin": "0"})
            ])
        ], style=card_styles[2], className="mb-2"), width=2),
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Total Events", className="card-title", style={"fontSize": "12px", "margin": "0"}),
                html.H5(id='total-events', className="card-text", style={"fontSize": "16px", "margin": "0"})
            ])
        ], style=card_styles[3], className="mb-2"), width=2),
    ], className="g-1"),
    
    dbc.Row([
        dbc.Col(dcc.Graph(id='live-update-graph'), width=8),
        dbc.Col(html.Div(id='event-log', children="Event log will be displayed here.", style={"height": "400px", "overflowY": "auto", "border": "1px solid #ddd", "padding": "10px"}), width=4)
    ]),
    
    dbc.Row([
        dbc.Col(dcc.Graph(id='bar-chart'), width=12)
    ]),
    
    dcc.Interval(
        id='interval-component',
        interval=1000,  # update interval milliseconds
        n_intervals=0,
        disabled=True  # interval initially disabled
    ),
])

# Initialize data storage
fan_counts_data = {celebrity: [] for celebrity in ['Sabrina Carpenter', 'Snoop Dogg', 'Tony Stark', 'LeBron James']}
time_data = []

@app.callback(
    [Output('live-update-graph', 'figure'),
     Output('bar-chart', 'figure'),
     Output('total-fans', 'children'),
     Output('most-popular-celebrity', 'children'),
     Output('average-fans', 'children'),
     Output('total-events', 'children'),
     Output('event-log', 'children')],  # Add this output for the event log
    Input('interval-component', 'n_intervals')
)
def update_graph_and_kpis(n):
    global time_data, fan_counts_data

    try:
        # Fetch latest fan counts and event log using Spark
        fan_counts_df = get_fan_counts()  # Fetch fan counts
        event_log_html = fetch_and_calculate_changes()  # Fetch event log data

        # Initialize KPI variables
        total_fans = 0
        most_popular_celeb = None
        max_fans = 0
        total_events = 0
        current_fan_counts = []  # To store the latest fan counts for the bar chart
        colors = ['#0075A4', '#008FAD', '#00A697', '#1EB769']

        if not fan_counts_df.empty:
            # Append current timestamp
            current_time = time.time()
            time_data.append(current_time - time_data[0] if time_data else 0)

            # Update fan counts data
            for idx, celebrity in enumerate(fan_counts_data.keys()):
                # Fetch the latest fan count for each celebrity
                fan_count = fan_counts_df[fan_counts_df['current_favorite'] == celebrity]['fan_count'].sum() if not fan_counts_df[fan_counts_df['current_favorite'] == celebrity].empty else 0
                fan_counts_data[celebrity].append(fan_count)
                current_fan_counts.append(fan_count)  # Add to current fan counts for bar chart
                total_events += len(fan_counts_data[celebrity])

                # Update total fans and most popular celebrity
                total_fans += fan_count
                if fan_count > max_fans:
                    max_fans = fan_count
                    most_popular_celeb = celebrity

            colors = [
                '#1f77b4',  # muted blue
                '#9467bd',  # muted purple
                '#d62728',  # brick red
                '#17becf'   # blue-teal
            ]
            
            # Create traces for each celebrity in the line chart
            line_fig = go.Figure()
            for idx, (celebrity, fan_counts) in enumerate(fan_counts_data.items()):
                line_fig.add_trace(go.Scatter(
                    x=time_data,
                    y=fan_counts,
                    mode='lines+markers',
                    name=celebrity,
                    marker=dict(size=5, color=colors[idx]),
                    line=dict(color=colors[idx]),
                    text=['Event happened here'] * len(fan_counts),
                    hoverinfo='text',
                    customdata=list(range(len(fan_counts))),  # Index of the markers
                ))

            # Ensure time_data has enough data points to set range
            if len(time_data) > 1:
                line_fig.update_layout(
                    xaxis=dict(range=[max(time_data) - 100, max(time_data)]),
                )

            line_fig.update_layout(
                xaxis=dict(
                    range=[max(0, max(time_data) - 100), max(time_data)],
                    fixedrange=True,  # Prevents zooming/panning from affecting range
                    showticklabels=False,  # Hides x-axis labels
                ),
                xaxis_title='Time (Days)',
                yaxis_title='Number of Fans (in thousands)',
                margin=dict(l=0, r=0, t=40, b=0),  # Adjust margins to fit the layout
                autosize=False,  # Disable autosize to fix width
                width=900,  # Set the desired width
                height=400,  # Set the desired height
                showlegend=True,
                legend=dict(
                    orientation='h',  # Horizontal orientation
                    yanchor='bottom',
                    y=1.15,
                    xanchor='right',
                    x=1
                ),
                modebar=dict(
                    remove=['zoom', 'pan', 'select', 'zoomIn', 'zoomOut', 'autoScale', 'resetScale2d', 'lasso2d', 'zoom2d', 'resetScale', 'toImage', 'plotly_logo']
                )
            )


            # Create bar chart for current fan counts
            bar_fig = go.Figure(data=[
                go.Bar(
                    x=list(fan_counts_data.keys()),
                    y=current_fan_counts,  # Use the latest counts
                    marker_color=colors,  # Match colors with the line graph
                    width=0.4  # Set the bar width; adjust as needed
                )
            ])
            # Set the y-axis range to the total current fans
            bar_fig.update_layout(
                title='Current Fans per Celebrity',
                xaxis_title='Celebrity',
                yaxis_title='Current Fans',
                yaxis=dict(range=[0, (total_fans/2)-1000]),  # Set range based on total fans
                margin=dict(l=40, r=40, t=40, b=80),  # Adjust margins to fit the layout
                autosize=True,
                width=400,  # Set the desired width
                height=400,  # Set the desired height
                showlegend=False,
                modebar=dict(
                    remove=['zoom', 'pan', 'select', 'zoomIn', 'zoomOut', 'autoScale', 'resetScale2d', 'lasso2d', 'zoom2d', 'resetScale', 'toImage', 'plotly_logo']
                )
            )
            bar_fig.update_xaxes(tickangle=-45)  # Rotate x-axis labels if needed

        else:
            line_fig = go.Figure().update_layout(
                title='No data available',
                xaxis_title='Time (seconds)',
                yaxis_title='Number of Fans'
            )
            bar_fig = go.Figure().update_layout(
                title='No data available',
                xaxis_title='Celebrity',
                yaxis_title='Current Fans'
            )

        average_fans = total_fans // len(fan_counts_data) if fan_counts_data else 0

        return line_fig, bar_fig, f'{total_fans}', most_popular_celeb or 'No data', f'{average_fans}', f'{total_events}', event_log_html

    except Exception as e:
        print(f"Error in updating graph and KPIs: {e}")
        return go.Figure().update_layout(title="Error updating graph"), go.Figure().update_layout(title="Error updating graph"), 'Error', 'Error', 'Error', 'Error', html.Div(["Error updating event log"])


# Callback control start stop functionality
@app.callback(
    [Output('interval-component', 'disabled'),
     Output('start-stop-button', 'children')],
    [Input('start-stop-button', 'n_clicks')],
    [State('interval-component', 'disabled')]
)

def toggle_interval(n_clicks, is_disabled):
    # Toggle state interval (start stop updates)
    if n_clicks % 2 == 0:
        return True, 'Start'  # disabled, show 'Start' button text
    else:
        return False, 'Stop'  # enabled, show 'Stop' button text

if __name__ == '__main__':
    app.run_server(debug=True)
 
 

In [8]:
spark.stop()

In [2]:
pip install numpy sentence-transformers pinecone-client

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install --upgrade pinecone-client


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [15]:
import os
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize Pinecone
pc = Pinecone(api_key="00fdf680-fdf4-458a-9d14-5271f9ab7002")

index_name = "event-similarity"

# Check if the index exists, if not create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine"
    )

# Connect to the index
index = pc.Index(index_name)

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

def truncate_content(content, max_bytes=40000):
    """Truncate content to fit within Pinecone's metadata size limit."""
    encoded_content = content.encode('utf-8')
    if len(encoded_content) <= max_bytes:
        return content
    return encoded_content[:max_bytes].decode('utf-8', 'ignore')

def process_and_index_articles(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r') as file:
                content = file.read()
                
                # Create embedding
                embedding = model.encode(content)
                
                # Truncate content to fit within metadata size limit
                truncated_content = truncate_content(content)
                
                # Index the document
                try:
                    index.upsert(vectors=[(filename, embedding.tolist(), {"content": truncated_content})])
                    print(f"Indexed: {filename}")
                except Exception as e:
                    print(f"Error indexing {filename}: {str(e)}")
                    # You might want to log this error or handle it in some way

# Usage
article_directory = "/Users/timl/Projects/txt files/Sabrina Carpenter"
process_and_index_articles(article_directory)

Indexed: collected_text_4.txt


In [16]:
import mysql.connector
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from collections import defaultdict

# Initialize Pinecone
pc = Pinecone(api_key="00fdf680-fdf4-458a-9d14-5271f9ab7002")
index = pc.Index("event-similarity")

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Retrieve event descriptions from MySQL and deduplicate
def get_unique_event_descriptions():
    db = mysql.connector.connect(
        host="localhost",
        user="timlinkous",
        password="zipcode1",
        database="starmeter"
    )
    cursor = db.cursor()
    cursor.execute("SELECT event_id, event_description FROM event_log")
    events = cursor.fetchall()
    
    # Deduplicate events based on description
    unique_events = defaultdict(list)
    for event_id, description in events:
        unique_events[description].append(event_id)
    
    return [(ids[0], desc) for desc, ids in unique_events.items()]

def find_similar_articles():
    unique_events = get_unique_event_descriptions()
    similar_articles = {}
    
    for event_id, description in unique_events:
        event_vector = model.encode(description).tolist()
        # Perform similarity search in Pinecone
        results = index.query(vector=event_vector, top_k=10, include_metadata=True)
        
        # Sort matches by score in descending order and remove duplicates
        seen_ids = set()
        sorted_unique_matches = []
        for match in sorted(results.matches, key=lambda x: x.score, reverse=True):
            if match.id not in seen_ids:
                sorted_unique_matches.append(match)
                seen_ids.add(match.id)
            if len(sorted_unique_matches) == 5:  # Limit to top 5 unique matches
                break
        
        similar_articles[event_id] = sorted_unique_matches
        print(f"Event {event_id}, Description: '{description[:50]}...' matches articles:")
        for match in sorted_unique_matches:
            print(f"  - ID: {match.id}, Description: {match.description}, Score: {match.score:.4f}")
        print()  # Add a blank line for readability
    
    return similar_articles

# Usage
event_article_matches = find_similar_articles()

# You can now use event_article_matches in your application
# It's a dictionary where keys are event_ids and values are lists of unique similar articles sorted by score

Event 8825, Description: 'Public argument/feud...' matches articles:
  - ID: collected_text_4.txt, Description: None, Score: 0.1075
  - ID: june_28.txt, Description: None, Score: 0.0573
  - ID: aug_10.txt, Description: None, Score: 0.0264

Event 8826, Description: 'Talk show appearance...' matches articles:
  - ID: collected_text_4.txt, Description: None, Score: 0.3735
  - ID: june_28.txt, Description: None, Score: 0.0219
  - ID: aug_10.txt, Description: None, Score: 0.0058

Event 8828, Description: 'Releases new shoe/product...' matches articles:
  - ID: june_28.txt, Description: None, Score: 0.1358
  - ID: collected_text_4.txt, Description: None, Score: 0.0886
  - ID: aug_10.txt, Description: None, Score: 0.0351

Event 8829, Description: 'Editorial Article...' matches articles:
  - ID: june_28.txt, Description: None, Score: 0.1310
  - ID: aug_10.txt, Description: None, Score: 0.0594
  - ID: collected_text_4.txt, Description: None, Score: -0.0365

Event 8831, Description: 'Cameo appea