Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 30 additions & 22 deletions home.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import streamlit as st
import plotly.express as px
import pymysql
import time
from pages.sidebar import load_sidebar # Import the sidebar function
import html
from pages.sidebar import load_sidebar


# ---- Optimized Connection Function ----
def create_connection():
Expand All @@ -21,40 +22,43 @@ def create_connection():

# ---- Streamlit Layout ----
st.set_page_config(page_title="Keyword Search", layout="wide")
st.title("Keyword Search in English Notes")
st.title("Keyword Search in Community Notes")
load_sidebar()

# ---- Sidebar for Language Selection ----
language = st.radio("Language", ["English", "German"],index=1, horizontal=True)
table_name = "df_X_Eng_preprocessed" if "English" in language else "df_X_German_preprocessed"

# ---- Database Connection ----
with st.spinner("Connecting to Database..."):
conn = create_connection()

# ---- Combined Data Retrieval Query ----
@st.cache_data
def fetch_dates_and_keyword_range(_conn, keyword):
def fetch_dates_and_keyword_range(_conn, keyword, table_name):
query = f"""
SELECT
MIN(date), MAX(date),
MIN(CASE WHEN summary LIKE %s THEN date ELSE NULL END),
MAX(CASE WHEN summary LIKE %s THEN date ELSE NULL END)
FROM df_X_Eng_preprocessed
FROM {table_name}
"""
with _conn.cursor() as cursor:
cursor.execute(query, (f"%{keyword}%", f"%{keyword}%"))
global_min_date, global_max_date, keyword_min_date, keyword_max_date = cursor.fetchone()

# Handle None values
keyword_min_date = keyword_min_date or global_min_date
keyword_max_date = keyword_max_date or global_max_date

return pd.to_datetime(global_min_date).date(), pd.to_datetime(global_max_date).date(), \
pd.to_datetime(keyword_min_date).date(), pd.to_datetime(keyword_max_date).date()

# ---- Sidebar for Keyword Search ----
keyword_searched = st.text_input(label='Type your keyword', value='birdwatch')
# ---- Keyword Search ----
keyword_searched = st.text_input(label='Type your keyword', value='twitter')
st.markdown("Note: Add spaces before/after the keyword for exact word matches.")

# ---- Fetch Data from Cache ----
global_min_date, global_max_date, keyword_min_date, keyword_max_date = fetch_dates_and_keyword_range(conn, keyword_searched)
global_min_date, global_max_date, keyword_min_date, keyword_max_date = fetch_dates_and_keyword_range(conn, keyword_searched, table_name)

# ---- Date Range Selection ----
st.subheader("Select Date Range")
Expand All @@ -68,10 +72,10 @@ def fetch_dates_and_keyword_range(_conn, keyword):

# ---- Optimized Data Fetching ----
@st.cache_data
def fetch_data_counts(_conn, keyword, start_date, end_date):
query = """
def fetch_data_counts(_conn, keyword, start_date, end_date, table_name):
query = f"""
SELECT date, COUNT(*)
FROM df_X_Eng_preprocessed
FROM {table_name}
WHERE summary LIKE %s AND date BETWEEN %s AND %s
GROUP BY date
"""
Expand All @@ -80,27 +84,27 @@ def fetch_data_counts(_conn, keyword, start_date, end_date):
return pd.DataFrame(cursor.fetchall(), columns=['Date', 'Number of Notes'])

# ---- Data Plotting ----
data_counts = fetch_data_counts(conn, keyword_searched, start_date, end_date)
data_counts = fetch_data_counts(conn, keyword_searched, start_date, end_date, table_name)
if not data_counts.empty:
fig = px.line(
data_counts,
x='Date',
y='Number of Notes',
title=f"Notes per Date for keyword: '{keyword_searched}'",
title=f"Notes per Date for keyword: '{keyword_searched}' ({language})",
markers=True,
height=450,
width=1000
)
st.plotly_chart(fig, use_container_width=True)
else:
st.warning("No data found for the selected keyword and date range.")
st.warning(f"No data found for the selected keyword and date range in {language}.")

# ---- Optimized Display Data Fetching ----
@st.cache_data
def fetch_display_data(_conn, keyword, start_date, end_date):
query = """
def fetch_display_data(_conn, keyword, start_date, end_date, table_name):
query = f"""
SELECT CAST(noteID AS CHAR), date, summary, tweetId
FROM df_X_Eng_preprocessed
FROM {table_name}
WHERE summary LIKE %s AND date BETWEEN %s AND %s
ORDER BY date
"""
Expand All @@ -110,20 +114,24 @@ def fetch_display_data(_conn, keyword, start_date, end_date):

headers = ['Note ID', 'Date', 'Note Content', 'Tweet ID']
display_df = pd.DataFrame(data, columns=headers)

# Decode HTML entities in the 'Note Content' column
display_df['Note Content'] = display_df['Note Content'].apply(html.unescape)

display_df['Tweet URL'] = 'https://twitter.com/notesense/status/' + display_df['Tweet ID'].astype(str)
return display_df

# ---- Show Filtered Data Table ----
display_df = fetch_display_data(conn, keyword_searched, start_date, end_date)
st.subheader(f"Notes containing '{keyword_searched}' between {start_date} and {end_date}")
display_df = fetch_display_data(conn, keyword_searched, start_date, end_date, table_name)
st.subheader(f"Notes containing '{keyword_searched}' between {start_date} and {end_date} ({language})")
st.subheader(f"Total Notes Found: {len(display_df)}")
st.dataframe(display_df[['Note ID', 'Date', 'Note Content', 'Tweet ID', 'Tweet URL']], height=400, use_container_width=True)

# ---- Download Button ----
csv_data = display_df.to_csv(index=False).encode('utf-8')
st.download_button(
label="Download as CSV",
label=f"Download as CSV ({language})",
data=csv_data,
file_name=f"filtered_notes_{keyword_searched}.csv",
file_name=f"filtered_notes_{keyword_searched}_{language}.csv",
mime="text/csv",
)
130 changes: 130 additions & 0 deletions pages/keywords-en.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import pandas as pd
import streamlit as st
import plotly.express as px
import pymysql
import time
from pages.sidebar import load_sidebar # Import the sidebar function

# ---- Optimized Connection Function ----
def create_connection():
    """Open a pymysql connection to the Community Notes database.

    Returns:
        A live ``pymysql`` connection, or ``None`` after surfacing a
        Streamlit error message when the connection attempt fails.
    """
    # SECURITY NOTE(review): credentials were hard-coded in source. They are
    # now read from the environment, with the original values kept as
    # fallbacks so existing deployments behave identically. Rotate the
    # exposed password and remove the fallbacks once the environment is set.
    import os  # local import: keeps this fix self-contained

    try:
        return pymysql.connect(
            host=os.environ.get(
                "DB_HOST",
                "communitynotes.c3ui44m26pgw.eu-west-1.rds.amazonaws.com",
            ),
            port=int(os.environ.get("DB_PORT", "3306")),
            user=os.environ.get("DB_USER", "communitynotes"),
            password=os.environ.get("DB_PASSWORD", "noted"),
            database=os.environ.get("DB_NAME", "communitynotes"),
        )
    except pymysql.MySQLError as e:
        # Report the failure in the UI instead of crashing the page.
        st.error(f"Error connecting to MySQL/MariaDB: {e}")
        return None

# ---- Streamlit Layout ----
# Page config must be the first Streamlit call on the page.
st.set_page_config(page_title="Keyword Search", layout="wide")
st.title("Keyword Search in English Notes")
load_sidebar()  # shared navigation sidebar (pages/sidebar.py)

# ---- Database Connection ----
# NOTE(review): create_connection() returns None on failure; the fetch
# helpers below would then raise on conn.cursor() — TODO confirm intended.
with st.spinner("Connecting to Database..."):
    conn = create_connection()

# ---- Combined Data Retrieval Query ----
@st.cache_data
def fetch_dates_and_keyword_range(_conn, keyword):
    """Return the global and keyword-specific date bounds of the notes table.

    Runs one aggregate query over ``df_X_Eng_preprocessed``: the overall
    MIN/MAX of ``date`` plus the MIN/MAX restricted to rows whose summary
    contains *keyword*. When the keyword matches nothing, the keyword bounds
    fall back to the global bounds.

    Returns:
        (global_min, global_max, keyword_min, keyword_max) as ``date``s.
    """
    sql = """
    SELECT
    MIN(date), MAX(date),
    MIN(CASE WHEN summary LIKE %s THEN date ELSE NULL END),
    MAX(CASE WHEN summary LIKE %s THEN date ELSE NULL END)
    FROM df_X_Eng_preprocessed
    """
    pattern = f"%{keyword}%"
    with _conn.cursor() as cursor:
        cursor.execute(sql, (pattern, pattern))
        row = cursor.fetchone()
    g_min, g_max, k_min, k_max = row

    # NULL aggregates mean no keyword hit: widen to the full date span.
    k_min = k_min or g_min
    k_max = k_max or g_max

    return (
        pd.to_datetime(g_min).date(),
        pd.to_datetime(g_max).date(),
        pd.to_datetime(k_min).date(),
        pd.to_datetime(k_max).date(),
    )

# ---- Sidebar for Keyword Search ----
# Free-text keyword; the DB query wraps it in %...% for a substring match.
keyword_searched = st.text_input(label='Type your keyword', value='birdwatch')
st.markdown("Note: Add spaces before/after the keyword for exact word matches.")

# ---- Fetch Data from Cache ----
# Cached by (keyword); the slider below defaults to the keyword's own span.
global_min_date, global_max_date, keyword_min_date, keyword_max_date = fetch_dates_and_keyword_range(conn, keyword_searched)

# ---- Date Range Selection ----
st.subheader("Select Date Range")
start_date, end_date = st.slider(
    "Date Range",
    min_value=global_min_date,
    max_value=global_max_date,
    value=(keyword_min_date, keyword_max_date),
    format="YYYY-MM-DD"
)

# ---- Optimized Data Fetching ----
@st.cache_data
def fetch_data_counts(_conn, keyword, start_date, end_date):
    """Count matching notes per day inside the selected date range.

    Returns:
        DataFrame with columns ``'Date'`` and ``'Number of Notes'``,
        one row per date that has at least one keyword match.
    """
    sql = """
    SELECT date, COUNT(*)
    FROM df_X_Eng_preprocessed
    WHERE summary LIKE %s AND date BETWEEN %s AND %s
    GROUP BY date
    """
    params = (f"%{keyword}%", start_date, end_date)
    with _conn.cursor() as cursor:
        cursor.execute(sql, params)
        rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=['Date', 'Number of Notes'])

# ---- Data Plotting ----
# Line chart of per-day note counts for the chosen keyword and range.
data_counts = fetch_data_counts(conn, keyword_searched, start_date, end_date)
if not data_counts.empty:
    fig = px.line(
        data_counts,
        x='Date',
        y='Number of Notes',
        title=f"Notes per Date for keyword: '{keyword_searched}'",
        markers=True,
        height=450,
        width=1000  # width is overridden by use_container_width below
    )
    st.plotly_chart(fig, use_container_width=True)
else:
    st.warning("No data found for the selected keyword and date range.")

# ---- Optimized Display Data Fetching ----
@st.cache_data
def fetch_display_data(_conn, keyword, start_date, end_date):
    """Fetch the matching notes for tabular display and CSV export.

    Returns:
        DataFrame with columns 'Note ID', 'Date', 'Note Content',
        'Tweet ID', and a derived 'Tweet URL'. Note content has HTML
        entities decoded, matching the behaviour of home.py.
    """
    import html  # local import: keeps this fix self-contained

    query = """
    SELECT CAST(noteID AS CHAR), date, summary, tweetId
    FROM df_X_Eng_preprocessed
    WHERE summary LIKE %s AND date BETWEEN %s AND %s
    ORDER BY date
    """
    with _conn.cursor() as cursor:
        cursor.execute(query, (f"%{keyword}%", start_date, end_date))
        data = cursor.fetchall()

    headers = ['Note ID', 'Date', 'Note Content', 'Tweet ID']
    display_df = pd.DataFrame(data, columns=headers)

    # Decode HTML entities (e.g. "&amp;" -> "&") so notes read as plain text;
    # home.py applies the same decoding — this page previously did not.
    display_df['Note Content'] = display_df['Note Content'].apply(html.unescape)

    display_df['Tweet URL'] = 'https://twitter.com/notesense/status/' + display_df['Tweet ID'].astype(str)
    return display_df

# ---- Show Filtered Data Table ----
# Full result set for the keyword/range, plus a CSV export of the same rows.
display_df = fetch_display_data(conn, keyword_searched, start_date, end_date)
st.subheader(f"Notes containing '{keyword_searched}' between {start_date} and {end_date}")
st.subheader(f"Total Notes Found: {len(display_df)}")
st.dataframe(display_df[['Note ID', 'Date', 'Note Content', 'Tweet ID', 'Tweet URL']], height=400, use_container_width=True)

# ---- Download Button ----
# CSV includes every column of display_df, not just the ones shown above.
csv_data = display_df.to_csv(index=False).encode('utf-8')
st.download_button(
    label="Download as CSV",
    data=csv_data,
    file_name=f"filtered_notes_{keyword_searched}.csv",
    mime="text/csv",
)

7 changes: 4 additions & 3 deletions pages/sidebar.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
# Define a function to load the sidebar
def load_sidebar():
st.sidebar.title("Notesense Project")
st.sidebar.page_link(page="home.py", label="Keyword Search English")
st.sidebar.page_link(page="pages/keywords-de.py", label="Keyword Search German")
st.sidebar.page_link(page="home.py", label="Keyword Search")
#st.sidebar.page_link(page="pages/keywords-de.py", label="Keyword Search German")
# st.sidebar.page_link(page="pages/keywords-en.py", label="Keyword Search English")
st.sidebar.page_link(page="pages/topics_en.py", label="Topics Network English")
st.sidebar.page_link(page="pages/topics_de.py", label="Topics Network German")
st.sidebar.page_link(page="pages/about-data.py", label="About Data")
st.sidebar.page_link(page="pages/about-us.py", label="About Us")

# ---- Footer Message ----
st.sidebar.markdown("---")
st.sidebar.markdown('By [Notesense](https://github.com/Notesense/CommunityNotes) team.', unsafe_allow_html=True)
st.sidebar.markdown('By [Notesense](https://github.com/Notesense) team.', unsafe_allow_html=True)

# ---- Add the logo at the bottom of the sidebar ----
logo_path = "NotesenseLogo.png" # Adjust the path if needed
Expand Down