In [1]:
import pandas as pd
from IPython.display import HTML, display, clear_output, Javascript
import os
import openai
from dotenv import load_dotenv
import ipywidgets as widgets
from ipywidgets import interactive, fixed, Button, VBox, HBox
from ipywidgets import interact, widgets
from datetime import datetime
import time
import warnings
import spacy
# Ignore every warning category (Warning is the base class) from here on.
# NOTE(review): this also hides deprecation and pandas warnings raised by later cells.
warnings.simplefilter(action='ignore', category=Warning)
# from Upcoming_Modules import syllabus


In [2]:
# Shared spaCy pipeline used by the chunking helpers; its max_length limit is
# raised to 10,000,000 so long concatenated logs can be passed in.
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 10_000_000

In [3]:
# Populate the process environment via python-dotenv, then read the OpenAI key
# from it (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [4]:
import chardet

# Path of the exported chat log; used for both encoding detection and parsing.
MESSAGE_LOG_PATH = 'Message Log.csv'

# Detect the encoding of the CSV file from its raw bytes
with open(MESSAGE_LOG_PATH, 'rb') as file:
    result = chardet.detect(file.read())

encoding_type = result['encoding']

# Read the CSV file with detected encoding and tab separator
df = pd.read_csv(MESSAGE_LOG_PATH, encoding=encoding_type, delimiter="\t", engine='python')

# Strip timezone information: keep only the text before the first '+'
timestamp_text = df['Message Timestamp'].str.split('+').str[0]

# First pass: strict month/day/year parsing, exactly as before.
strict_ts = pd.to_datetime(timestamp_text, format='%m/%d/%Y', errors='coerce')

# Second pass, used only where the strict pass produced NaT: values that carry
# a time of day (or another layout) do not match the date-only format and were
# previously coerced to NaT, which silently removed them from every date
# filter. utc=True keeps the result a single datetime dtype even if some
# values still hold an offset; the timezone is then dropped so comparisons
# against naive timestamps keep working.
inferred_ts = pd.to_datetime(
    timestamp_text, format='mixed', errors='coerce', utc=True
).dt.tz_localize(None)

# Ensure that 'Message Timestamp' is in datetime format
df['Message Timestamp'] = strict_ts.fillna(inferred_ts)


In [5]:
# Distinct cohort labels in alphabetical order; non-string entries
# (for example missing values) are left out
unique_cohorts = sorted(
    name for name in df['Class Name'].unique() if isinstance(name, str)
)

In [6]:
# Keep only the rows that actually carry message text
df = df.loc[df['Message'].notna()]

In [7]:
# Normalised copy of the 'Message' column: surrounding whitespace removed,
# then lower-cased
df['Message_Clean'] = (
    df['Message']
    .str.strip()
    .str.lower()
)

In [8]:
# Run datetime conversion on the timestamp column once more; entries that
# cannot be parsed become NaT
df['Message Timestamp'] = pd.to_datetime(
    df['Message Timestamp'],
    errors='coerce',
    format='mixed',
)

In [9]:
def large_chunk_text(text, max_length=128000, prompt_length=0):
    """Split ``text`` into consecutive chunks on spaCy token boundaries.

    Sizes are measured in characters (the length of each token together
    with its trailing whitespace), not in model tokens.

    Args:
        text: The string to split.
        max_length: Overall size budget for one chunk.
        prompt_length: Part of the budget reserved for the prompt; it is
            subtracted from ``max_length`` together with one extra unit.

    Returns:
        A ``(chunks, total_size)`` tuple. ``chunks`` is the list of
        stripped, non-empty chunk strings; ``total_size`` is the summed
        size of every token processed across all chunks.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Only tokenization is needed, so the other pipeline stages are disabled
        doc = nlp(text, disable=["parser", "ner", "tagger"])

    max_size_per_chunk = max_length - prompt_length - 1
    chunks = []
    pieces = []          # token texts of the chunk being built
    current_size = 0     # size of the chunk being built
    total_size = 0       # size of everything seen so far

    def _flush():
        # Emit the pending chunk unless it is empty or whitespace only.
        chunk = "".join(pieces).strip()
        if chunk:
            chunks.append(chunk)
        pieces.clear()

    for token in doc:
        piece = token.text_with_ws
        piece_size = len(piece)

        # Start a new chunk when this token would overflow the budget. The
        # `pieces` guard keeps a single oversize token from first emitting
        # an empty chunk.
        if pieces and current_size + piece_size > max_size_per_chunk:
            _flush()
            current_size = 0

        pieces.append(piece)
        current_size += piece_size
        total_size += piece_size

    _flush()

    return chunks, total_size

In [10]:
def chunk_text(text, max_length=1280000, buffer=10000):
    """Split ``text`` into chunks of at most ``max_length - buffer`` spaCy tokens.

    Each chunk is rebuilt by joining its token texts with single spaces, so
    the original whitespace is not preserved.

    Args:
        text: The string to split.
        max_length: Upper limit on the total number of tokens accepted.
        buffer: Safety margin subtracted from ``max_length`` to obtain the
            per-chunk token limit.

    Returns:
        A ``(chunks, token_count)`` tuple: the list of chunk strings and the
        number of tokens they contain in total.

    Raises:
        ValueError: If the text holds more than ``max_length`` tokens, or if
            there is text to split but ``buffer`` leaves no room for even
            one token per chunk.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Only tokenization is needed, so the other pipeline stages are disabled
        doc = nlp(text, disable=["parser", "ner", "tagger"])
        tokens = [token.text for token in doc]

    token_count = len(tokens)

    # Conservative per-chunk size to account for tokenization differences
    max_chunk_size = max_length - buffer

    # A non-positive step would never advance through the tokens; fail fast
    # instead of looping forever.
    if token_count and max_chunk_size <= 0:
        raise ValueError("buffer must be smaller than max_length so that each chunk can hold at least one token.")

    # Every token ends up in some chunk, so the running total can exceed
    # max_length only when the overall count does; check it once up front.
    if token_count > max_length:
        raise ValueError("Total token count exceeds the maximum allowed limit. Consider reducing the input size.")

    chunks = [
        ' '.join(tokens[start:start + max_chunk_size])
        for start in range(0, token_count, max_chunk_size)
    ] if token_count else []

    return chunks, token_count

In [11]:
def size_check(text, max_length=1280000, buffer=10000):
    """Pick a chunking strategy from the spaCy token count of ``text``.

    Texts with more than 100000 tokens go to ``large_chunk_text``; everything
    else goes to ``chunk_text``.

    Args:
        text: The string to chunk.
        max_length: Forwarded to ``chunk_text``.
        buffer: Forwarded to ``chunk_text``.

    Returns:
        Whatever the selected chunking function returns (a tuple whose first
        item is the list of chunks).

    Note:
        ``large_chunk_text`` is called with its own defaults: it measures
        sizes in characters, so this function's token-based ``max_length``
        is deliberately not passed to it.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Only tokenization is needed, so the other pipeline stages are disabled
        doc = nlp(text, disable=["parser", "ner", "tagger"])
        token_count = len(doc)

    if token_count > 100000:
        return large_chunk_text(text)

    # Previously these arguments were accepted but never used.
    return chunk_text(text, max_length=max_length, buffer=buffer)

In [12]:
def get_summary(text):
    """Summarise ``text`` with the chat model, one request per chunk.

    The text is split by ``size_check``; every chunk is inserted into the
    currently selected prompt template and sent as a chat completion. A
    chunk whose request fails is reported with ``print`` and left out.

    Returns:
        The successful chunk summaries joined by single spaces (an empty
        string when none succeeded).
    """
    pieces, _ = size_check(text)
    collected = []

    for piece in pieces:
        try:
            choice = prompt_dropdown.value
            # "Custom" reads the free-text prompt widget; any other choice
            # is looked up in the prompt table.
            template = (
                custom_prompt_textarea.value
                if choice == "Custom"
                else prompt_options[choice]
            )
            user_message = template.format(text=piece)

            completion = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": user_message},
                ],
                max_tokens=4096,  # cap on generated output tokens, not on input
            )
            collected.append(completion['choices'][0]['message']['content'].strip())
        except Exception as err:
            print(f"Error: {str(err)}")
            collected.append(None)

    # Failed (None) and empty summaries are dropped before joining
    return " ".join(filter(None, collected))

In [13]:
def _show_status(message, color, margin_top=50):
    """Render one centred 20px status line in the cell output.

    ``message`` is inserted as-is, so callers must escape any text that
    originates from the dataset.
    """
    display(HTML(
        f'<div style="color: {color}; font-size: 20px; text-align: center; '
        f'margin-top: {margin_top}px;">{message}</div>'
    ))


def process_data(cohort_name, start_date, end_date):
    """Summarise the user messages of one cohort within a date range.

    Reads the module-level ``df``, keeps rows of ``cohort_name`` whose
    'Message Timestamp' lies in the requested range, whose 'User Role' is
    'user' and whose 'Conversation ID.1' is set, summarises their messages
    through ``get_summary`` and writes the summary plus per-user message
    counts to ``<cohort folder>/<cohort>_<start>_to_<end>.txt``.

    Args:
        cohort_name: Value matched against the 'Class Name' column.
        start_date: First day to include; anything ``pd.Timestamp`` accepts.
        end_date: Last day to include; anything ``pd.Timestamp`` accepts.
            A value without a time of day covers that whole day.

    Returns:
        None. Progress, errors and a link to the written file are shown in
        the cell output. The function also clears the previous output and
        sets ``openai.api_key`` from the OPENAI_API_KEY environment variable.
    """
    import html  # stdlib; escapes dataset-derived text before it is embedded in HTML

    # Clear the previous output
    clear_output(wait=False)

    # Set the API key before any request is made
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # Check if the cohort_name exists in the DataFrame
    cohort_mask = df['Class Name'] == cohort_name
    if not cohort_mask.any():
        _show_status("Error: The provided cohort name does not exist in the dataset.", "red")
        return

    # Convert the inputs to timestamps (an empty picker value becomes NaT)
    start_date_dt = pd.Timestamp(start_date)
    end_date_dt = pd.Timestamp(end_date)

    timestamps = df['Message Timestamp']
    before_end = timestamps <= end_date_dt
    # A date-only end bound sits at 00:00, which would drop every message
    # sent later on that day; treat it as covering the whole day instead.
    if pd.notna(end_date_dt) and end_date_dt == end_date_dt.normalize():
        before_end = timestamps < end_date_dt + pd.Timedelta(days=1)
    in_range = (timestamps >= start_date_dt) & before_end

    # Check if the date range exists in the DataFrame
    if not in_range.any():
        _show_status("Error: The provided date range does not exist in the dataset.", "red")
        return

    _show_status("Processing data...", "orange")

    # Narrow down to user-authored messages of the cohort inside the range
    selected = df[cohort_mask & in_range]
    user = selected[(selected['User Role'] == 'user') & (selected['Conversation ID.1'].notnull())]

    # Concatenate all messages into a single string
    all_messages = ' '.join(user['Message'].astype(str))
    if not all_messages:
        print("No messages to process.")
        return

    # Chunk the text; chunk_text raises ValueError for oversize input, which
    # would otherwise escape from the button callback.
    try:
        chunks, token_count = chunk_text(all_messages)
    except ValueError as exc:
        _show_status(f"Error: {html.escape(str(exc))}", "red")
        return
    if token_count > 100000:
        print('running get_summary')

    # Display the token count
    _show_status(f"Token Count: {token_count} tokens", "blue", margin_top=20)
    if len(chunks) > 1:
        _show_status(
            f"Chunking initiated due to content length. Total chunks: {len(chunks)}",
            "blue",
        )

    # Summarise every chunk; chunks whose summary came back empty are skipped
    summaries = [get_summary(chunk) for chunk in chunks]
    summary = "\n\n".join(part for part in summaries if part)
    if summary and token_count > 100000:
        # Very large inputs get a second pass that condenses the partial summaries
        summary = get_summary(summary)
        print('running get_summary')

    # get_summary reports failure as an empty string, never as None, so test
    # for emptiness; otherwise an empty file would be written and reported
    # as a success.
    if not summary or not summary.strip():
        print("Failed to generate a summary. Please check the get_summary function.")
        return

    # Calculate user message counts for the specific cohort and time frame
    user_message_counts = user['User ID'].value_counts()

    # Create a dynamic filename based on the cohort name and date range
    filename = f'{cohort_name}_{start_date_dt.strftime("%Y%m%d")}_to_{end_date_dt.strftime("%Y%m%d")}.txt'.replace(":", "").replace(" ", "_").replace("-", "_")

    # Make sure the per-cohort folder exists (no error if it already does)
    folder_name = cohort_name.replace(":", "").replace(" ", "_").replace("-", "_")
    os.makedirs(folder_name, exist_ok=True)

    # Save summary to a text file inside the cohort folder
    file_path = os.path.join(folder_name, filename)
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(summary)
        # Add a separator for clarity
        file.write("\n\n" + "-"*40 + "\n")

        # Write the header for user message counts
        file.write("User Message Counts:\n")

        # Format and write each user's message count
        for uid, count in user_message_counts.items():
            file.write(f"    User {uid}: {count} messages\n")

    display(HTML(f'''
        <div style="
            border: 2px solid #4CAF50;
            padding: 10px;
            margin: 5px 0;
            border-radius: 5px;
            background-color: #f9f9f9;
            text-align: center;
        ">
            <a href="{html.escape(file_path, quote=True)}" target="_blank" style="
                text-decoration: none;
                color: #4CAF50;
                font-weight: bold;
                font-size: 16px;
            ">
                Click here to view {html.escape(filename)}
            </a>
        </div>
        '''))

    # Extracting the selected prompt
    selected_prompt = prompt_dropdown.value

    # At the end, display success message in green
    display(HTML(f'''
    <div style="color: green; font-size: 20px; text-align: center; margin-top: 50px;">
        Data processed successfully!
    </div>
    <div style="color: black; font-size: 18px; text-align: center; margin-top: 20px;">
        Cohort Analyzed: {html.escape(str(cohort_name))}
    </div>
    <div style="color: black; font-size: 18px; text-align: center; margin-top: 10px;">
        Time Stamp Analyzed: From {start_date} to {end_date}
    </div>
    <div style="color: black; font-size: 18px; text-align: center; margin-top: 10px;">
        Words Processed: {len(all_messages.split())}
    </div>
    <div style="color: black; font-size: 18px; text-align: center; margin-top: 10px;">
        Prompt Used: {html.escape(str(selected_prompt))}
    </div>
    '''))



In [14]:
# Set up the widgets and interactive function
cohort_dropdown = widgets.Dropdown(
    options=unique_cohorts,
    description='Cohort:',
    disabled=False,
)
# Both pickers start empty. None is the same "no date" value that clear_all
# assigns; the former default, pd.to_datetime(''), evaluated to NaT, which is
# not a real calendar date for the picker to show.
start_date_widget = widgets.DatePicker(description='Start Date', value=None)
end_date_widget = widgets.DatePicker(description='End Date', value=None)

In [15]:
# Dropdown for choosing the cohort to analyse; this rebinds the
# `cohort_dropdown` name to a fresh widget built from unique_cohorts
cohort_dropdown = widgets.Dropdown(
    description='Cohort:',
    options=unique_cohorts,
    disabled=False,
)

In [16]:
# Default prompt template; the "{text}" placeholder is filled with the chunk
# being summarised
default_prompt = (
    "Please analyze the student comments and questions in the text: {text}. "
    "Provide a high level summary of the comments and questions, "
    "a list of topics students are struggling with and insights for the instructor."
)


In [17]:
# Prompt templates by display name; the names are what the dropdown offers
prompt_options = {"Default Prompt": default_prompt}

# Selector for the prompt template, preset to the default entry
prompt_dropdown = widgets.Dropdown(
    options=prompt_options.keys(),
    value="Default Prompt",
    description='Prompt:',
)

In [18]:
def clear_all(b):
    """Button callback: reset every input widget.

    The cohort dropdown and both date pickers are set to None. The prompt
    dropdown goes back to the first entry of ``prompt_options`` (first item
    of a list, first key of a dict) and is left untouched when
    ``prompt_options`` is empty or of another type.

    Args:
        b: The clicked button (unused).
    """
    for input_widget in (cohort_dropdown, start_date_widget, end_date_widget):
        input_widget.value = None

    if not prompt_options:
        return

    if isinstance(prompt_options, list):
        prompt_dropdown.value = prompt_options[0]
    elif isinstance(prompt_options, dict):
        prompt_dropdown.value = next(iter(prompt_options))

In [19]:
# Function to display all widgets to the user
def display_ui():
    """Create the action buttons, wire their callbacks and render the form.

    Layout: the cohort dropdown and the two date pickers stacked vertically,
    followed by one row holding the "Process Data" and "Clear Input" buttons.
    """
    def _run_processing(_button):
        # Read the widget values at click time, not at construction time
        process_data(cohort_dropdown.value, start_date_widget.value, end_date_widget.value)

    process_data_button = Button(description="Process Data")
    clear_button = Button(description="Clear Input")
    process_data_button.on_click(_run_processing)
    clear_button.on_click(clear_all)

    # Widen the cohort dropdown (adjust to taste)
    cohort_dropdown.layout.width = '500px'

    input_column = VBox([cohort_dropdown, start_date_widget, end_date_widget])
    button_row = HBox([process_data_button, clear_button])

    # The prompt row stays disabled:
    # HBox([prompt_dropdown, custom_prompt_textarea]),
    display(VBox([input_column, button_row]))


In [20]:
# Build the cohort selector from the unique_cohorts list
cohort_dropdown = widgets.Dropdown(
    description='Cohort:',
    options=unique_cohorts,
    disabled=False,
)
# interact(display_current_unit, cohort=cohort_dropdown);
# NOTE(review): no callback function is passed to interact below (the variant
# with display_current_unit is commented out) - confirm this call is still needed.
interact(cohort=cohort_dropdown);

In [21]:
# Display the UI: the cohort dropdown, both date pickers and the
# "Process Data" / "Clear Input" buttons assembled by display_ui()
display_ui()

VBox(children=(VBox(children=(Dropdown(description='Cohort:', layout=Layout(width='500px'), options=('ASU-VIRT…