In [None]:
!pip install streamlit PyPDF2 scikit-learn pandas matplotlib seaborn

Collecting streamlit
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.44.1-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
%%writefile app.py
# Streamlit app: train a classifier on tagged exam questions and predict their cognitive level (K1-K6).
import streamlit as st
import pandas as pd
import numpy as np
import re
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import PyPDF2
from collections import Counter
import base64
from io import BytesIO


# Configure the browser tab (title and icon) and use the full-width page layout.
st.set_page_config(
    page_title="Cognitive Level Classifier",
    page_icon="🧠",
    layout="wide"
)

def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts; the app passes the
            objects returned by ``st.file_uploader``.

    Returns:
        str: The page texts in page order, separated by a newline. A page
        that yields no text contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Join with a newline so the last line of one page is not fused with the
    # first line of the next (which would hide a question number that starts
    # a page). `or ""` guards against a page for which no text is returned.
    return "\n".join((page.extract_text() or "") for page in pdf_reader.pages)


def extract_data(text):
    """Parse tagged questions out of raw text into a DataFrame.

    The primary format is ``N. question text (COx Ky - Level name)``. If no
    question matches it, a looser fallback is tried in which the course
    outcome is optional and the separators may be a comma, colon, dash or
    plain whitespace, e.g. ``N. question (Ky - Level)`` or
    ``N. question (COx, Ky: Level)``.

    Args:
        text: Text to scan (typically extracted from a PDF). ``None`` or an
            empty string yields an empty result.

    Returns:
        pandas.DataFrame: One row per question with the string columns
        ``question_num``, ``question_text``, ``course_outcome``,
        ``knowledge_level_code`` and ``knowledge_level``. The columns are
        present even when no question was found. ``course_outcome`` is
        ``'Unknown'`` when the fallback format carries no CO tag.
    """
    columns = [
        'question_num',
        'question_text',
        'course_outcome',
        'knowledge_level_code',
        'knowledge_level',
    ]
    if not text:
        return pd.DataFrame(columns=columns)

    def build_row(question_num, question_text, course_outcome, level_code, level_name):
        # Single place that defines the row layout for both patterns.
        return {
            'question_num': question_num,
            'question_text': question_text.strip(),
            # An optional group that did not participate comes back as ''.
            'course_outcome': course_outcome or 'Unknown',
            'knowledge_level_code': level_code,
            'knowledge_level': level_name.strip(),
        }

    pattern = r'(\d+)\.\s+(.*?)\s+\((CO\d+)\s+(K\d+)\s+-\s+(.*?)\)'
    data = [build_row(*match) for match in re.findall(pattern, text)]

    if not data:
        # Same shape as the original fallback pattern, except that the
        # optional CO tag is now captured instead of being thrown away.
        alt_pattern = r'(\d+)\.\s+(.*?)\s*\((CO\d+)?(?:\s*|\s*,\s*)(K\d+)(?:\s*-\s*|\s*:\s*|\s+)(.*?)(?:\)|$)'
        data = [build_row(*match) for match in re.findall(alt_pattern, text)]

    return pd.DataFrame(data, columns=columns)


# Display name of each cognitive level, keyed by the K-code used in question tags.
knowledge_level_descriptions = dict(
    K1='Knowledge',
    K2='Comprehension',
    K3='Application',
    K4='Analysis',
    K5='Evaluation',
    K6='Creation',
)

# Ordinal rank of each level (K1 -> 1 ... K6 -> 6), derived from the order above.
knowledge_level_mapping = {
    code: rank
    for rank, code in enumerate(knowledge_level_descriptions, start=1)
}

def preprocess_text(text):
    """Normalise a question for vectorisation.

    Lower-cases the text, turns every character that is neither a word
    character nor whitespace into a space, then collapses whitespace runs
    to a single space and trims both ends.
    """
    cleaned = text.lower()
    # Applied in order: punctuation -> space, then whitespace runs -> one space.
    for regex in (r'[^\w\s]', r'\s+'):
        cleaned = re.sub(regex, ' ', cleaned)
    return cleaned.strip()

def train_model(train_pdf):
    """Train a knowledge-level classifier from a tagged training PDF.

    Extracts the tagged questions from the PDF, vectorises the preprocessed
    question text with TF-IDF (unigrams and bigrams) and fits a
    class-balanced random forest on the K-codes.

    Args:
        train_pdf: PDF to read, passed straight to ``extract_text_from_pdf``.

    Returns:
        tuple: ``(model, vectorizer, train_df, accuracy, report)``.
        ``accuracy`` and ``report`` are computed on the SAME questions the
        model was fitted on, so they describe training fit, not
        generalisation. All five items are ``None`` when no question could
        be extracted or when no usable vocabulary could be built; in both
        cases an error is shown through Streamlit.
    """
    train_text = extract_text_from_pdf(train_pdf)
    train_df = extract_data(train_text)

    if train_df.empty:
        st.error("No matching data found in the training PDF. Check the pattern or PDF content.")
        return None, None, None, None, None

    st.success(f"Extracted {len(train_df)} questions from training data")

    # Numeric rank of each level; codes missing from the mapping become 0.
    train_df['knowledge_level_num'] = train_df['knowledge_level_code'].apply(
        lambda code: knowledge_level_mapping.get(code, 0)
    )
    train_df['processed_text'] = train_df['question_text'].apply(preprocess_text)

    X = train_df['processed_text']
    y = train_df['knowledge_level_code']

    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=300,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95
    )
    try:
        X_features = vectorizer.fit_transform(X)
    except ValueError as exc:
        # Raised when the corpus is too small for the min_df/max_df pruning
        # or when nothing but stop words remains. Report it like the other
        # failure path instead of crashing the app.
        st.error(
            f"Could not build text features from the training questions: {exc} "
            "Try a training PDF with more questions."
        )
        return None, None, None, None, None

    model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight='balanced'
    )
    model.fit(X_features, y)

    # Evaluated on the training data itself (there is no held-out split).
    y_pred = model.predict(X_features)
    accuracy = accuracy_score(y, y_pred)
    report = classification_report(y, y_pred, output_dict=True)

    return model, vectorizer, train_df, accuracy, report

def predict_cognitive_level(model, vectorizer, question_text):
    """Predict the cognitive level of a single question.

    Args:
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        vectorizer: Fitted vectoriser exposing ``transform``.
        question_text: Raw question text.

    Returns:
        dict | None: ``None`` when either ``model`` or ``vectorizer`` is
        ``None``. Otherwise a dict with the keys ``question`` (the input
        text), ``predicted_code`` (the predicted label), ``predicted_level``
        (its display name, ``"Unknown"`` if the label has none) and
        ``confidence`` (the predicted probability of that label).
    """
    # Compare against None explicitly: the truthiness of an estimator is not
    # "has been set" (ensemble estimators define __len__), so `not model`
    # is the wrong test here.
    if model is None or vectorizer is None:
        return None

    processed_text = preprocess_text(question_text)
    question_features = vectorizer.transform([processed_text])

    prediction = model.predict(question_features)[0]
    confidence = model.predict_proba(question_features)[0]

    # predict_proba columns follow model.classes_, so look the label up there.
    predicted_class_index = list(model.classes_).index(prediction)
    confidence_score = confidence[predicted_class_index]

    predicted_level = knowledge_level_descriptions.get(prediction, "Unknown")

    return {
        'question': question_text,
        'predicted_code': prediction,
        'predicted_level': predicted_level,
        'confidence': confidence_score
    }

def predict_from_pdf(model, vectorizer, pdf_file):
    """Extract numbered questions from a PDF and predict their cognitive levels.

    A question starts on a line beginning with ``N. `` and runs (possibly
    over several lines) until a parenthesised tag such as ``(CO1 K2 ...``,
    the next numbered line, or the end of the text. If no such question is
    found, every non-empty line that starts with ``N.`` is used instead.
    Candidates of 10 characters or fewer are skipped as fragments.

    Args:
        model: Fitted classifier, forwarded to ``predict_cognitive_level``.
        vectorizer: Fitted vectoriser, forwarded likewise.
        pdf_file: PDF to read, passed to ``extract_text_from_pdf``.

    Returns:
        pandas.DataFrame: One row per classified question with the keys
        produced by ``predict_cognitive_level``; an empty DataFrame (after a
        Streamlit error message) when the model is missing or no question
        was found.
    """
    if model is None or vectorizer is None:
        st.error("Model not trained")
        return pd.DataFrame()

    pdf_text = extract_text_from_pdf(pdf_file)

    # MULTILINE anchors question numbers to the start of a line, so decimals
    # such as "3.14" inside a question no longer cut it short. DOTALL lets a
    # question wrap over several lines. \Z is the end of the whole text.
    pattern = r'^[ \t]*(\d+)\.\s+(.*?)(?=\(\w+\s+\w+|^[ \t]*\d+\.\s|\Z)'
    matches = re.findall(pattern, pdf_text, flags=re.MULTILINE | re.DOTALL)

    if not matches:
        st.warning("No questions found using structured pattern. Using simple line extraction.")
        # Fall back to simple line-by-line extraction
        lines = [line.strip() for line in pdf_text.split('\n') if line.strip()]
        questions = [line for line in lines if re.match(r'^\d+\.', line)]
    else:
        # Collapse the line breaks of wrapped questions into single spaces.
        questions = [' '.join(match[1].split()) for match in matches]

    predictions = [
        predict_cognitive_level(model, vectorizer, question)
        for question in questions
        if len(question) > 10  # Minimum length to avoid fragments
    ]

    # Covers both "nothing extracted" and "everything was a short fragment",
    # so a run with zero predictions is never reported as a success.
    if not predictions:
        st.error("No questions found in the PDF.")
        return pd.DataFrame()

    # Create a DataFrame for better visualization
    pred_df = pd.DataFrame(predictions)
    st.success(f"Predicted cognitive levels for {len(pred_df)} questions")

    return pred_df

def plot_knowledge_level_distribution(df):
    """Build a count plot of ``knowledge_level_code`` in ``df`` and return its figure."""
    figure, axis = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x='knowledge_level_code', ax=axis)
    # Title and axis labels are set in a single call.
    axis.set(
        title='Distribution of Knowledge Levels in the Training Data',
        xlabel='Knowledge Level',
        ylabel='Count',
    )
    return figure

def plot_feature_importance(model, vectorizer):
    """Build a horizontal bar chart of the 15 highest feature importances.

    Feature names come from ``vectorizer.get_feature_names_out()`` and are
    paired with ``model.feature_importances_``. Returns the matplotlib figure.
    """
    all_importances = pd.DataFrame({
        'feature': vectorizer.get_feature_names_out(),
        'importance': model.feature_importances_
    })
    ranked = all_importances.sort_values('importance', ascending=False)
    top_features = ranked.head(15)

    figure, axis = plt.subplots(figsize=(12, 6))
    sns.barplot(data=top_features, x='importance', y='feature', ax=axis)
    axis.set_title('Top 15 Important Words for Predicting Knowledge Level')
    plt.tight_layout()
    return figure

def get_table_download_link(df, filename, text):
    """Build an HTML ``<a>`` tag that downloads ``df`` as a CSV file.

    The CSV (without the index) is embedded in the link as a base64
    ``data:`` URI, so no file is written to disk.

    Args:
        df: DataFrame to export.
        filename: File name suggested to the browser (``download`` attribute).
        text: Visible link text.

    Returns:
        str: HTML markup, intended for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    import html  # local import: only this function needs it

    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode('utf-8')).decode('ascii')
    # Escape caller-supplied strings so quotes or angle brackets cannot break
    # out of the attribute or inject markup.
    safe_filename = html.escape(str(filename), quote=True)
    safe_text = html.escape(str(text))
    # text/csv is the registered MIME type; "file/csv" is not a valid one.
    href = (
        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
        f'download="{safe_filename}">📥 {safe_text}</a>'
    )
    return href

def main():
    """Render the Streamlit UI: training, single/PDF prediction and model analysis.

    The trained model, vectoriser, training frame, accuracy and report are
    kept in ``st.session_state`` so the other tabs can use them. Every
    matplotlib figure is closed after it has been handed to ``st.pyplot``
    so figures do not pile up in memory as the script is re-run.
    """
    st.title("🧠 Cognitive Level Classifier")
    st.markdown("""
    This app classifies questions based on Bloom's Taxonomy cognitive levels:
    - **K1**: Knowledge - Recalling facts, terms, basic concepts
    - **K2**: Comprehension - Understanding the meaning of information
    - **K3**: Application - Using knowledge in new situations
    - **K4**: Analysis - Breaking down information into parts
    - **K5**: Evaluation - Making judgments based on criteria
    - **K6**: Creation - Creating new ideas or ways of viewing things
    """)

    # Initialize session state variables if they don't exist
    for state_key in ('model', 'vectorizer', 'accuracy', 'report', 'train_df'):
        if state_key not in st.session_state:
            st.session_state[state_key] = None

    # Create tabs
    tab1, tab2, tab3, tab4 = st.tabs(["Train Model", "Predict Single Question", "Predict From PDF", "Model Analysis"])

    with tab1:
        st.header("Train your model")
        st.markdown("""
        Upload a training PDF containing questions tagged with knowledge levels (K1-K6).
        The PDF should follow formats like: `1. Question text (CO1 K2 - Comprehension)`
        """)

        train_pdf = st.file_uploader("Upload Training PDF", type=['pdf'], key='train_pdf')

        train_col1, train_col2 = st.columns(2)
        with train_col1:
            if st.button("Train Model", type="primary"):
                if train_pdf is not None:
                    with st.spinner("Training model..."):
                        model, vectorizer, train_df, accuracy, report = train_model(train_pdf)
                        # train_model returns all-None on failure and has
                        # already shown the error; keep the previous state.
                        if model is not None:
                            st.session_state.model = model
                            st.session_state.vectorizer = vectorizer
                            st.session_state.train_df = train_df
                            st.session_state.accuracy = accuracy
                            st.session_state.report = report
                            st.success(f"Model trained successfully! Accuracy: {accuracy:.2f}")
                else:
                    st.error("Please upload a training PDF")

        with train_col2:
            if st.session_state.model is not None:
                st.success("Model is trained and ready for predictions!")
                if st.session_state.train_df is not None:
                    st.write(f"Training data: {len(st.session_state.train_df)} questions")
                    if st.checkbox("Show training data sample"):
                        st.dataframe(st.session_state.train_df[['question_num', 'question_text', 'knowledge_level_code', 'knowledge_level']].head(10))

    with tab2:
        st.header("Predict Knowledge Level for a Single Question")

        # Check if model is trained
        if st.session_state.model is None:
            st.warning("Please train the model first in the 'Train Model' tab")
        else:
            question_text = st.text_area("Enter your question:", height=100)

            if st.button("Predict", key="predict_single"):
                if question_text:
                    with st.spinner("Predicting..."):
                        prediction = predict_cognitive_level(st.session_state.model, st.session_state.vectorizer, question_text)

                        # Three columns are kept for layout; the third stays empty.
                        col1, col2, _ = st.columns(3)
                        with col1:
                            st.metric("Predicted Level", f"{prediction['predicted_code']} - {prediction['predicted_level']}")
                        with col2:
                            st.metric("Confidence", f"{prediction['confidence']:.2f}")

                        # Knowledge level explanation
                        level_explanations = {
                            'K1': "Knowledge level question - Focuses on recall of facts, terms, or basic concepts",
                            'K2': "Comprehension level question - Tests understanding of meaning and interpretation",
                            'K3': "Application level question - Requires using knowledge in a new context or situation",
                            'K4': "Analysis level question - Involves breaking down information and understanding relationships",
                            'K5': "Evaluation level question - Requires making judgments based on criteria",
                            'K6': "Creation level question - Involves creating new ideas, products, or ways of viewing things"
                        }

                        st.info(level_explanations.get(prediction['predicted_code'], "Unknown level"))

                        # Suggestion for improvement if confidence is low
                        if prediction['confidence'] < 0.7:
                            st.warning("Low confidence prediction. Consider rewording the question for clearer classification.")
                else:
                    st.error("Please enter a question")

    with tab3:
        st.header("Predict Knowledge Levels from PDF")

        # Check if model is trained
        if st.session_state.model is None:
            st.warning("Please train the model first in the 'Train Model' tab")
        else:
            test_pdf = st.file_uploader("Upload PDF with questions to classify", type=['pdf'], key='test_pdf')

            if st.button("Predict from PDF", key="predict_pdf"):
                if test_pdf is not None:
                    with st.spinner("Extracting questions and predicting..."):
                        predictions_df = predict_from_pdf(st.session_state.model, st.session_state.vectorizer, test_pdf)

                        if not predictions_df.empty:
                            # Display results
                            st.dataframe(predictions_df)

                            # Download link for predictions
                            st.markdown(get_table_download_link(predictions_df, "predictions.csv", "Download predictions as CSV"), unsafe_allow_html=True)

                            # Summary statistics
                            st.subheader("Summary Statistics")
                            level_counts = predictions_df['predicted_code'].value_counts()
                            col1, col2 = st.columns(2)

                            with col1:
                                st.write("Distribution of predicted levels:")
                                st.dataframe(pd.DataFrame({
                                    'Level': level_counts.index,
                                    'Count': level_counts.values,
                                    'Percentage': (level_counts.values / level_counts.sum() * 100).round(1)
                                }))

                            with col2:
                                fig, ax = plt.subplots()
                                ax.pie(level_counts.values, labels=level_counts.index, autopct='%1.1f%%')
                                ax.set_title('Distribution of Predicted Knowledge Levels')
                                st.pyplot(fig)
                                # Already rendered; release the figure.
                                plt.close(fig)
                else:
                    st.error("Please upload a PDF")

    with tab4:
        st.header("Model Analysis")

        if st.session_state.model is None:
            st.warning("Please train the model first in the 'Train Model' tab")
        else:
            col1, col2 = st.columns(2)

            with col1:
                st.subheader("Training Accuracy")
                st.metric("Overall Accuracy", f"{st.session_state.accuracy:.2f}")

                st.subheader("Class-wise Performance")
                report_df = pd.DataFrame(st.session_state.report).T
                # The last three rows of the report are the summary rows, not classes.
                st.dataframe(report_df.iloc[:-3][['precision', 'recall', 'f1-score']].round(2))

            with col2:
                if st.session_state.train_df is not None:
                    st.subheader("Knowledge Level Distribution")
                    fig = plot_knowledge_level_distribution(st.session_state.train_df)
                    st.pyplot(fig)
                    plt.close(fig)

            st.subheader("Feature Importance")
            if st.session_state.model is not None and st.session_state.vectorizer is not None:
                fig = plot_feature_importance(st.session_state.model, st.session_state.vectorizer)
                st.pyplot(fig)
                plt.close(fig)

            # Display some example questions for each level
            st.subheader("Example Questions by Level")
            if st.session_state.train_df is not None:
                for level in sorted(st.session_state.train_df['knowledge_level_code'].unique()):
                    with st.expander(f"{level} - {knowledge_level_descriptions.get(level, 'Unknown')}"):
                        examples = st.session_state.train_df[st.session_state.train_df['knowledge_level_code'] == level]['question_text'].head(3).tolist()
                        for i, example in enumerate(examples, 1):
                            st.write(f"{i}. {example}")

# Script entry point: build the UI when app.py is run as a script
# (the launch cell of this notebook starts it with `streamlit run app.py`).
if __name__ == "__main__":
    main()

Writing app.py


In [None]:
# Fetch the value served at loca.lt's "mytunnelpassword" endpoint.
# NOTE(review): the recorded output is an IP address — presumably this VM's public IP,
# used as the password for the localtunnel URL; consider clearing it before sharing.
!curl https://loca.lt/mytunnelpassword

34.141.247.186

In [None]:
# Install localtunnel (globally, via npm) to expose the Streamlit app
!npm install -g localtunnel

# Run Streamlit in the background with its output redirected to /content/logs.txt,
# wait 5 seconds for the server to come up, then open a tunnel to port 8501.
# NOTE(review): assumes Streamlit listens on 8501 (no --server.port is passed) — confirm.
!streamlit run app.py &>/content/logs.txt & sleep 5 && lt --port 8501

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K
added 22 packages in 4s
[1G[0K⠧[1G[0K
[1G[0K⠧[1G[0K3 packages are looking for funding
[1G[0K⠧[1G[0K  run `npm fund` for details
[1G[0K⠧[1G[0Kyour url is: https://thick-olives-repeat.loca.lt
