In [None]:
# ---------------- Imports ----------------
from lit_nlp import notebook
from lit_nlp.api import dataset as lit_dataset
from lit_nlp.api import types as lit_types
from lit_nlp.api import model as lit_model
from lit_nlp.components import metrics as lit_metrics
from lime.lime_text import LimeTextExplainer
import numpy as np
import pandas as pd

# ---------------- Dataset Wrapper ----------------
class FocusDFDataset(lit_dataset.Dataset):
    """LIT dataset view over ``focus_df``: text, day, predicted percent, risk.

    Every DataFrame row becomes one example dict whose keys match the field
    names returned by :meth:`spec`.
    """

    def __init__(self, df: pd.DataFrame):
        """Build the example list from the rows of ``df``.

        Args:
            df: Frame providing the ``Content``, ``Day``,
                ``Predicted_Percent`` and ``Failure_Risk`` columns.
        """
        # "text" is passed through untouched; the remaining values are coerced
        # to str / float as they are copied out of the row.
        self._examples = [
            {
                "text": record["Content"],
                "day": str(record["Day"]),
                "predicted_percent": float(record["Predicted_Percent"]),
                "failure_risk": str(record["Failure_Risk"]),
            }
            for _, record in df.iterrows()
        ]

    @property
    def examples(self):
        """Example dicts built in ``__init__``, in DataFrame row order."""
        return self._examples

    def spec(self):
        """Return the LIT field type for each key of an example."""
        return dict(
            text=lit_types.TextSegment(),
            day=lit_types.TextSegment(),
            predicted_percent=lit_types.RegressionScore(),
            failure_risk=lit_types.CategoryLabel(vocab=["Safe", "At Risk"]),
        )


# ---------------- LIT Model Wrapper w/ LIME ----------------
class LITLIMEModel(lit_model.Model):
    """Lookup-backed LIT model exposing "Safe" / "At Risk" scores, plus LIME.

    No network is evaluated here: ``predict`` looks the input text up in the
    ``Content`` column of ``df`` and turns that row's ``Predicted_Percent``
    into a 0-1 "At Risk" score. Text that does not match any ``Content``
    value exactly scores 0.0.

    NOTE(review): LIME scores perturbed variants of the text. With an
    exact-match lookup those variants fall back to 0.0 unless they happen to
    equal some ``Content`` value -- confirm the resulting explanations are
    meaningful for this use.
    """

    # Column order of the probability matrix handed to LIME. It must stay in
    # step with the ``class_names`` given to the explainer.
    _CLASS_NAMES = ("Safe", "At Risk")

    def __init__(self, df):
        """Store the frame and create the LIME text explainer.

        Args:
            df: DataFrame with ``Content`` and ``Predicted_Percent`` columns.
                It is read on every ``predict`` call, so later edits to the
                frame are picked up.
        """
        self.df = df
        self.explainer = LimeTextExplainer(class_names=list(self._CLASS_NAMES))

    def predict(self, inputs):
        """Return one ``{"Safe": ..., "At Risk": ...}`` dict per input.

        Args:
            inputs: Iterable of dicts, each with a ``"text"`` entry.

        Returns:
            A list of dicts. ``"At Risk"`` is ``Predicted_Percent / 100`` of
            the first row whose ``Content`` equals the text (0.0 when no row
            matches); ``"Safe"`` is its complement.
        """
        # Build the text -> percent lookup once per call instead of filtering
        # the whole frame for every example; explain_instance() passes the
        # entire batch of perturbed texts through here in one go.
        # setdefault keeps the first occurrence, matching the previous
        # "first matching row wins" behaviour for duplicated Content.
        percent_by_text = {}
        for content, percent in zip(self.df["Content"], self.df["Predicted_Percent"]):
            percent_by_text.setdefault(content, percent)

        outputs = []
        for ex in inputs:
            text = ex["text"]
            if text in percent_by_text:
                risk_prob = percent_by_text[text] / 100
            else:
                risk_prob = 0.0
            outputs.append({
                "Safe": 1 - risk_prob,
                "At Risk": risk_prob
            })
        return outputs

    def input_spec(self):
        """The model consumes a single text field."""
        return {"text": lit_types.TextSegment()}

    def output_spec(self):
        """Both scores are exposed to LIT as regression scores."""
        return {
            "Safe": lit_types.RegressionScore(),
            "At Risk": lit_types.RegressionScore()
        }

    def explain_instance(self, text):
        """Get the top LIME tokens for a single text instance.

        Args:
            text: The text to explain.

        Returns:
            ``exp.as_list()`` of the LIME explanation, limited to
            10 features.
        """
        def classifier_fn(texts):
            # LIME expects an (n_samples, n_classes) array; pick the columns
            # by name so their order follows _CLASS_NAMES explicitly.
            predictions = self.predict([{"text": t} for t in texts])
            return np.array(
                [[pred[name] for name in self._CLASS_NAMES] for pred in predictions]
            )

        exp = self.explainer.explain_instance(
            text_instance=text,
            classifier_fn=classifier_fn,
            num_features=10
        )
        return exp.as_list()


# ---------------- Compute failure risk from focus_df ----------------
# Cut-off applied to Predicted_Percent when labelling a row.
FAILURE_THRESHOLD_RATIO = 50
# Strictly-greater comparison: a row sitting exactly on the threshold (or one
# whose percent is NaN) is labelled "Safe".
focus_df["Failure_Risk"] = np.where(
    focus_df["Predicted_Percent"].gt(FAILURE_THRESHOLD_RATIO), "At Risk", "Safe"
)


# ---------------- Launch LIT ----------------
# Single source of truth for the port, so the URL printed below cannot drift
# from the port the widget is actually started with.
LIT_PORT = 9004

regression_metrics = lit_metrics.RegressionMetrics()

# One model and one dataset, both backed by the same focus_df frame.
widget = notebook.LitWidget(
    models={"focus_df_lime": LITLIMEModel(focus_df)},
    datasets={"focus_df_data": FocusDFDataset(focus_df)},
    metrics={"regression": regression_metrics},
    port=LIT_PORT
)

widget.render(height=900)
# The link emoji is written as an escape: the literal character had been
# corrupted into mojibake by an encoding round-trip.
print(f"\U0001F517 Access LIT in your browser at: http://localhost:{LIT_PORT}")


In [None]:
# Gather the already-fitted pipeline pieces, then hand them to the wrapper in
# a single call.
# NOTE(review): FAILURE_THRESHOLD_RATIO is a percentage (50). The
# TextToBiLSTMModel definition visible in this notebook compares the
# threshold against the inverse-scaled prediction itself -- confirm the two
# are in the same units.
wrapper_config = dict(
    keras_model=model,                  # trained recurrent Keras model
    sentence_model=sentence_model,      # sentence-embedding encoder
    vectorizer=vectorizer,              # count vectorizer feeding the LDA model
    lda_model=lda_final,                # fitted LDA topic model
    scaler_embed=scaler_embed,          # scaler for the input features
    scaler_y=scaler_y,                  # scaler fitted on the targets
    model_feature_dim=X_seq.shape[2],   # feature width of the model input
    failure_threshold=FAILURE_THRESHOLD_RATIO,
)
model_wrapper = TextToBiLSTMModel(**wrapper_config)


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from lit_nlp.components import lime_explainer
from lit_nlp.api import dataset as lit_dataset
from lit_nlp.api import types as lit_types

# ---------------- Dataset Wrapper for focus_df ----------------
class FocusDFDataset(lit_dataset.Dataset):
    """LIT dataset over ``focus_df`` with actual and predicted failure figures.

    Each row is exposed as one example holding the text, the day, the actual
    and predicted failure counts and percentages, and the risk label.
    """

    @staticmethod
    def _example_from_row(row):
        """Convert one DataFrame row into an example dict."""
        return {
            "text": row["Content"],
            "day": str(row["Day"]),
            "actual_failures": float(row["Actual_Failures"]),
            "predicted_failures": float(row["Predicted_Failures"]),
            "actual_percent": float(row["Actual_Percent"]),
            "predicted_percent": float(row["Predicted_Percent"]),
            "failure_risk": str(row["Failure_Risk"]),
        }

    def __init__(self, df: pd.DataFrame):
        """Build the example list from the rows of ``df``, in row order."""
        self._examples = []
        for _, row in df.iterrows():
            self._examples.append(self._example_from_row(row))

    @property
    def examples(self):
        """Example dicts built in ``__init__``."""
        return self._examples

    def spec(self):
        """Return the LIT field type for each key of an example."""
        numeric_fields = (
            "actual_failures",
            "predicted_failures",
            "actual_percent",
            "predicted_percent",
        )
        fields = {
            "text": lit_types.TextSegment(),
            "day": lit_types.TextSegment(),
        }
        for field_name in numeric_fields:
            fields[field_name] = lit_types.RegressionScore()
        fields["failure_risk"] = lit_types.CategoryLabel(vocab=["Safe", "At Risk"])
        return fields

# Create dataset instance
focus_dataset = FocusDFDataset(focus_df)

# ---------------- Model under explanation ----------------
# Reuse the wrapper built in an earlier cell.
lit_model_wrapper = model_wrapper

# ---------------- Example to explain ----------------
# Addressed by position in the dataset's example list.
instance_idx = 95
instance_95 = focus_dataset.examples[instance_idx]

# ---------------- LIME via LIT ----------------
# run() takes a list of examples; pass the single example as a one-element
# list and read its result back from position 0.
lime = lime_explainer.LIME()
lime_results = lime.run(
    [instance_95],
    model=lit_model_wrapper,
    dataset=focus_dataset,
)
result_95 = lime_results[0]

# ---------------- Token-level salience ----------------
# The "text" entry carries the tokens and one salience value per token.
token_salience_obj = result_95['text']
tokens, salience = token_salience_obj.tokens, token_salience_obj.salience

# ---------------- Aggregate salience for repeated tokens ----------------
# One row per distinct token; a token that occurs several times is
# represented by the mean of its salience values.
df_plot = pd.DataFrame({"Token": tokens, "Importance": salience})
df_plot = df_plot.groupby("Token").mean().reset_index()

# ---------------- Get top positive and negative tokens ----------------
TOP_K = 10
# Split by sign *before* ranking. Ranking the unfiltered frame put the same
# token into both halves whenever there were fewer than 2 * TOP_K distinct
# tokens (duplicate rows in the chart), and let negative-salience tokens show
# up in the "positive" list and vice versa.
top_positive = df_plot[df_plot["Importance"] > 0].nlargest(TOP_K, "Importance")
top_negative = df_plot[df_plot["Importance"] < 0].nsmallest(TOP_K, "Importance")
df_top = pd.concat([top_positive, top_negative])
df_top_sorted = df_top.sort_values("Importance")

# ---------------- Interactive Plot ----------------
# Horizontal bars, sorted from most negative to most positive importance.
fig = px.bar(
    df_top_sorted,
    x="Importance",
    y="Token",
    orientation='h',
    color="Importance",
    color_continuous_scale="RdBu",
    title="Top Positive & Negative Tokens  (LIME-based)",
    labels={"Importance": "LIME Importance", "Token": "Tokens"},
    hover_data={"Importance": True, "Token": True}
)

fig.update_layout(
    xaxis_title="LIME Importance (Effect on Prediction)",
    yaxis_title="Tokens",
    template="plotly_white",
    height=600,
    margin=dict(l=150, r=50, t=50, b=50)
)

fig.show()

# ---------------- Optional: Print top tokens ----------------
# Reuse the frames ranked above so the printed lists always agree with the
# chart.
top_positive_tokens = top_positive["Token"].tolist()
top_negative_tokens = top_negative["Token"].tolist()

print("Example 95 Text:\n", instance_95['text'], "\n")
print("Top Positive Tokens:", top_positive_tokens)
print("Top Negative Tokens:", top_negative_tokens)


In [None]:
# ---------------- Imports ----------------
from lit_nlp import notebook
from lit_nlp.api import dataset as lit_dataset
from lit_nlp.api import types as lit_types
from lit_nlp.components import metrics as lit_metrics
from lit_nlp.components import lime_explainer

import numpy as np
import pandas as pd

# ---------------- Compute additional columns ----------------
# Convert failures to percentage of total count
# NOTE(review): rows with Total_Count == 0 divide by zero here -- confirm the
# upstream data rules that out.
focus_df['Actual_Percent'] = focus_df['Actual_Failures'].div(focus_df['Total_Count']).mul(100)
focus_df['Predicted_Percent'] = focus_df['Predicted_Failures'].div(focus_df['Total_Count']).mul(100)

# Label a row "At Risk" when its predicted percentage is strictly above the
# threshold; everything else is "Safe".
FAILURE_THRESHOLD_RATIO = 50  # percent
focus_df['Failure_Risk'] = np.where(
    focus_df['Predicted_Percent'].gt(FAILURE_THRESHOLD_RATIO),
    'At Risk',
    'Safe',
)

# ---------------- Dataset Wrapper ----------------
class FocusDFDataset(lit_dataset.Dataset):
    """LIT dataset over ``focus_df`` that carries ``true_score`` per example.

    ``true_score`` is taken from ``Actual_Failures``; the model wrapper in
    this notebook reads it back from each input in ``predict``.
    """

    # (example key, source column) pairs for the fields coerced with float(),
    # in the order they appear in each example.
    _FLOAT_COLUMNS = (
        ("true_score", "Actual_Failures"),
        ("predicted_score", "Predicted_Failures"),
        ("actual_percent", "Actual_Percent"),
        ("predicted_percent", "Predicted_Percent"),
    )

    def __init__(self, df: pd.DataFrame):
        """Build the example list from the rows of ``df``, in row order."""
        self._examples = []
        for _, row in df.iterrows():
            example = {"text": row["Content"], "day": str(row["Day"])}
            for key, column in self._FLOAT_COLUMNS:
                example[key] = float(row[column])
            example["failure_risk"] = str(row["Failure_Risk"])
            self._examples.append(example)

    @property
    def examples(self):
        """Example dicts built in ``__init__``."""
        return self._examples

    def spec(self):
        """Return the LIT field type for each key of an example."""
        fields = {
            "text": lit_types.TextSegment(),
            "day": lit_types.TextSegment(),
        }
        for key, _ in self._FLOAT_COLUMNS:
            fields[key] = lit_types.RegressionScore()
        fields["failure_risk"] = lit_types.CategoryLabel(vocab=["Safe", "At Risk"])
        return fields

# Wrap focus_df for LIT. This runs after the Actual_Percent,
# Predicted_Percent and Failure_Risk columns have been added above, which the
# dataset constructor reads.
focus_dataset = FocusDFDataset(focus_df)

# ---------------- Model Wrapper ----------------
class TextToBiLSTMModel:
    """Adapter that feeds raw text through the feature pipeline into a Keras model.

    For every input text the wrapper concatenates a sentence embedding with an
    LDA topic vector, scales the result, pads or trims it to the feature width
    the network expects, repeats it along a time axis, and maps the network
    output back through ``scaler_y.inverse_transform``.

    NOTE(review): this class does not derive from ``lit_model.Model`` and
    defines no ``input_spec`` / ``output_spec`` -- confirm that the LIT
    components it is handed to accept that.
    """

    def __init__(self, keras_model, sentence_model, vectorizer, lda_model,
                 scaler_embed, scaler_y, model_feature_dim,
                 failure_threshold=3.0, seq_len=3):
        """Store the pipeline components.

        Args:
            keras_model: Object exposing ``predict(X, verbose=0)``.
            sentence_model: Object exposing ``encode(list_of_texts)``.
            vectorizer: Object exposing ``transform(list_of_texts)``.
            lda_model: Object exposing ``transform(counts)``.
            scaler_embed: Object exposing ``transform(features)``; applied to
                the concatenated embedding + topic vector.
            scaler_y: Object exposing ``inverse_transform(column)``.
            model_feature_dim: Feature width of the model input.
            failure_threshold: Predictions strictly above this value are
                labelled "At Risk"; it is compared with the inverse-scaled
                prediction.
            seq_len: Number of time steps the feature vector is repeated for.
                Defaults to 3, the length that used to be hard-coded.
        """
        self.model = keras_model
        self.sentence_model = sentence_model
        self.vectorizer = vectorizer
        self.lda_model = lda_model
        self.scaler_embed = scaler_embed
        self.scaler_y = scaler_y
        self.model_feature_dim = model_feature_dim
        self.failure_threshold = failure_threshold
        self.seq_len = seq_len

    def text_to_combined_vector(self, text_list):
        """Return sentence embeddings and LDA topic vectors stacked column-wise."""
        embed_vecs = self.sentence_model.encode(text_list)
        counts = self.vectorizer.transform(text_list)
        lda_vecs = self.lda_model.transform(counts)
        return np.hstack([embed_vecs, lda_vecs])

    def _fit_feature_dim(self, features):
        """Zero-pad or truncate the columns of ``features`` to ``model_feature_dim``."""
        missing = self.model_feature_dim - features.shape[1]
        if missing > 0:
            padding = np.zeros((features.shape[0], missing))
            return np.hstack([features, padding])
        return features[:, :self.model_feature_dim]

    def predict(self, inputs):
        """Score every input example.

        Args:
            inputs: Iterable of dicts, each with ``"text"`` and
                ``"true_score"`` entries.

        Returns:
            A list with one dict per input: ``predicted_score`` (float),
            ``true_score`` (echoed from the input as float) and
            ``failure_risk`` ("At Risk" or "Safe").

        Raises:
            KeyError: If an input lacks ``"text"`` or ``"true_score"``.
        """
        # Materialise once: the examples are walked twice below (texts, then
        # labels). A one-shot iterable would otherwise be exhausted by the
        # first pass and silently produce no outputs.
        inputs = list(inputs)
        if not inputs:
            # Nothing to score; avoid pushing an empty batch through the
            # encoders and the network.
            return []

        texts = [ex["text"] for ex in inputs]
        X_scaled = np.asarray(
            self.scaler_embed.transform(self.text_to_combined_vector(texts))
        )
        X_final = self._fit_feature_dim(X_scaled)

        # (n, d) -> (n, seq_len, d): the same feature vector at every step.
        X_seq_input = np.tile(
            X_final[:, np.newaxis, :], (1, self.seq_len, 1)
        ).astype(np.float32)

        # Model prediction, mapped back through the target scaler.
        preds_scaled = self.model.predict(X_seq_input, verbose=0).flatten()
        preds_actual = self.scaler_y.inverse_transform(
            preds_scaled.reshape(-1, 1)
        ).flatten()

        return [
            {
                "predicted_score": float(p),
                "true_score": float(ex["true_score"]),
                "failure_risk": "At Risk" if p > self.failure_threshold else "Safe",
            }
            for p, ex in zip(preds_actual, inputs)
        ]

# ---------------- Initialize model wrapper ----------------
# Collect the fitted pipeline pieces, then build the wrapper in one call.
lit_wrapper_kwargs = dict(
    keras_model=model,
    sentence_model=sentence_model,
    vectorizer=vectorizer,
    lda_model=lda_final,
    scaler_embed=scaler_embed,
    scaler_y=scaler_y,
    model_feature_dim=X_seq.shape[2],  # feature width of the model input
    failure_threshold=3.0,             # predictions above this are "At Risk"
)
lit_model_wrapper = TextToBiLSTMModel(**lit_wrapper_kwargs)

# ---------------- Metrics ----------------
# Single source of truth for the port, so the URL printed below cannot drift
# from the port the widget is actually started with.
LIT_PORT = 9004

regression_metrics = lit_metrics.RegressionMetrics()  # optional

# ---------------- Launch LIT ----------------
# NOTE(review): the TextToBiLSTMModel class defined in this cell has no
# input_spec() / output_spec() -- confirm LitWidget accepts it as a model.
widget = notebook.LitWidget(
    models={"focus_df_lime": lit_model_wrapper},
    datasets={"focus_df_data": focus_dataset},
    metrics={"regression": regression_metrics},
    port=LIT_PORT
)

widget.render(height=900)
# The link emoji is written as an escape: the literal character had been
# corrupted into mojibake by an encoding round-trip.
print(f"\U0001F517 Access LIT in your browser at: http://localhost:{LIT_PORT}")

# ---------------- LIME Explainer for a single instance ----------------
# Example to explain, addressed by position in the dataset's example list.
instance_95 = focus_dataset.examples[95]

# run() takes a list of examples; the single result is read from position 0.
lime = lime_explainer.LIME()
lime_results = lime.run([instance_95], model=lit_model_wrapper, dataset=focus_dataset)
result_95 = lime_results[0]

# ---------------- Extract token-level salience ----------------
tokens = result_95['text'].tokens
salience = result_95['text'].salience

# ---------------- Aggregate repeated tokens ----------------
# One row per distinct token, holding the mean of its salience values.
df_plot = pd.DataFrame({"Token": tokens, "Importance": salience})
df_plot = df_plot.groupby("Token").mean().reset_index()

# ---------------- Top positive & negative tokens ----------------
TOP_K = 10
# Split by sign *before* ranking. Ranking the unfiltered frame put the same
# token into both halves whenever there were fewer than 2 * TOP_K distinct
# tokens, and let negative-salience tokens show up in the "positive" list and
# vice versa.
top_positive = df_plot[df_plot["Importance"] > 0].nlargest(TOP_K, "Importance")
top_negative = df_plot[df_plot["Importance"] < 0].nsmallest(TOP_K, "Importance")
df_top = pd.concat([top_positive, top_negative])
df_top_sorted = df_top.sort_values("Importance")

# ---------------- Optional: Print top tokens ----------------
# Reuse the frames ranked above instead of ranking a second time.
top_positive_tokens = top_positive["Token"].tolist()
top_negative_tokens = top_negative["Token"].tolist()

print("Example 95 Text:\n", instance_95['text'], "\n")
print("Top Positive Tokens:", top_positive_tokens)
print("Top Negative Tokens:", top_negative_tokens)


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from lit_nlp.components import lime_explainer

# ---------------- Use your corrected model wrapper ----------------
# Reuse the wrapper built in an earlier cell.
# NOTE(review): the notebook refers to this wrapper both as TextToBiLSTMModel
# and as TextToRNN_GRUModel -- confirm which class model_wrapper is expected
# to be.
lit_model_wrapper = model_wrapper

# ---------------- Example to explain ----------------
# Taken from the LIT dataset wrapper (which exposes `examples`) rather than
# from the raw DataFrame.
instance_95 = focus_dataset.examples[95]

# ---------------- LIME via LIT ----------------
# run() takes a list of examples; pass the single example as a one-element
# list and read its result back from position 0.
lime = lime_explainer.LIME()
lime_results = lime.run(
    [instance_95],
    model=lit_model_wrapper,
    dataset=focus_dataset,
)
result_95 = lime_results[0]

# ---------------- Token-level salience ----------------
# The "text" entry carries the tokens and one salience value per token.
token_salience_obj = result_95['text']
tokens, salience = token_salience_obj.tokens, token_salience_obj.salience

# ---------------- Aggregate salience for repeated tokens ----------------
# One row per distinct token; a token that occurs several times is
# represented by the mean of its salience values.
df_plot = pd.DataFrame({"Token": tokens, "Importance": salience})
df_plot = df_plot.groupby("Token").mean().reset_index()

# ---------------- Get top positive and negative tokens ----------------
TOP_K = 10
# Split by sign *before* ranking. Ranking the unfiltered frame put the same
# token into both halves whenever there were fewer than 2 * TOP_K distinct
# tokens (duplicate rows in the chart), and let negative-salience tokens show
# up in the "positive" list and vice versa.
top_positive = df_plot[df_plot["Importance"] > 0].nlargest(TOP_K, "Importance")
top_negative = df_plot[df_plot["Importance"] < 0].nsmallest(TOP_K, "Importance")
df_top = pd.concat([top_positive, top_negative])
df_top_sorted = df_top.sort_values("Importance")

# ---------------- Interactive Plot ----------------
# Horizontal bars, sorted from most negative to most positive importance.
fig = px.bar(
    df_top_sorted,
    x="Importance",
    y="Token",
    orientation='h',
    color="Importance",
    color_continuous_scale="RdBu",
    title="Top Positive & Negative Tokens  (LIME-based)",
    labels={"Importance": "LIME Importance", "Token": "Tokens"},
    hover_data={"Importance": True, "Token": True}
)

fig.update_layout(
    xaxis_title="LIME Importance (Effect on Prediction)",
    yaxis_title="Tokens",
    template="plotly_white",
    height=600,
    margin=dict(l=150, r=50, t=50, b=50)
)

fig.show()

# ---------------- Optional: Print top tokens ----------------
# Reuse the frames ranked above so the printed lists always agree with the
# chart.
top_positive_tokens = top_positive["Token"].tolist()
top_negative_tokens = top_negative["Token"].tolist()

print("Example 95 Text:\n", instance_95['text'], "\n")
print("Top Positive Tokens:", top_positive_tokens)
print("Top Negative Tokens:", top_negative_tokens)
