In [None]:
import transformers
import shap

# Build a text-classification pipeline from the custom emotion model stored
# locally; the same directory supplies both the model and the tokenizer.
# return_all_scores=True asks for a score per label rather than only the top one.
# NOTE(review): recent transformers releases appear to flag return_all_scores
# as deprecated in favour of top_k -- confirm against the installed version.
pipeline_options = {
    "model": "./emotion_model",
    "tokenizer": "./emotion_model",
    "return_all_scores": True,
}
emotion_pipeline = transformers.pipeline("text-classification", **pipeline_options)

# Smoke test: classify one sentence and show the raw pipeline output
sample_text = "I feel absolutely wonderful today!"
prediction = emotion_pipeline(sample_text)
print("Prediction:", prediction)

# Use SHAP for explanations
# explainer = shap.Explainer(emotion_pipeline)

# # Sample inputs for SHAP explanations
# sample_inputs = [
#     "What a great day! I am so happy and full of joy.",
#     "I am scared to try new things, it makes me nervous.",
#     "I hate this so much, it's terrible!"
# ]

# # Compute SHAP values for the inputs
# shap_values = explainer(sample_inputs)

# # Select the class index for "joy" (replace with the actual index if different)
# joy_class_index = 1  # Adjust this based on your model's output

# # Visualize the SHAP explanation for the first input
# shap.plots.text(shap_values[0, :, joy_class_index])


In [None]:
import pandas as pd
from collections import Counter
import spacy
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Read the labelled phrases from disk
file_path = "text.csv"
data = pd.read_csv(file_path)

# Header names may carry stray whitespace; trim them so column lookups work
data.columns = data.columns.map(str.strip)

# spaCy's small English pipeline, used for part-of-speech tags and lemmas
nlp = spacy.load("en_core_web_sm")

# Function to extract POS tags from phrases
def extract_pos_tags(phrases):
    """
    Count lemmas of verbs, adjectives, adverbs, and nouns across phrases using spaCy.

    Args:
        phrases: Iterable of phrase strings (for example a pandas Series).
            Entries that are not strings -- such as the NaN pandas produces
            for an empty CSV cell -- are skipped instead of being parsed.

    Returns:
        dict: Maps each of "VERB", "ADJ", "ADV" and "NOUN" to a Counter of
        lemma -> number of occurrences. Tokens with any other POS tag are
        ignored.
    """
    pos_tags = {"VERB": Counter(), "ADJ": Counter(), "ADV": Counter(), "NOUN": Counter()}

    # Filter out missing / non-text entries up front; they cannot be tokenized.
    texts = (phrase for phrase in phrases if isinstance(phrase, str))

    # nlp.pipe streams the texts through the pipeline in batches, which is
    # considerably faster than one nlp() call per phrase on large groups.
    for doc in nlp.pipe(texts):
        for token in doc:
            counter = pos_tags.get(token.pos_)
            # Compare against None: an empty Counter is falsy but still valid.
            if counter is not None:
                counter[token.lemma_] += 1  # Use lemma for base form

    return pos_tags

def _pos_counts_for(label, phrases):
    """Announce the label being processed, then return its POS word counts."""
    print(f"Processing label: {label}")
    return extract_pos_tags(phrases)


# Bucket the phrase texts by label
grouped_data = data.groupby("label")["text"]

# One POS summary per label, keyed by the label value
label_pos_counts = {
    label: _pos_counts_for(label, phrases) for label, phrases in grouped_data
}


In [None]:

# Print, for every label, the most frequent words under each POS tag
TOP_WORDS_PER_POS = 25
for label in label_pos_counts:
    pos_counts = label_pos_counts[label]
    print(f"\nLabel: {label}")

    # One section per POS tag, most frequent word first
    for pos in pos_counts:
        print(f"  Most common {pos}s:")
        ranked = pos_counts[pos].most_common(TOP_WORDS_PER_POS)
        for word, count in ranked:
            print(f"    {word}: {count}")
    
    # Generate a word cloud for nouns as an example
    # wordcloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(pos_counts["NOUN"])
    # plt.figure(figsize=(10, 5))
    # plt.imshow(wordcloud, interpolation="bilinear")
    # plt.axis("off")
    # plt.title(f"Word Cloud for Nouns in Label: {label}")
    # plt.show()


In [None]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import re
from wordcloud import WordCloud

# Read the phrase dataset and trim whitespace from its header names
file_path = "text.csv"
data = pd.read_csv(file_path)
data.columns = data.columns.map(str.strip)

# How many phrases carry each label
label_counts = data["label"].value_counts()

# Tokenizer used when counting word frequencies for each label
def clean_and_tokenize(phrase):
    """Lowercase a phrase, strip punctuation, and split it into words.

    Args:
        phrase: Text to tokenize. Anything that is not a string (for example
            the NaN pandas uses for an empty CSV cell) is treated as empty.

    Returns:
        list[str]: The whitespace-separated tokens, or an empty list when
        there is nothing to tokenize.
    """
    if not isinstance(phrase, str):
        # Missing values arrive as float NaN, which has no .lower(); yield no tokens.
        return []
    # Remove every character that is neither a word character nor whitespace.
    phrase = re.sub(r"[^\w\s]", "", phrase.lower())
    return phrase.split()

# Start every label with an empty Counter so the keys follow label_counts' order.
word_frequencies = {label: Counter() for label in label_counts.index}

# Walk the text column one label group at a time instead of using iterrows():
# iterrows builds a Series per row (slow), and a row with a missing label would
# raise KeyError because value_counts() leaves NaN out of label_counts.
# groupby skips missing labels by default, and dropna() skips missing texts.
for label, phrases in data.groupby("label")["text"]:
    for phrase in phrases.dropna():
        word_frequencies[label].update(clean_and_tokenize(phrase))

# Per-label average of the "pos" column; None when the dataset has no such column
has_pos_column = "pos" in data.columns
pos_means = data.groupby("label")["pos"].mean() if has_pos_column else None

# Ten most frequent words per label, each list followed by a blank line
for label in word_frequencies:
    print(f"Most common words for label '{label}':")
    top_words = word_frequencies[label].most_common(10)
    for word, count in top_words:
        print(f"  {word}: {count}")
    print()

# Phrase count for each label
print("Number of phrases for each label:", label_counts, sep="\n")

# Per-label mean of the 'pos' column, shown only when it was computed
if pos_means is not None:
    print("\nMean of 'pos' values for each label:", pos_means, sep="\n")

# Generate word clouds for each label
for label, counter in word_frequencies.items():
    if not counter:
        # generate_from_frequencies raises ValueError when given no words,
        # which would abort the loop before the remaining labels are drawn.
        print(f"Skipping word cloud for label '{label}': no words to plot")
        continue

    wordcloud = WordCloud(
        width=800, height=400, background_color="white"
    ).generate_from_frequencies(counter)

    # Explicit figure/axes so each cloud is drawn on, and released with, its own figure.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(f"Word Cloud for Label: {label}")
    plt.show()
    plt.close(fig)  # avoid accumulating open figures when there are many labels
