In [None]:
pip install datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install datasets --upgrade



In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=8892fc60ef747b62e6df1cdc23bbe5576cb76b272b861cbcfcf91dae326cc15d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [41]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim.downloader as api
from gensim.models import Word2Vec
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# --- BOW Implementation ---
def bow_representation(corpus):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    return X.toarray(), vectorizer.get_feature_names_out()

# --- TF-IDF Implementation ---
def tfidf_representation(corpus):
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(corpus)
    return X_tfidf.toarray(), vectorizer.get_feature_names_out()

# --- GloVe Embedding Implementation ---
def load_glove_embeddings():
    glove_model = api.load("glove-wiki-gigaword-100")
    return glove_model

# --- Word2Vec Embedding Implementation ---
def train_word2vec(corpus):
    sentences = [sentence.split() for sentence in corpus]
    model = Word2Vec(sentences, min_count=1)
    return model

# --- Displaying embeddings and comparisons ---
def display_comparisons(corpus):
    # BOW Representation
    bow_array, bow_features = bow_representation(corpus)
    print("BOW Representation (word counts):")
    print(bow_array)
    print("Feature Names:", bow_features)

    # TF-IDF Representation
    tfidf_array, tfidf_features = tfidf_representation(corpus)
    print("\nTF-IDF Representation (word weights):")
    print(tfidf_array)
    print("Feature Names:", tfidf_features)

    # GloVe Embedding Example
    glove_model = load_glove_embeddings()
    glove_vector = glove_model['product']  # Example: Get GloVe vector for 'product'
    print("\nGloVe Vector for 'product':", glove_vector)

    # Word2Vec Example
    word2vec_model = train_word2vec(corpus)
    word2vec_vector = word2vec_model.wv['product']  # Example: Get Word2Vec vector for 'product'
    print("\nWord2Vec Vector for 'product':", word2vec_vector)

# Example corpus
corpus = [
    "I love this product",
    "This product is amazing",
    "I hate this product",
    "Not great, not bad"
]

# Show comparisons
display_comparisons(corpus)


BOW Representation (word counts):
[[0 0 0 0 0 1 0 1 1]
 [1 0 0 0 1 0 0 1 1]
 [0 0 0 1 0 0 0 1 1]
 [0 1 1 0 0 0 2 0 0]]
Feature Names: ['amazing' 'bad' 'great' 'hate' 'is' 'love' 'not' 'product' 'this']

TF-IDF Representation (word weights):
[[0.         0.         0.         0.         0.         0.74230628
  0.         0.47380449 0.47380449]
 [0.59603894 0.         0.         0.         0.59603894 0.
  0.         0.38044393 0.38044393]
 [0.         0.         0.         0.74230628 0.         0.
  0.         0.47380449 0.47380449]
 [0.         0.40824829 0.40824829 0.         0.         0.
  0.81649658 0.         0.        ]]
Feature Names: ['amazing' 'bad' 'great' 'hate' 'is' 'love' 'not' 'product' 'this']

GloVe Vector for 'product': [ 0.12804    0.34131    0.33106   -0.026678  -0.022675  -1.0228
  0.65186   -0.14204    0.29102    0.56137   -0.1294    -0.77794
 -0.014738  -0.0082412  0.19769    0.42299    0.64201    0.89195
  0.28199    0.038209  -0.066105  -0.39848   -0.025111   0.4

In [2]:
!pip install datasets transformers torch scikit-learn


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load dataset
dataset = load_dataset("HaltiaAI/Her-The-Movie-Samantha-and-Theodore-Dataset")

# Inspect a sample
print(dataset['train'][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

Theodore â¥ Samantha - CSV.csv:   0%|          | 0.00/42.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/306 [00:00<?, ? examples/s]

{'Speaker': None, 'Prompt': None, 'Speaker ': None, 'Response': None, 'start time': None, 'end time': None, '         notes ': None}


In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


In [9]:
from transformers import pipeline

# Load sentiment analysis pipeline
sentiment_pipe = pipeline("sentiment-analysis")

# Auto-generate labels for the 'Response' field
def add_sentiment_labels(example):
    response_text = example.get('Response')  # Safely get the 'Response' text
    if response_text and isinstance(response_text, str):  # Ensure it's a valid string
        result = sentiment_pipe(response_text)[0]  # Run sentiment analysis
        example['label'] = 1 if result['label'] == 'POSITIVE' else 0  # 1=Positive, 0=Negative
    else:
        example['label'] = None  # Default label for invalid responses
    return example



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [27]:
# Import necessary libraries
import pandas as pd
from transformers import pipeline
from datasets import load_dataset

# Step 1: Load the dataset
print("Loading dataset...")
dataset = load_dataset("HaltiaAI/Her-The-Movie-Samantha-and-Theodore-Dataset", split="train")

# Step 2: Inspect dataset structure
print("Dataset structure:")
print(dataset)

# Convert dataset to DataFrame and inspect columns
df = dataset.to_pandas()
print("Dataset columns:")
print(df.columns)

# Step 3: Clean column names to remove extra spaces
df.columns = df.columns.str.strip()

# Step 4: Load Hugging Face Sentiment Analysis Pipeline
print("Loading sentiment analysis pipeline...")
sentiment_pipeline = pipeline("sentiment-analysis", batch_size=32)

# Step 5: Define a function to analyze sentiment in batches
def analyze_sentiments(dialogues):
    non_empty_dialogues = [d for d in dialogues if isinstance(d, str) and len(d.strip()) > 0]
    # Apply sentiment analysis on non-empty dialogues
    results = sentiment_pipeline(non_empty_dialogues)
    sentiments = [result['label'] for result in results]

    # Create a list of sentiment results, matching the length of the original dataframe
    sentiment_results = []
    i = 0
    for d in dialogues:
        if isinstance(d, str) and len(d.strip()) > 0:
            sentiment_results.append(sentiments[i])
            i += 1
        else:
            sentiment_results.append("Unknown")  # Assign 'Unknown' or None for empty responses

    return sentiment_results

# Step 6: Perform Sentiment Analysis on the 'Prompt' and 'Response' columns separately
print("Performing sentiment analysis on 'Prompt' and 'Response' columns...")

# Analyze sentiment for 'Prompt' column
prompts = df['Prompt'].tolist()
df['Prompt_Sentiment'] = analyze_sentiments(prompts)

# Analyze sentiment for 'Response' column
responses = df['Response'].tolist()
df['Response_Sentiment'] = analyze_sentiments(responses)

# Step 7: Save the results to a new CSV file
output_file = "sentiment_analysis_results_separate.csv"
df.to_csv(output_file, index=False)
print(f"Sentiment analysis completed. Results saved to {output_file}.")

# Step 8: Display sentiment distribution for both 'Prompt' and 'Response' columns
print("Prompt Sentiment Distribution:")
print(df['Prompt_Sentiment'].value_counts())

print("Response Sentiment Distribution:")
print(df['Response_Sentiment'].value_counts())


Loading dataset...


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Dataset structure:
Dataset({
    features: ['Speaker', 'Prompt', 'Speaker ', 'Response', 'start time', 'end time', '         notes '],
    num_rows: 306
})
Dataset columns:
Index(['Speaker', 'Prompt', 'Speaker ', 'Response', 'start time', 'end time',
       '         notes '],
      dtype='object')
Loading sentiment analysis pipeline...
Performing sentiment analysis on 'Prompt' and 'Response' columns...
Sentiment analysis completed. Results saved to sentiment_analysis_results_separate.csv.
Prompt Sentiment Distribution:
Prompt_Sentiment
POSITIVE    139
NEGATIVE    130
Unknown      37
Name: count, dtype: int64
Response Sentiment Distribution:
Response_Sentiment
POSITIVE    148
NEGATIVE    135
Unknown      23
Name: count, dtype: int64


In [28]:
data = pd.read_csv("/content/sentiment_analysis_results_separate.csv")

# Step 2: Inspect the data
print("First 5 rows of the dataset:")
print(data.head(10))

First 5 rows of the dataset:
    Speaker                                             Prompt Speaker.1  \
0       NaN                                                NaN       NaN   
1  Theodore                                  [Presses button.]  Samantha   
2  Theodore                                            Oh, hi.  Samantha   
3  Theodore             I’m well. How is everything with\nyou?  Samantha   
4  Theodore  Yeah, it’s nice to meet you, too.\nWhat should...  Samantha   
5  Theodore              Really? Where did you get that\nname?  Samantha   
6  Theodore                                          How come?  Samantha   
7  Theodore                  When did you give it to yourself?  Samantha   
8  Theodore  You read a whole book in the second\nthat I as...  Samantha   
9  Theodore     Wow. Do you know what I’m thinking\nright now?  Samantha   

                                            Response start time end time  \
0                                                NaN      

In [31]:
pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=99a73b4e20c8fe7bc55776936dfae7516522d3eaddb01f1d228628fce9c31e7a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [34]:
pip install datasets



In [38]:
# Import Libraries
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import pipeline
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import re
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords
nltk.download('stopwords')

# --- Text Preprocessing ---
def preprocess_text(text):
    """Preprocess text by lowercasing, removing special characters, and stopwords."""
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return " ".join(filtered_tokens)

# --- Load Dataset ---
print("Loading dataset...")
dataset = load_dataset("xsum", split="test[:5]")  # Small subset for testing
articles = dataset['document']
reference_summaries = dataset['summary']

# Preprocess articles
print("Preprocessing articles...")
preprocessed_articles = [preprocess_text(article) for article in articles]

# --- Transformer-based Summarization ---
print("Generating summaries using Transformer (BART)...")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Generate summaries
generated_summaries = []
for article in preprocessed_articles:
    summary = summarizer(article, max_length=60, min_length=20, do_sample=False)[0]['summary_text']
    generated_summaries.append(summary)

# --- Evaluation Functions ---
def compute_rouge(predictions, references):
    """Compute ROUGE scores."""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, pred) for pred, ref in zip(predictions, references)]
    return scores

def compute_bleu(predictions, references):
    """Compute BLEU scores with smoothing."""
    smooth = SmoothingFunction().method1
    bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth)
                   for pred, ref in zip(predictions, references)]
    return bleu_scores

# --- Evaluation ---
print("Evaluating summaries...")

# Compute ROUGE
rouge_scores = compute_rouge(generated_summaries, reference_summaries)
rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

# Compute BLEU
bleu_scores = compute_bleu(generated_summaries, reference_summaries)
mean_bleu = np.mean(bleu_scores)

# --- Display Results ---
print("\n--- Evaluation Results ---")
print(f"ROUGE-1 F1 Score: {rouge1:.4f}")
print(f"ROUGE-2 F1 Score: {rouge2:.4f}")
print(f"ROUGE-L F1 Score: {rougeL:.4f}")
print(f"BLEU Score (with smoothing): {mean_bleu:.4f}")

# --- Display Example Summaries ---
print("\n--- Sample Results ---")
for i in range(len(articles)):
    print(f"\nOriginal Article: {articles[i][:200]}...")
    print(f"Reference Summary: {reference_summaries[i]}")
    print(f"Generated Summary: {generated_summaries[i]}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Loading dataset...
Preprocessing articles...
Generating summaries using Transformer (BART)...


Your max_length is set to 60, but your input_length is only 37. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 60, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


Evaluating summaries...

--- Evaluation Results ---
ROUGE-1 F1 Score: 0.1386
ROUGE-2 F1 Score: 0.0209
ROUGE-L F1 Score: 0.0920
BLEU Score (with smoothing): 0.0078

--- Sample Results ---

Original Article: Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing...
Reference Summary: There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.
Generated Summary:  prison link cymru referrals said exoffenders living rough year finding suitable accommodation. charity claim investment housing would cheaper jailing homeless repeat offenders welsh government said people ever getting help address housing problems.

Original Article: Officers searched properties in the Waterfront Park and Colonsay View areas of the city on Wednesday.
Detectives said three firearms, ammunition and a five-figure sum of money were recove