In [1]:
# Download the spaCy "en_core_web_md" English model; a later cell loads it with spacy.load("en_core_web_md").
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
from transformers import pipeline
import spacy
import polars as pl
from pathlib import Path

# Put spaCy on the GPU. This is called before spacy.load() so the model below
# is created on that device.
spacy.require_gpu()

# Hugging Face sentiment classifier; inputs passed as a list are scored in
# batches of 8.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="LiYuan/amazon-review-sentiment-analysis",
    batch_size=8,
)

# spaCy English pipeline, used in this notebook to split text into sentences.
nlp = spacy.load("en_core_web_md")


def extract_negative_sentences(text: str, min_score: float = 0.75):
    """Extract full sentences with negative sentiment.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp`` and every sentence is classified by ``sentiment_pipeline``. A
    sentence is kept when its predicted label is "1 star" or "2 stars" and
    the prediction score is strictly greater than ``min_score``.

    Args:
        text: Raw review or tip text. ``None``, empty and whitespace-only
            values are accepted and yield an empty result.
        min_score: Minimum classifier score (exclusive) required to keep a
            sentence. Defaults to 0.75.

    Returns:
        A list of the negative sentences, in the order they appear in
        ``text``. Empty when there is nothing to analyse or nothing negative
        was found.
    """
    # Nothing to segment: return early without invoking either model.
    if text is None or not text.strip():
        return []

    sentences = [sent.text for sent in nlp(text).sents]
    if not sentences:
        # Not expected for reviews; tips are much shorter and may yield nothing.
        return []

    # truncation=True clips each input to the model's maximum length, so a
    # very long run-on "sentence" cannot exceed what the model accepts.
    sentiments = sentiment_pipeline(sentences, truncation=True)

    negative_labels = {"1 star", "2 stars"}
    return [
        sentence
        for sentence, sentiment in zip(sentences, sentiments)
        if sentiment["label"] in negative_labels and sentiment["score"] > min_score
    ]


def apply_absa_extraction(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` with an 'absa' column of negative sentences.

    ``extract_negative_sentences`` is applied element-wise to the "text"
    column; the result is stored as a list-of-strings column named "absa".
    """
    negative_sentences = pl.col("text").map_elements(
        extract_negative_sentences,
        return_dtype=pl.List(pl.Utf8()),
    )
    return df.with_columns(negative_sentences.alias("absa"))


# Input parquet files for task 2.
task2_review_path = Path("/kaggle/input/ds3010-task2/task2/task_2_review.parquet")
task2_tip_path = Path("/kaggle/input/ds3010-task2/task2/task_2_tip.parquet")

# Load both datasets before any processing starts.
review_df = pl.read_parquet(task2_review_path)
tip_df = pl.read_parquet(task2_tip_path)

# Reviews: add the 'absa' column and save the result.
# This runs on the whole frame and takes a long time; slice the frame first
# if only a quick trial is needed.
review_df = review_df.pipe(apply_absa_extraction)
review_df.write_parquet("/kaggle/working/task2_absa_review.parquet")

# Tips: same processing, written to a separate file.
tip_df = tip_df.pipe(apply_absa_extraction)
tip_df.write_parquet("/kaggle/working/task2_absa_tip.parquet")


config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
