In [1]:
!mkdir -p /kaggle/working/synthetic_nlp/generator
!mkdir -p /kaggle/working/synthetic_nlp/data/synthetic

In [2]:
!touch /kaggle/working/synthetic_nlp/generator/__init__.py

In [3]:
%%writefile /kaggle/working/synthetic_nlp/generator/lexicon.py
SUBJECTS = ["I", "We", "They", "He", "She"]

OBJECTS = [
    "this movie",
    "the food",
    "the service",
    "the product",
    "the experience"
]

POSITIVE_VERBS = ["like", "love", "enjoy"]
NEGATIVE_VERBS = ["hate", "dislike", "regret"]

ADVERBS = {
    "": 1.0,
    "really": 1.2,
    "very": 1.3,
    "absolutely": 1.5,
    "hardly": -0.7,
    "barely": -0.6
}

Writing /kaggle/working/synthetic_nlp/generator/lexicon.py


In [4]:
%%writefile /kaggle/working/synthetic_nlp/generator/grammar.py
TEMPLATE = "{subject} {adverb} {verb} {object}"

Writing /kaggle/working/synthetic_nlp/generator/grammar.py


In [5]:
%%writefile /kaggle/working/synthetic_nlp/generator/labeler.py
def compute_sentiment(verb_sentiment, adverb_weight):
    return 1 if verb_sentiment * adverb_weight > 0 else 0

Writing /kaggle/working/synthetic_nlp/generator/labeler.py


In [6]:
%%writefile /kaggle/working/synthetic_nlp/generator/sentence_build.py
import random
from .lexicon import SUBJECTS, OBJECTS, POSITIVE_VERBS, NEGATIVE_VERBS, ADVERBS
from .grammar import TEMPLATE
from .labeler import compute_sentiment

def generate_sentence():
    subject = random.choice(SUBJECTS)
    obj = random.choice(OBJECTS)

    is_positive = random.choice([True, False])
    if is_positive:
        verb = random.choice(POSITIVE_VERBS)
        vs = 1
    else:
        verb = random.choice(NEGATIVE_VERBS)
        vs = -1

    adv = random.choice(list(ADVERBS.keys()))
    aw = ADVERBS[adv]

    sentence = TEMPLATE.format(
        subject=subject,
        adverb=adv,
        verb=verb,
        object=obj
    ).replace("  ", " ").strip()

    label = compute_sentiment(vs, aw)
    return sentence, label

Writing /kaggle/working/synthetic_nlp/generator/sentence_build.py


In [7]:
!ls /kaggle/working/synthetic_nlp/generator

grammar.py  __init__.py  labeler.py  lexicon.py  sentence_build.py


In [8]:
import sys
sys.path.append("/kaggle/working/synthetic_nlp")

from generator.sentence_build import generate_sentence
generate_sentence()

('They barely love the food', 0)

In [9]:
import os

save_path = "/kaggle/working/synthetic_nlp/data/synthetic/sentiment.csv"

with open(save_path, "w", encoding="utf-8") as f:
    f.write("text,label\n")
    for _ in range(2000):
        s, l = generate_sentence()
        f.write(f"\"{s}\",{l}\n")

print("Dataset generated at:", save_path)

Dataset generated at: /kaggle/working/synthetic_nlp/data/synthetic/sentiment.csv


In [10]:
import pandas as pd

df = pd.read_csv("/kaggle/working/synthetic_nlp/data/synthetic/sentiment.csv")
print("Total samples:", len(df))
df.head(10)

Total samples: 2000


Unnamed: 0,text,label
0,He barely love the product,0
1,We really regret the food,0
2,She really enjoy the product,1
3,They really regret the experience,0
4,I enjoy the product,1
5,We dislike the service,0
6,She really love the product,1
7,We barely enjoy the service,0
8,They absolutely enjoy the experience,1
9,She really hate the service,0
