In [1]:
!mkdir -p /kaggle/working/synthetic_nlp/generator
!mkdir -p /kaggle/working/synthetic_nlp/data/synthetic

In [2]:
!touch /kaggle/working/synthetic_nlp/generator/__init__.py

In [3]:
%%writefile /kaggle/working/synthetic_nlp/generator/lexicon.py
SUBJECTS = ["I", "We", "They", "He", "She"]

OBJECTS = [
    "this movie",
    "the food",
    "the service",
    "the product",
    "the experience"
]

POSITIVE_VERBS = ["like", "love", "enjoy"]
NEGATIVE_VERBS = ["hate", "dislike", "regret"]

ADVERBS = {
    "": 1.0,
    "really": 1.2,
    "very": 1.3,
    "absolutely": 1.5,
    "hardly": -0.7,
    "barely": -0.6
}

Writing /kaggle/working/synthetic_nlp/generator/lexicon.py


In [4]:
%%writefile /kaggle/working/synthetic_nlp/generator/grammar.py
TEMPLATE = "{subject} {adverb} {verb} {object}"

Writing /kaggle/working/synthetic_nlp/generator/grammar.py


In [5]:
%%writefile /kaggle/working/synthetic_nlp/generator/labeler.py
def compute_sentiment(verb_sentiment, adverb_weight):
    return 1 if verb_sentiment * adverb_weight > 0 else 0

Writing /kaggle/working/synthetic_nlp/generator/labeler.py


In [6]:
%%writefile /kaggle/working/synthetic_nlp/generator/sentence_build.py
import random
from .lexicon import SUBJECTS, OBJECTS, POSITIVE_VERBS, NEGATIVE_VERBS, ADVERBS
from .grammar import TEMPLATE
from .labeler import compute_sentiment

def generate_sentence():
    subject = random.choice(SUBJECTS)
    obj = random.choice(OBJECTS)

    is_positive = random.choice([True, False])
    if is_positive:
        verb = random.choice(POSITIVE_VERBS)
        vs = 1
    else:
        verb = random.choice(NEGATIVE_VERBS)
        vs = -1

    adv = random.choice(list(ADVERBS.keys()))
    aw = ADVERBS[adv]

    sentence = TEMPLATE.format(
        subject=subject,
        adverb=adv,
        verb=verb,
        object=obj
    ).replace("  ", " ").strip()

    label = compute_sentiment(vs, aw)
    return sentence, label

Writing /kaggle/working/synthetic_nlp/generator/sentence_build.py


In [7]:
!ls /kaggle/working/synthetic_nlp/generator

grammar.py  __init__.py  labeler.py  lexicon.py  sentence_build.py


In [8]:
import sys
sys.path.append("/kaggle/working/synthetic_nlp")

from generator.sentence_build import generate_sentence
generate_sentence()

('They barely love the food', 0)

In [9]:
import os

save_path = "/kaggle/working/synthetic_nlp/data/synthetic/sentiment.csv"

with open(save_path, "w", encoding="utf-8") as f:
    f.write("text,label\n")
    for _ in range(2000):
        s, l = generate_sentence()
        f.write(f"\"{s}\",{l}\n")

print("Dataset generated at:", save_path)

Dataset generated at: /kaggle/working/synthetic_nlp/data/synthetic/sentiment.csv


In [10]:
import pandas as pd

df = pd.read_csv("/kaggle/working/synthetic_nlp/data/synthetic/sentiment.csv")
print("Total samples:", len(df))
df.head(10)

Total samples: 2000


Unnamed: 0,text,label
0,He barely love the product,0
1,We really regret the food,0
2,She really enjoy the product,1
3,They really regret the experience,0
4,I enjoy the product,1
5,We dislike the service,0
6,She really love the product,1
7,We barely enjoy the service,0
8,They absolutely enjoy the experience,1
9,She really hate the service,0


In [12]:
df["label"].value_counts(normalize=True)

label
0    0.5
1    0.5
Name: proportion, dtype: float64

In [13]:
negators = ["hardly", "barely"]

df_neg = df[df["text"].str.contains("|".join(negators))]
df_neg.sample(10)

Unnamed: 0,text,label
1426,I barely regret the food,1
65,They hardly dislike the experience,1
1399,We hardly dislike the service,1
917,We hardly enjoy the service,0
408,They hardly hate this movie,1
1762,I hardly love this movie,0
841,They hardly dislike this movie,1
1657,I barely dislike the product,1
43,We barely regret the product,1
404,I hardly dislike the food,1


In [14]:
df_neg["label"].value_counts()

label
0    328
1    328
Name: count, dtype: int64

In [15]:
df_non_neg = df[~df["text"].str.contains("|".join(negators))]
df_non_neg["label"].value_counts(normalize=True)

label
0    0.5
1    0.5
Name: proportion, dtype: float64

In [16]:
from collections import Counter

words = Counter()

for t in df["text"]:
    words.update(t.lower().split())

words.most_common(15)

[('the', 1594),
 ('experience', 427),
 ('i', 419),
 ('we', 417),
 ('product', 416),
 ('this', 406),
 ('movie', 406),
 ('she', 404),
 ('he', 394),
 ('service', 378),
 ('food', 373),
 ('dislike', 373),
 ('barely', 367),
 ('they', 366),
 ('really', 361)]

In [17]:
df["has_hardly"] = df["text"].str.contains("hardly")
df.groupby("has_hardly")["label"].mean()

has_hardly
False    0.504383
True     0.474048
Name: label, dtype: float64

In [18]:
df["length"] = df["text"].str.split().apply(len)
df["length"].describe()

count    2000.000000
mean        4.835500
std         0.370821
min         4.000000
25%         5.000000
50%         5.000000
75%         5.000000
max         5.000000
Name: length, dtype: float64

In [19]:
assert df.isnull().sum().sum() == 0
assert set(df["label"].unique()) == {0, 1}
assert len(df) >= 1000

print("All dataset sanity checks passed.")

All dataset sanity checks passed.


In [20]:
import pandas as pd

path = "/kaggle/working/synthetic_nlp/data/synthetic/sentiment.csv"
df = pd.read_csv(path)

X = df["text"]
y = df["label"]

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    lowercase=True
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_vec.shape, X_test_vec.shape

((1600, 113), (400, 113))

In [23]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

In [24]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [25]:
import numpy as np

feature_names = vectorizer.get_feature_names_out()
coefs = clf.coef_[0]

top_pos = np.argsort(coefs)[-10:]
top_neg = np.argsort(coefs)[:10]

print("Top positive n-grams:")
for i in top_pos:
    print(feature_names[i], coefs[i])

print("\nTop negative n-grams:")
for i in top_neg:
    print(feature_names[i], coefs[i])

Top positive n-grams:
really enjoy 1.670300183245798
really love 1.7190560203581138
very enjoy 1.7208862275204664
very love 1.7620074840963307
hardly regret 3.74033064516526
hardly hate 3.752885090616459
hardly dislike 3.938911082483333
barely regret 3.9409103896777666
barely dislike 4.079264596583073
barely hate 4.22908315982442

Top negative n-grams:
barely enjoy -4.209943463033454
barely love -4.157470873547808
hardly enjoy -4.022385611692789
barely like -3.8765682003350475
hardly love -3.7962629585367793
hardly like -3.6653398152548355
very dislike -1.7580522570637018
really dislike -1.7548795085146016
really regret -1.654520013057732
absolutely dislike -1.5804487100920044


In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
from collections import Counter

In [28]:
df = pd.read_csv("/kaggle/working/synthetic_nlp/data/synthetic/sentiment.csv")

texts = df["text"].tolist()
labels = df["label"].tolist()

In [29]:
def tokenize(text):
    return text.lower().split()

tokenized = [tokenize(t) for t in texts]

word_counts = Counter(w for sent in tokenized for w in sent)

vocab = {"<PAD>": 0, "<UNK>": 1}
for w in word_counts:
    vocab[w] = len(vocab)

inv_vocab = {i: w for w, i in vocab.items()}

In [30]:
def encode(sent, vocab):
    return [vocab.get(w, vocab["<UNK>"]) for w in sent]

encoded = [encode(s, vocab) for s in tokenized]

MAX_LEN = max(len(s) for s in encoded)

def pad(seq, max_len):
    return seq + [0] * (max_len - len(seq))

X = torch.tensor([pad(s, MAX_LEN) for s in encoded])
y = torch.tensor(labels)

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [32]:
class SentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

train_ds = SentimentDataset(X_train, y_train)
test_ds = SentimentDataset(X_test, y_test)

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=32)

In [33]:
class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, emb_dim=64, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        emb = self.embedding(x)
        _, (h, _) = self.lstm(emb)
        out = self.fc(h[-1])
        return out.squeeze()

In [34]:
model = LSTMSentiment(len(vocab))
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for xb, yb in train_dl:
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_dl):.4f}")

Epoch 1, Loss: 0.6740
Epoch 2, Loss: 0.3461
Epoch 3, Loss: 0.0112
Epoch 4, Loss: 0.0025
Epoch 5, Loss: 0.0015


In [35]:
model.eval()
preds, trues = [], []

with torch.no_grad():
    for xb, yb in test_dl:
        logits = model(xb)
        probs = torch.sigmoid(logits)
        preds.extend((probs > 0.5).int().tolist())
        trues.extend(yb.tolist())

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(trues, preds))
print(classification_report(trues, preds))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

