In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import joblib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

In [44]:
!pip install kagglehub scikit-learn pandas joblib fastapi uvicorn



In [45]:
import kagglehub

# Download dataset
path = kagglehub.dataset_download(
    "cosmos98/twitter-and-reddit-sentimental-analysis-dataset"
)

print("Dataset downloaded at:", path)


Using Colab cache for faster access to the 'twitter-and-reddit-sentimental-analysis-dataset' dataset.
Dataset downloaded at: /kaggle/input/twitter-and-reddit-sentimental-analysis-dataset


In [46]:
import os

for root, dirs, files in os.walk(path):
    for file in files:
        print(file)


Twitter_Data.csv
Reddit_Data.csv


In [47]:
twitter = pd.read_csv(f"{path}/Twitter_Data.csv")
reddit = pd.read_csv(f"{path}/Reddit_Data.csv")

df = pd.concat([twitter, reddit], ignore_index=True)

df.head()

Unnamed: 0,clean_text,category,clean_comment
0,when modi promised “minimum government maximum...,-1.0,
1,talk all the nonsense and continue all the dra...,0.0,
2,what did just say vote for modi welcome bjp t...,1.0,
3,asking his supporters prefix chowkidar their n...,1.0,
4,answer who among these the most powerful world...,1.0,


In [48]:
twitter = twitter.rename(columns={
    "clean_text": "text",
    "category": "sentiment"
})

reddit = reddit.rename(columns={
    "clean_text": "text",
    "category": "sentiment"
})


In [49]:
df = pd.concat([twitter, reddit], ignore_index=True)

df.head()


Unnamed: 0,text,sentiment,clean_comment
0,when modi promised “minimum government maximum...,-1.0,
1,talk all the nonsense and continue all the dra...,0.0,
2,what did just say vote for modi welcome bjp t...,1.0,
3,asking his supporters prefix chowkidar their n...,1.0,
4,answer who among these the most powerful world...,1.0,


In [50]:
# Keep only positive & negative
df = df[df["sentiment"] != 0]

# Map to binary
df["sentiment"] = df["sentiment"].map({
    1: 1,     # positive
    -1: 0     # negative
})

df["sentiment"].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1.0,88080
0.0,43787


In [51]:
df["sentiment"].value_counts(dropna=False)

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1.0,88080
0.0,43787
,7


In [52]:
df = df.dropna(subset=["sentiment"])

In [53]:
df["sentiment"] = df["sentiment"].astype(int)

In [54]:
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,88080
0,43787


In [55]:
X = df["text"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("successful")

successful


In [56]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        max_features=5000,
        ngram_range=(1, 2)
    )),
    ("clf", LinearSVC())
])

In [57]:
df["text"].isna().sum()

np.int64(24109)

In [58]:
df = df.dropna(subset=["text"])

In [59]:
df["text"] = df["text"].astype(str)

In [60]:
df["text"].isna().sum(), df["sentiment"].isna().sum()

(np.int64(0), np.int64(0))

In [61]:

X = df["text"]
y = df["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [62]:
pipeline.fit(X_train, y_train)
print("✅ Model training completed successfully")

✅ Model training completed successfully


In [63]:
from sklearn.metrics import f1_score

y_pred = pipeline.predict(X_test)
candidate_f1 = f1_score(y_test, y_pred, average="weighted")

print("Candidate model F1-score:", candidate_f1)


Candidate model F1-score: 0.8738657232039153


In [64]:
import json

production_metrics = {
    "f1_score": 0.80,   # baseline threshold
    "model_version": "sentiment_prod_v1"
}

with open("production_metrics.json", "w") as f:
    json.dump(production_metrics, f)

print("✅ Production model registry created")


✅ Production model registry created


In [65]:
import shutil

with open("production_metrics.json") as f:
    prod_f1 = json.load(f)["f1_score"]

print("Candidate F1-score :", candidate_f1)
print("Production F1-score:", prod_f1)



Candidate F1-score : 0.8738657232039153
Production F1-score: 0.8


Deep Learning


In [66]:
!pip install tensorflow keras




In [67]:
MAX_WORDS = 20000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")

In [68]:
model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=MAX_LEN),
    GRU(64, return_sequences=False),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()



In [69]:
history = model.fit(
    X_train_pad, y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=64
)


Epoch 1/5
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 21ms/step - accuracy: 0.6691 - loss: 0.6373 - val_accuracy: 0.6734 - val_loss: 0.6321
Epoch 2/5
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 22ms/step - accuracy: 0.6692 - loss: 0.6355 - val_accuracy: 0.6734 - val_loss: 0.6320
Epoch 3/5
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 17ms/step - accuracy: 0.6692 - loss: 0.6356 - val_accuracy: 0.6734 - val_loss: 0.6319
Epoch 4/5
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - accuracy: 0.6692 - loss: 0.6354 - val_accuracy: 0.6734 - val_loss: 0.6319
Epoch 5/5
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 10ms/step - accuracy: 0.6692 - loss: 0.6355 - val_accuracy: 0.6734 - val_loss: 0.6318


In [70]:
import numpy as np
from sklearn.metrics import f1_score

y_probs = model.predict(X_test_pad)
y_pred = (y_probs > 0.5).astype(int)

candidate_f1 = f1_score(y_test, y_pred, average="weighted")
candidate_f1


[1m674/674[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step


0.5382096544417828

transfer learning...

In [71]:
!pip install transformers datasets torch accelerate



In [72]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df[["text", "sentiment"]],
    test_size=0.2,
    stratify=df["sentiment"],
    random_state=42
)



In [73]:
from datasets import Dataset

dataset = {
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df)
}

In [74]:
from datasets import DatasetDict

dataset = DatasetDict({
    "train": dataset["train"],
    "test": dataset["test"]
})


In [75]:
# dataset = dataset.rename_column("sentiment", "labels")

In [76]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

dataset = dataset.map(tokenize, batched=True)
dataset = dataset.rename_column("sentiment", "labels")
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

Map:   0%|          | 0/86206 [00:00<?, ? examples/s]

Map:   0%|          | 0/21552 [00:00<?, ? examples/s]

In [77]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "f1": f1_score(labels, preds, average="weighted")
    }


In [79]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)


In [80]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [81]:
# Dataset reduction
dataset["train"] = dataset["train"].shuffle(seed=42).select(range(10000))
dataset["test"]  = dataset["test"].shuffle(seed=42).select(range(2000))


In [82]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)


In [83]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.150112,0.950962


TrainOutput(global_step=313, training_loss=0.1477354007026258, metrics={'train_runtime': 114.9862, 'train_samples_per_second': 86.967, 'train_steps_per_second': 2.722, 'total_flos': 331168496640000.0, 'train_loss': 0.1477354007026258, 'epoch': 1.0})

In [85]:
!ls

production_metrics.json  results  sample_data
