In [1]:
# --- 🔧 Imports
import boto3
import pandas as pd
import json
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# --- 🔗 Load from S3
s3 = boto3.client("s3")
bucket = "bluesky-raw-posts-parvathi"
key = "raw/bluesky_posts.json"

try:
    content = s3.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8")
    records = json.loads(content)
    all_posts = [item["post"] for item in records if isinstance(item, dict) and "post" in item]
    post_df = pd.DataFrame(all_posts)
    print(f"✅ Ingested {len(post_df)} posts from {key}")
except Exception as e:
    print(f"❌ Failed to ingest posts: {e}")
    raise

✅ Ingested 431 posts from raw/bluesky_posts.json


In [3]:
# --- 🧼 Preprocess
post_df["text"] = post_df["record"].apply(lambda r: r.get("text") if isinstance(r, dict) else None)
post_df = post_df.dropna(subset=["text"])

# --- 🧠 VADER Labeling
analyzer = SentimentIntensityAnalyzer()

def vader_label(text):
    score = analyzer.polarity_scores(text)["compound"]
    if score >= 0.2:
        return "positive"
    elif score <= -0.2:
        return "negative"
    else:
        return "neutral"

post_df["label"] = post_df["text"].apply(vader_label)
print(post_df["label"].value_counts())


neutral     206
positive    158
negative     67
Name: label, dtype: int64


In [None]:
#---Training the model

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import os
import tarfile

X = post_df["text"]
y = post_df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english")),
    ("clf", LogisticRegression(max_iter=300))
])

pipeline.fit(X_train, y_train)

# Evaluation
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Save model to tar.gz for SageMaker
os.makedirs("bluesky_model", exist_ok=True)
joblib.dump(pipeline, "bluesky_model/model.joblib")

with tarfile.open("bluesky_model/bluesky_model.tar.gz", "w:gz") as tar:
    tar.add("bluesky_model/model.joblib", arcname="model.joblib")



              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        13
     neutral       0.52      0.79      0.63        42
    positive       0.46      0.34      0.39        32

    accuracy                           0.51        87
   macro avg       0.33      0.38      0.34        87
weighted avg       0.42      0.51      0.45        87



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# --- Upload to S3
s3.upload_file(
    Filename="bluesky_model/bluesky_model.tar.gz",
    Bucket="bluesky-raw-posts-parvathi",
    Key="model/bluesky_model.tar.gz"
)

# --- Deploy Endpoint
import sagemaker
from sagemaker.sklearn.model import SKLearnModel

role = sagemaker.get_execution_role()

sklearn_model = SKLearnModel(
    model_data="s3://bluesky-raw-posts-parvathi/model/bluesky_model.tar.gz",
    role=role,
    entry_point="bluesky_handler.py",
    framework_version="1.2-1",
    py_version="py3"
)

predictor = sklearn_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name="bluesky-sentiment-endpoint"

)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
-----

In [None]:
# --Locally testing inside jupyter notebook
import boto3
import json

# --- Configuration
endpoint_name = "bluesky-sentiment-endpoint"  
region = "ap-south-1"

# --- Sample input 
sample_input = {
    "inputs": [
        "I absolutely love the interface!",
        "Not impressed with this update.",
        "Feels a bit clunky but it works."
    ]
}

# --- Create a runtime client
runtime = boto3.client("sagemaker-runtime", region_name=region)

# --- Make the request
response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps(sample_input)
)

# --- Read and display the result
result = response["Body"].read().decode("utf-8")
print("🔮 Model Prediction:", result)


🔮 Model Prediction: [{"label": "positive", "confidence": 0.4715}, {"label": "neutral", "confidence": 0.5336}, {"label": "neutral", "confidence": 0.4978}]
