In [1]:
# importing libraries and making new directory
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib, os

os.makedirs("data", exist_ok=True)



In [2]:
# Generating Large Dataset (Positive, Negative, Neutral)
positive_texts = ["Great article!", "Very informative", "Awesome work!", "Loved it!", "Excellent blog"] * 30
negative_texts = ["I didn't like this", "This is bad", "Terrible content", "Worst article ever", "Poor explanation"] * 30
neutral_texts  = ["It is okay", "Average post", "Neutral thoughts", "Fine but not great", "Not bad, not good"] * 30

texts = positive_texts + negative_texts + neutral_texts
labels = [1]*len(positive_texts) + [-1]*len(negative_texts) + [0]*len(neutral_texts)

data = pd.DataFrame({
    "comment_id": range(1, len(texts)+1),
    "post_id": random.choices([1,2,3,4,5], k=len(texts)),
    "user_id": random.choices(range(101,401), k=len(texts)),
    "text": texts,
    "timestamp": pd.date_range("2025-08-01", periods=len(texts), freq="H").astype(str)
})
comments = data.copy()
comments.to_csv("data/comments.csv", index=False)



  "timestamp": pd.date_range("2025-08-01", periods=len(texts), freq="H").astype(str)


In [3]:
# Engagement Data
engagement = pd.DataFrame({
    "post_id": [1,2,3,4,5],
    "likes": [300,120,500,50,220],
    "dislikes": [20,50,10,80,30],
    "avg_read_time_seconds": [240,180,400,150,300],
    "category": ["case-study","opinion","tutorial","opinion","tutorial"]
})
engagement.to_csv("data/engagement.csv", index=False)



In [4]:
# Training the Model
X_train, X_test, y_train, y_test = train_test_split(comments["text"], labels, test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression(class_weight="balanced", max_iter=1000)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

joblib.dump((vectorizer, model), "sentiment_model.pkl")



✅ Accuracy: 1.0
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        23
           0       1.00      1.00      1.00        32
           1       1.00      1.00      1.00        35

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90



['sentiment_model.pkl']

In [5]:
# Sample Example of Predictions
examples = ["Great job!", "This is bad", "It's okay"]
preds = model.predict(vectorizer.transform(examples))
print("\n🔹 Example Predictions:")
for t, p in zip(examples, preds):
    print(f"{t} → {p}")




🔹 Example Predictions:
Great job! → 1
This is bad → -1
It's okay → 0


In [6]:
# Prediction for All Comments
comments["sentiment"] = model.predict(vectorizer.transform(comments["text"]))
comments["sentiment_score"] = comments["sentiment"]



In [7]:
# Merging & Computing Relevance Score
merged = comments.merge(engagement, on="post_id", how="left")
for col in ["likes","dislikes","avg_read_time_seconds"]:
    merged[f"{col}_norm"] = merged[col] / merged[col].max()

category_weights = {"case-study":1.2, "opinion":1.0, "tutorial":1.5}
merged["category_weight"] = merged["category"].map(category_weights)

merged["relevance_score"] = (
    merged["sentiment_score"]*0.4 +
    merged["likes_norm"]*0.3 -
    merged["dislikes_norm"]*0.1 +
    merged["avg_read_time_seconds_norm"]*0.3
) * merged["category_weight"]

merged.to_csv("scored_posts.csv", index=False)
print("\n✅ Scored posts saved to scored_posts.csv")
merged.head()



✅ Scored posts saved to scored_posts.csv


Unnamed: 0,comment_id,post_id,user_id,text,timestamp,sentiment,sentiment_score,likes,dislikes,avg_read_time_seconds,category,likes_norm,dislikes_norm,avg_read_time_seconds_norm,category_weight,relevance_score
0,1,5,342,Great article!,2025-08-01 00:00:00,1,1,220,30,300,tutorial,0.44,0.375,0.75,1.5,1.07925
1,2,3,339,Very informative,2025-08-01 01:00:00,1,1,500,10,400,tutorial,1.0,0.125,1.0,1.5,1.48125
2,3,4,335,Awesome work!,2025-08-01 02:00:00,1,1,50,80,150,opinion,0.1,1.0,0.375,1.0,0.4425
3,4,5,366,Loved it!,2025-08-01 03:00:00,1,1,220,30,300,tutorial,0.44,0.375,0.75,1.5,1.07925
4,5,1,311,Excellent blog,2025-08-01 04:00:00,1,1,300,20,240,case-study,0.6,0.25,0.6,1.2,0.882
