In [10]:
import pandas as pd


df = pd.read_csv("../reduced_dataset.csv")

In [11]:
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags,processed_text,word_count,year,reading_time_min,title_length,text_length,polarity,subjectivity,title_sentiment,text_combined
0,Alcoholism isnâ€™t about Drinking too much: Al...,I feel sorry for people who think alcoholism i...,https://mattsalis.medium.com/alcoholism-isnt-a...,['Matt Salis'],2020-02-04 21:51:34.005,"['Recovery', 'Addiction', 'Health', 'Mental He...",feel sorry people think alcoholism drinking mu...,660,2020,3.3,69,8032,-0.07734,0.556261,-0.5106,Alcoholism isnâ€™t about Drinking too much: Al...
1,Is Anger Good or Bad?,Is Anger Good or Bad?\n\nTo be angry with the ...,https://medium.com/@rajeshragothaman/is-anger-...,['Rajesh Kumar R'],2020-08-20 18:21:58.251,"['Anger Control', 'Anger', 'Anger Management']",anger good bad angry right person right degree...,304,2020,1.52,21,3642,-0.050206,0.47244,-0.6486,Is Anger Good or Bad? anger good bad angry rig...
2,Fact check: Fake post claims Indian Railways h...,Fact check: Fake post claims Indian Railways h...,https://medium.com/@newsvibesindia/fact-check-...,['News Vibes'],2020-12-13 12:42:06.553,"['Fact Check', 'Hazrat Nizamuddin', 'Hazrat Ni...",fact check fake post claim indian railway rena...,74,2020,0.37,82,751,-0.179167,0.579167,-0.4767,Fact check: Fake post claims Indian Railways h...
3,A Murder,â€œIt was dark and violent. I was very angry :...,https://medium.com/fictionhub/a-murder-bb78504...,['Kim Ferrer'],2019-01-18 02:07:56.615,"['Dreams', 'Hate', 'Fiction', 'Short Story', '...",dark violent angry emotion responsible every g...,383,2019,1.92,8,4631,-0.061114,0.505187,-0.6908,A Murder dark violent angry emotion responsibl...
4,"If You Failed to Get a Job, You Are Not Alone","If You Failed to Get a Job, You Are Not Alone\...",https://medium.com/better-programming/if-you-f...,['Fatos Morina'],2020-12-16 22:04:54.705,"['Life Lessons', 'Programming', 'Startup', 'Li...",failed get job alone industry titan contextual...,39,2020,0.2,45,487,-0.167857,0.44881,-0.3736,"If You Failed to Get a Job, You Are Not Alone ..."


In [12]:
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

In [13]:
# Keep only required columns
df = df[['title', 'text_combined', 'tags']]

# Combine title + text
df['input_text'] = df['title'] + ' ' + df['text_combined']



In [16]:
from collections import Counter

# Flatten all tags into one list
all_tags = [tag for tags in df['tags'] for tag in tags]
tag_counts = Counter(all_tags)

# Keep only top N tags (e.g., 1000)
TOP_N_TAGS = 1000
top_tags = set([tag for tag, count in tag_counts.most_common(TOP_N_TAGS)])

# Filter tags in each row
df['tags'] = df['tags'].apply(lambda tags: [tag for tag in tags if tag in top_tags])

# Drop rows with no valid tags left
df = df[df['tags'].map(len) > 0]

# Now apply MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['tags'])

# Split as before
X_train, X_test, y_train, y_test = train_test_split(df['input_text'], y, test_size=0.2, random_state=42)


In [17]:
# STEP 3: TF-IDF Vectorization (Memory Efficient)
vectorizer = TfidfVectorizer(
    max_features=5000,      # Reduce if you still get memory error
    stop_words='english'
)

X_train_tfidf = vectorizer.fit_transform(X_train).astype('float32')
X_test_tfidf = vectorizer.transform(X_test).astype('float32')


In [18]:
# STEP 4: Train Logistic Regression (One-vs-Rest for multi-label)
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_tfidf, y_train)


In [19]:
# STEP 5: Evaluate Model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=mlb.classes_))


                              precision    recall  f1-score   support

                      '2020'       0.00      0.00      0.00        33
                     '2020']       0.00      0.00      0.00        10
                      '2021'       0.00      0.00      0.00        14
           'A Cornered Gurl'       0.00      0.00      0.00         8
                        'AI'       0.00      0.00      0.00        35
                       'AI']       0.00      0.00      0.00        15
                       'API'       0.00      0.00      0.00        12
                        'AR'       0.00      0.00      0.00         2
                       'AWS'       0.29      0.08      0.13        24
                     'Abuse'       0.00      0.00      0.00         8
                  'Activism'       0.00      0.00      0.00        10
                 'Addiction'       0.00      0.00      0.00        13
                    'Advent'       0.00      0.00      0.00         6
                 'A

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
# STEP 6: Save Model & Vectorizer
joblib.dump(model, 'tag_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(mlb, 'multilabel_binarizer.pkl')


['multilabel_binarizer.pkl']

In [26]:
def predict_tags(title, text_combined, threshold=0.2):
    # Load everything
    model = joblib.load('tag_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
    mlb = joblib.load('multilabel_binarizer.pkl')

    full_text = title + ' ' + text_combined
    tfidf_input = vectorizer.transform([full_text])
    
    # Predict probabilities
    probas = model.predict_proba(tfidf_input)[0]

    # Apply custom threshold
    predicted_indices = [i for i, p in enumerate(probas) if p >= threshold]

    if not predicted_indices:
        return ['No confident tags found 😕']

    return [mlb.classes_[i] for i in predicted_indices]


In [41]:
new_title = "Revolutionary Health Tracker Launched"
new_text = "The wearable device uses machine learning to monitor heart rate and sleep patterns in real-time. Artificial intelligence Health "


predicted_tags = predict_tags(new_title, new_text, threshold=0.05)
print("Predicted Tags:", predicted_tags)


Predicted Tags: [" 'Artificial Intelligence'", " 'Health'"]
