In [1]:
from transformers import pipeline
from pymongo import MongoClient
import pandas as pd
import json
import urllib.parse

In [2]:
# Load credentials from JSON file
with open('credentials_mongodb.json') as f:
    login = json.load(f)

# Assign credentials to variables
username = login['username']
password = urllib.parse.quote(login['password'])  # Ensure the password is URL encoded
host = login['host']

# Construct the MongoDB connection string
url = f"mongodb+srv://{username}:{password}@{host}/?retryWrites=true&w=majority"

# MongoDB setup
client = MongoClient(url)  # Use the constructed connection string
db = client['news_database']  # Replace with your database name
collection = db['news_collection']  # Replace with your collection name

In [3]:
# Fetch all documents from the MongoDB collection
#articles = list(collection.find())
# Fetch a limited number of documents from MongoDB (e.g., 100 documents)
subset_size = 5  # Adjust this number based on the subset size you want
articles = list(collection.find().limit(subset_size))


In [4]:
# Convert to a pandas DataFrame
df = pd.DataFrame(articles)

In [5]:
# Step 2: Define possible writing styles for classification
writing_styles = ["expository", "narrative", "descriptive", "persuasive"]

In [7]:
import torch
from transformers import pipeline

# Check if GPU is available
device = 0 if torch.cuda.is_available() else -1

# Initialize the zero-shot classifier pipeline and use GPU if available
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)


In [8]:
# Step 3: Perform zero-shot classification on the headlines or short descriptions
df['detected_writing_style'] = df['headline'].apply(lambda x: classifier(x, candidate_labels=writing_styles)['labels'][0])

In [9]:
# Step 4: Display the dataset with detected writing styles
print(df[['headline', 'detected_writing_style']].head())

                                            headline detected_writing_style
0  Over 4 Million Americans Roll Up Sleeves For O...             persuasive
1  American Airlines Flyer Charged, Banned For Li...             persuasive
2  23 Of The Funniest Tweets About Cats And Dogs ...             persuasive
3  The Funniest Tweets From Parents This Week (Se...             persuasive
4  Woman Who Called Cops On Black Bird-Watcher Lo...            descriptive


In [11]:
# Step 5: (Optional) Save the auto-labeled dataset back to MongoDB
collection_with_styles = db['news_articles_with_styles']  # New collection to store the auto-labeled data
collection_with_styles.insert_many(df.to_dict('records'))

InsertManyResult([ObjectId('6706b7e97424b08e25a00d5a'), ObjectId('6706b7e97424b08e25a00d5b'), ObjectId('6706b7e97424b08e25a00d5c'), ObjectId('6706b7e97424b08e25a00d5d'), ObjectId('6706b7e97424b08e25a00d5e')], acknowledged=True)