In [1]:
from pymongo import MongoClient
import json
import urllib.parse

# Load credentials from JSON file
with open('credentials_mongodb.json') as f:
    login = json.load(f)

# Assign credentials
username = login['username']
password = urllib.parse.quote(login['password'])
host = login['host']

# Construct MongoDB connection string and connect
url = f"mongodb+srv://{username}:{password}@{host}/?retryWrites=true&w=majority"
client = MongoClient(url)
db = client['news_database']
collection = db['news_collection']


In [4]:
# Check if data can be fetched
sample_data = list(collection.find().limit(5))  # Retrieve a few sample documents
for document in sample_data:
    print(document)


{'_id': ObjectId('67213634790a264730074184'), 'link': 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9', 'headline': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters', 'category': 'U.S. NEWS', 'short_description': 'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.', 'authors': 'Carla K. Johnson, AP', 'date': '2022-09-23'}
{'_id': ObjectId('67213634790a264730074185'), 'link': 'https://www.huffpost.com/entry/american-airlines-passenger-banned-flight-attendant-punch-justice-department_n_632e25d3e4b0e247890329fe', 'headline': 'American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video', 'category': 'U.S. NEWS', 'short_description': "He was subdued by passengers and crew when he fled to the back of the aircraft after the confrontation, according to the U.S. attorney's office in Los Angeles.", '

In [5]:
pipeline = [
    {"$match": {"category": "Technology"}}  # Adjust category if needed
]

results = list(collection.aggregate(pipeline))
print(f"Documents found: {len(results)}")
for result in results[:5]:  # Display a few documents
    print(result)


Documents found: 0


In [6]:
pipeline = [
    {"$match": {"category": "Technology"}},
    {"$project": {"headline": 1, "short_description": 1, "date": 1, "category": 1}}
]

results = list(collection.aggregate(pipeline))
print(f"Documents after projection: {len(results)}")
for result in results[:5]:
    print(result)


Documents after projection: 0


In [7]:
# MongoDB Aggregation Pipeline
pipeline = [
    {
        "$group": {
            "_id": "$category",  # Group by the 'category' field
            "count": {"$sum": 1}  # Count the number of documents in each category
        }
    },
    {
        "$sort": {
            "count": -1  # Sort by count in descending order
        }
    }
]

# Execute the pipeline
results = list(collection.aggregate(pipeline))

# Print the results
print("Category Counts:")
for result in results:
    print(f"Category: {result['_id']}, Count: {result['count']}")


Category Counts:
Category: POLITICS, Count: 35602
Category: WELLNESS, Count: 17945
Category: ENTERTAINMENT, Count: 17362
Category: TRAVEL, Count: 9900
Category: STYLE & BEAUTY, Count: 9814
Category: PARENTING, Count: 8791
Category: HEALTHY LIVING, Count: 6694
Category: QUEER VOICES, Count: 6347
Category: FOOD & DRINK, Count: 6340
Category: BUSINESS, Count: 5992
Category: COMEDY, Count: 5400
Category: SPORTS, Count: 5077
Category: BLACK VOICES, Count: 4583
Category: HOME & LIVING, Count: 4320
Category: PARENTS, Count: 3955
Category: THE WORLDPOST, Count: 3664
Category: WEDDINGS, Count: 3653
Category: WOMEN, Count: 3572
Category: CRIME, Count: 3562
Category: IMPACT, Count: 3484
Category: DIVORCE, Count: 3426
Category: WORLD NEWS, Count: 3299
Category: MEDIA, Count: 2944
Category: WEIRD NEWS, Count: 2777
Category: GREEN, Count: 2622
Category: WORLDPOST, Count: 2579
Category: RELIGION, Count: 2577
Category: STYLE, Count: 2254
Category: SCIENCE, Count: 2206
Category: TECH, Count: 2104
Categ

In [14]:
from pymongo import UpdateOne

# Create a list to hold update operations
updates = []

# Iterate over all documents and prepare updates
for doc in collection.find():
    date_string = doc['date']
   


In [19]:
from datetime import datetime, timedelta

# Define the date range for filtering (last 5 years)
five_years_ago = datetime.now() - timedelta(days=5*365)

# Create the aggregation pipeline
pipeline_complex = [
    {
        '$match': {
            'date': {'$gte': five_years_ago}  # Filter for articles from the last 5 years
        }
    },
    {
        '$group': {
            '_id': {
                'year': {'$year': '$date'},
                'category': '$category'  # Group by both year and category
            },
            'count': {'$sum': 1}  # Count articles in each category per year
        }
    },
    {
        '$sort': {'_id.year': 1, 'count': -1}  # Sort by year and then by count descending
    },
    {
        '$project': {
            'year': '$_id.year',
            'category': '$_id.category',
            'count': 1,  # Include the count in the output
            '_id': 0  # Exclude the default _id field
        }
    }
]

# Execute the pipeline
complex_results = list(collection.aggregate(pipeline_complex))

# Print the results
print("Articles Count by Category and Year (Last 5 Years):")
for entry in complex_results:
    print(f"Year: {entry['year']}, Category: {entry['category']}, Count: {entry['count']}")


Articles Count by Category and Year (Last 5 Years):
Year: 2019, Category: POLITICS, Count: 159
Year: 2019, Category: ENTERTAINMENT, Count: 59
Year: 2019, Category: COMEDY, Count: 22
Year: 2019, Category: STYLE & BEAUTY, Count: 14
Year: 2019, Category: WORLD NEWS, Count: 14
Year: 2019, Category: MEDIA, Count: 11
Year: 2019, Category: WOMEN, Count: 11
Year: 2019, Category: HOME & LIVING, Count: 11
Year: 2019, Category: SPORTS, Count: 10
Year: 2019, Category: U.S. NEWS, Count: 8
Year: 2019, Category: CRIME, Count: 8
Year: 2019, Category: WEIRD NEWS, Count: 5
Year: 2019, Category: QUEER VOICES, Count: 4
Year: 2019, Category: RELIGION, Count: 4
Year: 2019, Category: ENVIRONMENT, Count: 4
Year: 2019, Category: IMPACT, Count: 3
Year: 2019, Category: FOOD & DRINK, Count: 2
Year: 2019, Category: BLACK VOICES, Count: 2
Year: 2019, Category: PARENTING, Count: 2
Year: 2019, Category: WELLNESS, Count: 1
Year: 2019, Category: MONEY, Count: 1
Year: 2019, Category: BUSINESS, Count: 1
Year: 2020, Categ

In [9]:
pipeline_avg_headline_length = [
    {
        "$project": {
            "category": "$category",
            "headline_length": {"$strLenCP": "$headline"}  # Calculate the length of the headline
        }
    },
    {
        "$group": {
            "_id": "$category",
            "average_length": {"$avg": "$headline_length"}  # Calculate average length
        }
    },
    {
        "$sort": {
            "average_length": -1  # Sort by average length in descending order
        }
    }
]

# Execute the pipeline
avg_headline_lengths = list(collection.aggregate(pipeline_avg_headline_length))

# Print the results
print("Average Headline Length by Category:")
for result in avg_headline_lengths:
    print(f"Category: {result['_id']}, Average Headline Length: {result['average_length']:.2f}")


Average Headline Length by Category:
Category: U.S. NEWS, Average Headline Length: 68.38
Category: WORLD NEWS, Average Headline Length: 65.66
Category: ARTS & CULTURE, Average Headline Length: 65.21
Category: LATINO VOICES, Average Headline Length: 65.01
Category: ENTERTAINMENT, Average Headline Length: 64.62
Category: STYLE & BEAUTY, Average Headline Length: 64.59
Category: POLITICS, Average Headline Length: 64.25
Category: BLACK VOICES, Average Headline Length: 64.14
Category: GOOD NEWS, Average Headline Length: 64.00
Category: CRIME, Average Headline Length: 63.75
Category: THE WORLDPOST, Average Headline Length: 63.59
Category: SPORTS, Average Headline Length: 62.85
Category: MEDIA, Average Headline Length: 62.55
Category: QUEER VOICES, Average Headline Length: 62.31
Category: HOME & LIVING, Average Headline Length: 61.61
Category: SCIENCE, Average Headline Length: 60.96
Category: COMEDY, Average Headline Length: 60.84
Category: CULTURE & ARTS, Average Headline Length: 60.14
Catego

In [10]:
pipeline_author_count = [
    {
        "$group": {
            "_id": "$authors",
            "count": {"$sum": 1}
        }
    },
    {
        "$sort": {
            "count": -1  # Sort by count in descending order
        }
    }
]

# Execute the pipeline
author_counts = list(collection.aggregate(pipeline_author_count))

# Print the results
print("Articles Count by Author:")
for result in author_counts:
    print(f"Author: {result['_id']}, Count: {result['count']}")


Articles Count by Author:
Author: , Count: 37418
Author: Lee Moran, Count: 2954
Author: Ron Dicker, Count: 2219
Author: Ed Mazza, Count: 1590
Author: Mary Papenfuss, Count: 1566
Author: Reuters, Reuters, Count: 1563
Author: Cole Delbyck, Count: 1266
Author: Carly Ledbetter, Count: 1170
Author: Curtis M. Wong, Count: 1146
Author: Andy McDonald, Count: 1100
Author: David Moye, Count: 1067
Author: Julia Brucculieri, Count: 1063
Author: Nina Golgowski, Count: 1052
Author: Bill Bradley, Count: 997
Author: Dana Oliver, Count: 936
Author: Dominique Mosbergen, Count: 935
Author: Igor Bobic, Count: 907
Author: Sam Levine, Count: 905
Author: Caroline Bologna, Count: 889
Author: Michelle Manetti, Count: 876
Author: Michelle Persad, Count: 875
Author: Ellie Krupnick, Count: 861
Author: Jenna Amatulli, Count: 813
Author: Jamie Feldman, Count: 806
Author: Ryan Grenoble, Count: 770
Author: James Michael Nichols, Count: 764
Author: Rebecca Adams, Count: 753
Author: Sara Boboltz, Count: 729
Author: Mat

In [11]:
start_date = "2020-01-01"  # Define start date
end_date = "2020-12-31"    # Define end date

pipeline_date_filter = [
    {
        "$match": {
            "date": {
                "$gte": start_date,
                "$lte": end_date
            }
        }
    }
]

# Execute the pipeline
filtered_articles = list(collection.aggregate(pipeline_date_filter))

# Print the number of articles found
print(f"Articles from {start_date} to {end_date}: {len(filtered_articles)}")


Articles from 2020-01-01 to 2020-12-31: 2054


In [12]:
pipeline_common_words = [
    {
        "$project": {
            "words": {"$split": ["$headline", " "]},  # Split headline into words
        }
    },
    {
        "$unwind": "$words"  # Unwind the array of words into individual documents
    },
    {
        "$group": {
            "_id": "$words",
            "count": {"$sum": 1}  # Count occurrences of each word
        }
    },
    {
        "$sort": {
            "count": -1  # Sort by count in descending order
        }
    },
    {
        "$limit": 10  # Limit to top 10 most common words
    }
]

# Execute the pipeline
common_words = list(collection.aggregate(pipeline_common_words))

# Print the results
print("Most Common Words in Headlines:")
for result in common_words:
    print(f"Word: '{result['_id']}', Count: {result['count']}")


Most Common Words in Headlines:
Word: 'The', Count: 49249
Word: 'To', Count: 41132
Word: 'In', Count: 25941
Word: 'A', Count: 25704
Word: 'Of', Count: 24393
Word: 'For', Count: 20042
Word: 'Is', Count: 17365
Word: 'And', Count: 15827
Word: 'On', Count: 14663
Word: 'With', Count: 13222
