<a href="https://colab.research.google.com/github/ShaliniAnandaPhD/Real-Time-News-Aggregator-and-recommender/blob/main/Real_time_news_aggregator_and_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data collection

In [1]:
import requests
import pandas as pd

def fetch_news(api_key, query='technology', page_size=30, page=1):
    base_url = 'https://newsapi.org/v2/everything'
    params = {
        'q': query,
        'apiKey': api_key,
        'pageSize': page_size,
        'page': page
    }
    response = requests.get(base_url, params=params)
    return response.json()

# Use your API key here
api_key = '59205caa54af4966a522adf166e8e8a0'
news_data = fetch_news(api_key)

# Convert to DataFrame
df = pd.DataFrame(news_data['articles'])
print(df.head())


                                   source              author  \
0  {'id': 'engadget', 'name': 'Engadget'}         Malak Saleh   
1        {'id': 'wired', 'name': 'Wired'}  Angela Watercutter   
2        {'id': 'wired', 'name': 'Wired'}     Tristan Kennedy   
3        {'id': 'wired', 'name': 'Wired'}         Alex Miller   
4        {'id': 'wired', 'name': 'Wired'}  Caitlin Harrington   

                                               title  \
0  Researchers developed a gene-editing technolog...   
1  "Now and Then," The Beatles’ Last Song, Is Her...   
2                          Skiing Is Getting Riskier   
3         Tech and Games Can Help Curb Youth Suicide   
4  This AI Bot Fills Out Job Applications for You...   

                                         description  \
0  In a trial run by Verve Therapeutics\r\n, a Ca...   
1  The track was made possible thanks to technolo...   
2  The threat of avalanches is rising with global...   
3  In the face of lackluster mental health suppo

Data preprocessing

In [2]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):
    # Remove HTML tags, non-alphanumeric characters, and convert to lowercase
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()

    # Remove stopwords and perform lemmatization
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning to the DataFrame
df['content_cleaned'] = df['content'].apply(clean_text)
print(df[['content', 'content_cleaned']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                             content  \
0  In a trial run by Verve Therapeutics\r\n, a Ca...   
1  Following a lot of hypeand a quarter-century o...   
2  As Olivier Gardet piloted the drone around the...   
3  One of my most traumatizing moments was when m...   
4  In July, software engineer Julian Joseph becam...   

                                     content_cleaned  
0  trial run verve therapeutic cambridgebased bio...  
1  following lot hypeand quartercentury worknow p...  
2  olivier gardet piloted drone around mountain c...  
3  one traumatizing moment best friend terry got ...  
4  july software engineer julian joseph became la...  


News Categorization

In [3]:
from nltk.corpus import movie_reviews
from random import shuffle
from nltk import FreqDist, NaiveBayesClassifier
from nltk.classify import accuracy

nltk.download('movie_reviews')

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
shuffle(documents)

all_words = FreqDist(word.lower() for word in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in document_words)
    return features

featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = NaiveBayesClassifier.train(train_set)

# Example prediction
print(classifier.classify(document_features(df['content_cleaned'][0].split())))


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


neg


User Interaction and Data Collection - here is flask and streamlit!

In [5]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/read_article', methods=['POST'])
def read_article():
    user_id = request.json['user_id']
    article_id = request.json['article_id']
    # Here, you'd record the user's reading history
    return jsonify({'status': 'success'})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


We can also try streamlit

In [6]:
!pip install streamlit


Collecting streamlit
  Downloading streamlit-1.28.2-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
Collecting validators<1,>=0.2 (from streamlit)
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m12.0 MB/s[0m eta [36m0:0

In [7]:
%%writefile streamlit_app.py
import streamlit as st
import pandas as pd

# Sample DataFrame with news articles
# In a real application, this would be replaced with your actual data
articles = pd.DataFrame({
    'article_id': range(1, 6),
    'title': ['Article 1', 'Article 2', 'Article 3', 'Article 4', 'Article 5'],
    'content': ['Content 1', 'Content 2', 'Content 3', 'Content 4', 'Content 5']
})

# Simulate a user_id (in a real app, this would come from your user management system)
user_id = 1

# Streamlit interface
st.title("News Recommender System")

# Display articles
for index, row in articles.iterrows():
    st.subheader(row['title'])
    st.write(row['content'])

    # Button to record reading an article
    if st.button(f'Read Article {row["article_id"]}'):
        # Here, record the user's reading history
        # For example, append to a file or a database
        # This is a placeholder for the logic you'd implement
        st.write(f"You read article {row['article_id']}")



Writing streamlit_app.py


Recommendation System

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure that the DataFrame has the required columns
if 'content_cleaned' in df.columns and 'title' in df.columns:
    # Check and remove any rows with missing or empty values in 'content_cleaned' or 'title'
    df = df.dropna(subset=['content_cleaned', 'title'])
    df = df[df['content_cleaned'].str.strip() != '']

    # Creating TF-IDF matrix
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(df['content_cleaned'])

    # Compute cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    def recommend_articles(title, cosine_sim=cosine_sim):
        if title in df['title'].values:
            idx = df[df['title'] == title].index[0]
            sim_scores = list(enumerate(cosine_sim[idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[1:11]  # Get top 10 recommendations
            article_indices = [i[0] for i in sim_scores]
            return df['title'].iloc[article_indices]
        else:
            return f"Title '{title}' not found in the DataFrame."

    # Display some titles from the DataFrame
    print("Some titles from the DataFrame:")
    print(df['title'].head())

    # Example recommendation
    try:
        # Replace with an actual title from your DataFrame
        sample_title = df['title'].iloc[0]  # using the first title in the DataFrame
        print(f"Recommendations for '{sample_title}':")
        print(recommend_articles(sample_title))
    except Exception as e:
        print(f"An error occurred: {e}")
else:
    print("DataFrame does not have the required columns 'content_cleaned' and 'title'.")




Some titles from the DataFrame:
0    Researchers developed a gene-editing technolog...
1    "Now and Then," The Beatles’ Last Song, Is Her...
2                            Skiing Is Getting Riskier
3           Tech and Games Can Help Curb Youth Suicide
4    This AI Bot Fills Out Job Applications for You...
Name: title, dtype: object
Recommendations for 'Researchers developed a gene-editing technology that reduces 'bad' cholesterol':
11    Researchers printed a robotic hand with bones,...
8              Is AI the answer to sustainable farming?
10    Swedish court clears dual national Skvortsov o...
16    OpenAI CEO Sam Altman ousted as 'board no long...
22    Is technology our savior — or our slayer? | Ru...
3            Tech and Games Can Help Curb Youth Suicide
21    These Solid-State Drivers Bring the Future of ...
9                 The Case for Buying ‘Dumb’ Appliances
1     "Now and Then," The Beatles’ Last Song, Is Her...
25    GM’s Cruise Halts Self-Driving Operations Acro...
Name