In [1]:
!pip list

Package                 Version
----------------------- -----------
asttokens               3.0.0
colorama                0.4.6
comm                    0.2.2
debugpy                 1.8.15
decorator               5.2.1
executing               2.2.0
ipykernel               6.29.5
ipython                 9.4.0
ipython_pygments_lexers 1.1.1
jedi                    0.19.2
jupyter_client          8.6.3
jupyter_core            5.8.1
matplotlib-inline       0.1.7
nest-asyncio            1.6.0
numpy                   2.3.1
packaging               25.0
pandas                  2.3.1
parso                   0.8.4
pip                     25.1.1
platformdirs            4.3.8
prompt_toolkit          3.0.51
psutil                  7.0.0
pure_eval               0.2.3
Pygments                2.19.2
python-dateutil         2.9.0.post0
pytz                    2025.2
pywin32                 311
pyzmq                   27.0.0
setuptools              65.5.0
six                     1.17.0
stack-data         

In [2]:
import numpy as np
import pandas as pd 

In [3]:
# Read the raw article export into the working frame `df`.
raw_articles_csv = 'rss_complete_articles.csv'
df = pd.read_csv(raw_articles_csv)

In [4]:
# Show the (rows, columns) tuple and the column labels of the frame as loaded.
initial_shape = df.shape
column_labels = df.columns
print("Initial shape:", initial_shape)
print(column_labels)

Initial shape: (30, 6)
Index(['Source', 'Title', 'Link', 'Published', 'Content', 'HTML_File'], dtype='object')


In [5]:
# Keep only rows where Title, Content and Published are all present.
required_columns = ['Title', 'Content', 'Published']
df = df.dropna(subset=required_columns)

In [6]:
# Discard rows whose Published text equals 'no date' in any letter case.
has_real_date = df['Published'].str.lower().ne('no date')
df = df[has_real_date]

In [7]:
# Convert Published to datetimes; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
published_parsed = pd.to_datetime(df['Published'], errors='coerce')
df['Published'] = published_parsed

In [8]:
# Remove the rows whose Published value is missing after the datetime conversion.
df = df.dropna(subset=['Published'])

In [9]:

# Trim leading/trailing whitespace from each text column, in the same order
# as before: Title, Content, Source.
for text_column in ('Title', 'Content', 'Source'):
    df[text_column] = df[text_column].str.strip()

In [10]:
# De-duplicate in two passes: keep the first row per Title, then the first
# row per Link among what remains.
df = (
    df
    .drop_duplicates(subset=['Title'])
    .drop_duplicates(subset=['Link'])
)

In [11]:
# Renumber the rows 0..n-1 and discard the old index (drop=True) so it does
# not become a column.
df = df.reset_index(drop=True)

In [12]:
# Show the cleaned frame's shape and the first rows of Title/Published.
title_date_preview = df.loc[:, ['Title', 'Published']].head()
print("After cleaning:", df.shape)
print(title_date_preview)

After cleaning: (9, 6)
                                               Title           Published
0  Harry and Meghan call for stronger social medi... 2025-04-24 16:28:07
1  UK edges towards post-Brexit youth visa deal w... 2025-04-24 18:34:01
2    Student killed in French school stabbing attack 2025-04-24 15:51:00
3  Mini heatwave expected as UK set for warmest t... 2025-04-24 13:00:56
4  Tanzania opposition officials arrested as Tund... 2025-04-24 14:07:23


In [13]:
# Write the cleaned frame to disk without the row index, then confirm.
cleaned_csv = 'cleaned_news_articles.csv'
df.to_csv(cleaned_csv, index=False)
print("✅ Cleaned data saved to cleaned_news_articles.csv")

✅ Cleaned data saved to cleaned_news_articles.csv


In [14]:
import random
from datetime import timedelta

In [22]:
# Read the cleaned file back into `df1` and display its first five rows.
cleaned_csv_path = 'cleaned_news_articles.csv'
df1 = pd.read_csv(cleaned_csv_path)
df1.head(n=5)

Unnamed: 0,Source,Title,Link,Published,Content,HTML_File
0,BBC News,Harry and Meghan call for stronger social medi...,https://www.bbc.com/news/articles/cjewne81lq4o,2025-04-24 16:28:07,The Duke and Duchess of Sussex are calling for...,article_html/Harry and Meghan call for stronge...
1,BBC News,UK edges towards post-Brexit youth visa deal w...,https://www.bbc.com/news/articles/c9qw58r0x0do,2025-04-24 18:34:01,The government is no longer ruling out a youth...,article_html/UK edges towards post-Brexit yout...
2,BBC News,Student killed in French school stabbing attack,https://www.bbc.com/news/articles/c787r15xngyo,2025-04-24 15:51:00,One student has been killed and at least three...,article_html/Student killed in French school s...
3,BBC News,Mini heatwave expected as UK set for warmest t...,https://www.bbc.com/weather/articles/c89g5wd3pzeo,2025-04-24 13:00:56,"If it feels a bit chilly now, just wait until ...",article_html/Mini heatwave expected as UK set ...
4,The Guardian,Tanzania opposition officials arrested as Tund...,https://www.theguardian.com/world/2025/apr/24/...,2025-04-24 14:07:23,Chadema spokesperson says party’s vice-chair J...,article_html/Tanzania opposition officials arr...


In [24]:
import pandas as pd
from datetime import timedelta
import random

# Load the cleaned dataset
df = pd.read_csv('cleaned_news_articles.csv')

n_variants = 10  # Number of synthetic versions per article
max_offset_minutes = 1440  # Each variant is shifted forward by 0..1440 minutes (up to one day)
variant_columns = ['Title', 'Content', 'Published', 'Source', 'Link']
rng = random.Random(42)  # Dedicated seeded generator so a re-run writes the same offsets
augmented_data = []

# Generate synthetic variants.
# Iterate the frame loaded in this cell rather than a variable left over from
# another cell, so the cell also works on a freshly restarted kernel.
for idx, row in df.iterrows():
    # Parse once per article instead of once per variant; an unparseable value
    # becomes NaT and stays NaT after the offset is added.
    base_published = pd.to_datetime(row['Published'], errors='coerce')
    for i in range(n_variants):
        new_row = {
            'Title': f"{row['Title']} [v{i+1}]",
            'Content': f"{row['Content']} (Simulated Variant {i+1})",
            'Published': base_published + timedelta(minutes=rng.randint(0, max_offset_minutes)),
            'Source': row['Source'],
            'Link': row['Link']
        }
        augmented_data.append(new_row)

# Save the synthetic dataset.
# Passing the column list keeps the header row in the CSV even when the input
# contained no articles.
augmented_df = pd.DataFrame(augmented_data, columns=variant_columns)
augmented_df.to_csv('augmented_articles.csv', index=False)

print("✅ Augmented dataset created with", len(augmented_df), "entries.")


✅ Augmented dataset created with 90 entries.


In [26]:
# Display the first five synthetic rows.
augmented_df.head(n=5)

Unnamed: 0,Title,Content,Published,Source,Link
0,Harry and Meghan call for stronger social medi...,The Duke and Duchess of Sussex are calling for...,2025-04-24 20:41:07,BBC News,https://www.bbc.com/news/articles/cjewne81lq4o
1,Harry and Meghan call for stronger social medi...,The Duke and Duchess of Sussex are calling for...,2025-04-25 07:00:07,BBC News,https://www.bbc.com/news/articles/cjewne81lq4o
2,Harry and Meghan call for stronger social medi...,The Duke and Duchess of Sussex are calling for...,2025-04-25 14:33:07,BBC News,https://www.bbc.com/news/articles/cjewne81lq4o
3,Harry and Meghan call for stronger social medi...,The Duke and Duchess of Sussex are calling for...,2025-04-24 21:07:07,BBC News,https://www.bbc.com/news/articles/cjewne81lq4o
4,Harry and Meghan call for stronger social medi...,The Duke and Duchess of Sussex are calling for...,2025-04-25 05:22:07,BBC News,https://www.bbc.com/news/articles/cjewne81lq4o


In [25]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Load the augmented articles
articles_df = pd.read_csv('augmented_articles.csv')
num_users = 20
users = [f'user_{i}' for i in range(1, num_users + 1)]

# Assign article IDs (1-based, in file order)
articles_df['article_id'] = range(1, len(articles_df) + 1)

rng = random.Random(42)  # Dedicated seeded generator so a re-run draws the same interactions
reference_time = datetime.now()  # Read once so every timestamp is offset from the same instant
article_ids = articles_df['article_id'].tolist()  # Built once instead of once per user

# Each user reads between 10 and 30 distinct articles. Both bounds are capped
# at the number of available articles, because sampling more items than exist
# raises ValueError.
min_reads = min(10, len(article_ids))
max_reads = min(30, len(article_ids))
interaction_columns = ['user_id', 'article_id', 'interaction_type', 'timestamp']

# Simulate interactions
interactions = []
for user in users:
    interacted_articles = rng.sample(article_ids, k=rng.randint(min_reads, max_reads))
    for article_id in interacted_articles:
        interactions.append({
            'user_id': user,
            'article_id': article_id,
            'interaction_type': 'read',
            # Whole-day offset: the interaction happened 0..30 days before reference_time.
            'timestamp': reference_time - timedelta(days=rng.randint(0, 30))
        })

# Passing the column list keeps the header row even when no interactions were generated.
interactions_df = pd.DataFrame(interactions, columns=interaction_columns)
interactions_df.to_csv('user_interactions.csv', index=False)
articles_df.to_csv('augmented_articles_with_ids.csv', index=False)

print("✅ Generated user_interactions.csv with", len(interactions_df), "rows.")


✅ Generated user_interactions.csv with 378 rows.
