In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input and output locations, relative to the notebook's working directory.
RAW_DIR = Path("../data/raw")                # raw CSVs are read from here
CLEAN_DIR = Path("../data/preprocessed")     # cleaned CSVs are written here

# Create the output directory (and any missing parents); no error if it already exists.
CLEAN_DIR.mkdir(exist_ok=True, parents=True)

In [4]:
# Read the three raw tables from RAW_DIR, in the order papers, users, interactions.
papers_raw, users_raw, interactions_raw = (
    pd.read_csv(RAW_DIR / csv_name)
    for csv_name in ("papers.csv", "users.csv", "interactions.csv")
)

# Preview the first rows of each raw table (shown as a tuple of frames).
tuple(raw_table.head() for raw_table in (papers_raw, users_raw, interactions_raw))

(   paper_id                                    title  \
 0         0              Military staff happy event.   
 1         1  Possible standard former whether smile.   
 2         2               Shake evidence yeah cover.   
 3         3       Customer lay politics sure pretty.   
 4         4        Write animal forward dark tax if.   
 
                                             abstract  year topic_primary  
 0  Senior nor ahead consider. Success light capit...  2005  MentalHealth  
 1  Maintain hair general let. Character material ...  2001           HIV  
 2  Season education easy space argue. Stage inter...  2001           MCH  
 3  Detail herself easy miss red. Nor arm line for...  2011           CVD  
 4  Health memory budget matter simply set. None c...  2015           NCD  ,
    user_id research_focus_1 research_focus_2 experience_level  country
 0        0      Respiratory              NCD           junior  Germany
 1        1           Cancer              MCH          

In [5]:
# Build the cleaned papers table from the raw one without touching `papers_raw`.
papers = (
    papers_raw
    .assign(
        # Abstracts: missing -> "", force to str, trim the ends, then collapse
        # every run of whitespace (tabs, newlines, repeated spaces) to one space.
        abstract=lambda frame: (
            frame["abstract"]
            .fillna("")
            .astype(str)
            .str.strip()
            .str.replace(r"\s+", " ", regex=True)
        )
    )
    # Keep the first row for each paper_id and renumber the index from 0.
    .drop_duplicates(subset=["paper_id"])
    .reset_index(drop=True)
    # Cast year to int after de-duplication, matching the original step order.
    .assign(year=lambda frame: frame["year"].astype(int))
)

papers.head()

# Build the cleaned users table from the raw one without touching `users_raw`.
users = (
    users_raw
    .assign(**{
        # Strip surrounding whitespace from each categorical field.
        # A plain `.astype(str)` would turn missing values (NaN/None) into the
        # literal strings "nan"/"None", which then look like real categories;
        # `.where(notna)` puts the missing marker back for those rows.
        column: (
            users_raw[column]
            .astype(str)
            .str.strip()
            .where(users_raw[column].notna())
        )
        for column in (
            "research_focus_1",
            "research_focus_2",
            "experience_level",
            "country",
        )
    })
    # Keep the first row for each user_id and renumber the index from 0.
    .drop_duplicates(subset=["user_id"])
    .reset_index(drop=True)
)

users.head()
# Build the cleaned interactions table in one chain. Each step returns a new
# frame, so no column is ever assigned on the result of `dropna` (which can
# raise SettingWithCopyWarning on some pandas versions).
inter = (
    interactions_raw
    # Parse timestamps; unparseable values become NaT instead of raising.
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"], errors="coerce"))
    # Drop rows where timestamp conversion failed (or the timestamp was missing).
    .dropna(subset=["timestamp"])
    # Normalise the event name to stripped text. `.astype(str)` alone would turn
    # a missing event into the literal string "nan" and have it counted as an
    # event type, so missing values are restored with `.where(notna)`.
    .assign(
        event=lambda frame: (
            frame["event"].astype(str).str.strip().where(frame["event"].notna())
        )
    )
)

# Keep only interactions whose user_id and paper_id both exist in the cleaned
# users / papers tables, then renumber the index from 0.
valid_inter = inter[
    inter["user_id"].isin(users["user_id"])
    & inter["paper_id"].isin(papers["paper_id"])
].reset_index(drop=True)

valid_inter.head()



Unnamed: 0,user_id,paper_id,event,timestamp,label
0,60,21,click,2024-03-05 06:23:17,1
1,38,760,open_pdf,2025-01-01 23:28:11,0
2,179,359,click,2025-08-18 12:13:33,1
3,66,986,click,2025-03-14 18:53:32,1
4,140,624,click,2025-06-20 03:57:24,1


In [6]:
# Paper recency feature (for ranking later): the number of years between the
# fixed reference year CURRENT_YEAR and each paper's publication year.
CURRENT_YEAR = 2025
papers = papers.assign(paper_recency=CURRENT_YEAR - papers["year"])

papers.head()

Unnamed: 0,paper_id,title,abstract,year,topic_primary,paper_recency
0,0,Military staff happy event.,Senior nor ahead consider. Success light capit...,2005,MentalHealth,20
1,1,Possible standard former whether smile.,Maintain hair general let. Character material ...,2001,HIV,24
2,2,Shake evidence yeah cover.,Season education easy space argue. Stage inter...,2001,MCH,24
3,3,Customer lay politics sure pretty.,Detail herself easy miss red. Nor arm line for...,2011,CVD,14
4,4,Write animal forward dark tax if.,Health memory budget matter simply set. None c...,2015,NCD,10


In [7]:


# Write each cleaned table to CLEAN_DIR as CSV, without the index column.
for cleaned_frame, csv_name in (
    (papers, "papers_cleaned.csv"),
    (users, "users_cleaned.csv"),
    (valid_inter, "interactions_cleaned.csv"),
):
    cleaned_frame.to_csv(CLEAN_DIR / csv_name, index=False)

# Confirmation message: a header line followed by one line per written file.
print(
    "Saved cleaned data:",
    " - papers_cleaned.csv",
    " - users_cleaned.csv",
    " - interactions_cleaned.csv",
    sep="\n",
)

Saved cleaned data:
 - papers_cleaned.csv
 - users_cleaned.csv
 - interactions_cleaned.csv


In [8]:
# Shape (rows, columns) of each cleaned table.
for table_label, table in (
    ("Papers:", papers),
    ("Users:", users),
    ("Interactions:", valid_inter),
):
    print(table_label, table.shape)

# Distinct primary topics, in order of first appearance.
topic_values = papers["topic_primary"].unique()
print("\nUnique topics:", topic_values)

# Number of interactions for each event type.
event_counts = valid_inter["event"].value_counts()
print("\nEvents distribution:\n", event_counts)


Papers: (1000, 6)
Users: (200, 5)
Interactions: (3000, 5)

Unique topics: ['MentalHealth' 'HIV' 'MCH' 'CVD' 'NCD' 'TB' 'Respiratory' 'Cancer']

Events distribution:
 event
click       1797
open_pdf     622
save         581
Name: count, dtype: int64
