In [3]:
# Team: Bharat Manivannan, Jiayang Wang, Sai Sneha Kakarla, Jaswanthi Mandalapu
# ============================================================
# DBMS PROJECT — FINAL VERSION WITH REAL AUTHORS AS MEMBERS
# ============================================================

import pandas as pd
import numpy as np
import random
import re
import ast
from datetime import datetime, timedelta

# Seed both RNGs up front so sampling and every generated value are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# ============================================================
# LOAD AND CLEAN INPUT DATA
# ============================================================

dblp_raw = pd.read_csv("dblp-v10.csv", engine="python", on_bad_lines="skip")

# Keep the five fields we need, turn each paper's id into a synthetic PDF link,
# then drop incomplete rows and repeated titles.
df = (
    dblp_raw[['title', 'abstract', 'authors', 'year', 'id']]
    .rename(columns={'id': 'paper_id_raw'})
    .assign(url=lambda frame: frame['paper_id_raw'].apply(
        lambda x: f"https://university.edu/papers/{x}.pdf"))
    .loc[:, ['title', 'abstract', 'authors', 'year', 'url']]
    .dropna(how="any")
    .drop_duplicates(subset=['title'])
)

# Only rows whose authors field looks like a list literal can be parsed below.
looks_like_list = df['authors'].str.startswith('[')
df = df[looks_like_list]

# Draw the 1500-paper working set (uses the seeded global NumPy RNG).
df_1500 = df.sample(1500).reset_index(drop=True)

# Parse the authors column from its string form into real Python lists.
df_1500['authors'] = df_1500['authors'].apply(ast.literal_eval)

In [4]:
# ============================================================
# EXTRACT UNIQUE REAL AUTHORS FROM DBLP
# ============================================================

# Flatten every paper's author list into one alphabetically sorted list of
# distinct, whitespace-trimmed names.
real_authors = sorted({
    author.strip()
    for names_on_paper in df_1500['authors']
    for author in names_on_paper
})

print("Real authors extracted:", len(real_authors))

Real authors extracted: 4600


In [5]:
# ============================================================
# CREATE MEMBERS = REAL AUTHORS + EXTRA UNIVERSITY MEMBERS
# ============================================================

# Name pools for the synthetic (non-author) members. Order matters: the seeded
# random.choice calls index into these lists.
first_names = [
    "John", "Sarah", "Akhil", "David", "Maria", "Chen", "Ravi",
    "Anna", "Sofia", "Omar", "Ishan", "Emma", "Raj", "Julia",
    "Tom", "Lara", "Ethan", "Zhang", "Mei", "Carlos",
]

last_names = [
    "Patel", "Kim", "Singh", "Mandalapu", "Lopez", "Chen", "Kumar",
    "Garcia", "Smith", "Brown", "Nguyen", "Iyer", "Müller",
    "Rao", "Hernandez", "Jones", "Taylor", "Lee", "Davis",
]

# Ten-slot pool: a uniform pick yields 70% Student, 30% Faculty.
role_pool = ["Student"] * 7 + ["Faculty"] * 3


def _member_row(mid, full_name):
    """Build one Member row: [member_id, name, email, role, dept_id].

    The email is the lowercased name with spaces turned into dots, followed by
    the member id. Role and department (1-20) are drawn at random, in that order.
    """
    address = full_name.lower().replace(" ", ".") + f"{mid}@university.edu"
    picked_role = random.choice(role_pool)
    picked_dept = random.randint(1, 20)
    return [mid, full_name, address, picked_role, picked_dept]


member_list = []
member_id = 1

# Every real DBLP author becomes a member first ...
for name in real_authors:
    member_list.append(_member_row(member_id, name))
    member_id += 1

# ... followed by 300 made-up university members.
for _ in range(300):
    name = f"{random.choice(first_names)} {random.choice(last_names)}"
    member_list.append(_member_row(member_id, name))
    member_id += 1

member_df = (
    pd.DataFrame(member_list,
                 columns=["member_id", "name", "email", "role", "dept_id"])
    .drop_duplicates(subset=['email'])
    .reset_index(drop=True)
)

# Renumber so member_id stays gap-free after de-duplication.
member_df['member_id'] = range(1, len(member_df) + 1)

print("Final members count:", len(member_df))

Final members count: 4900


In [6]:
# ============================================================
# CREATE DEPARTMENT TABLE
# ============================================================

# Department names; a department's id is its 1-based position in this list.
departments = [
    "Computer Science",
    "Electrical Engineering",
    "Mechanical Engineering",
    "Civil Engineering",
    "Mathematics",
    "Physics",
    "Chemistry",
    "Biotechnology",
    "Information Systems",
    "Data Science",
    "Business Analytics",
    "Economics",
    "Psychology",
    "Aerospace Engineering",
    "Robotics",
    "AI & Machine Learning",
    "Biomedical Engineering",
    "Material Science",
    "Statistics",
    "Cybersecurity",
]

# Pair each name with its id (1..20) to form the Department table.
dept_df = pd.DataFrame(
    list(enumerate(departments, start=1)),
    columns=["dept_id", "dept_name"],
)

In [7]:
# ============================================================
# PAPER TABLE (1500 papers)
# ============================================================

def _parse_year(value):
    """Return ``value`` as a non-negative whole-number year, or None.

    Accepts ints, digit strings and whole floats such as ``2006.0`` (which is
    what pandas yields when the column was read as float). A plain
    ``str(value).isdigit()`` check rejects those floats, so real years would be
    replaced by random ones.
    """
    try:
        as_float = float(value)
        as_int = int(as_float)  # raises on NaN / infinity
    except (TypeError, ValueError, OverflowError):
        return None
    if as_int != as_float or as_int < 0:
        return None
    return as_int


# One upload date for the whole batch, so a run that straddles midnight
# cannot produce two different dates.
batch_upload_date = datetime.now().date()

papers = []
for i, row in df_1500.iterrows():
    # Fall back to a random year only when the source value is unusable.
    published_year = _parse_year(row['year'])
    if published_year is None:
        published_year = random.randint(2000, 2024)

    papers.append([
        i + 1,                          # paper_id: 1-based, relies on the reset index
        str(row['title'])[:250],        # str() for parity with the abstract handling
        str(row['abstract'])[:5000],
        published_year,
        random.randint(1, 20),          # dept_id
        random.randint(1, 4),           # category_id
        batch_upload_date,
        row['url']
    ])

paper_df = pd.DataFrame(papers, columns=[
    "paper_id", "title", "abstract", "published_year",
    "dept_id", "category_id", "upload_date", "pdf_link"
])

valid_paper_ids = list(paper_df['paper_id'])

In [8]:
# Preview the first ten rows of the generated Paper table.
paper_df.head(10)

Unnamed: 0,paper_id,title,abstract,published_year,dept_id,category_id,upload_date,pdf_link
0,1,On the Georgiou-Lindquist approach to constrai...,We consider the Georgiou-Lindquist constrained...,2006,4,2,2025-12-02,https://university.edu/papers/4e8c562e-4e20-45...
1,2,DC/sup 2/ scheduling for aperiodic tasks in st...,The strongly partitioned real time system (SP-...,2000,17,3,2025-12-02,https://university.edu/papers/4c0b0839-78df-40...
2,3,Evolutionary learning of flexible neuro-fuzzy ...,"In the paper the evolutionary strategy (mu, la...",2008,19,4,2025-12-02,https://university.edu/papers/4fdcdf15-2b44-49...
3,4,How to Determine Output Schemas of XQuery Queries,The XQuery language is the standard query lang...,2007,4,3,2025-12-02,https://university.edu/papers/4eb105c8-cbe2-44...
4,5,The effects of periodic and continuous market ...,Simulation experiments are conducted on simple...,2008,19,3,2025-12-02,https://university.edu/papers/546e5cac-edec-42...
5,6,Species-appropriate computer mediated interaction,Given the importance of our non-human companio...,2009,18,3,2025-12-02,https://university.edu/papers/50ed49f8-702f-43...
6,7,A Local Information Exchange Based Coverage-Pr...,Coverage preserving solutions have been report...,2006,20,1,2025-12-02,https://university.edu/papers/5015a843-777d-4c...
7,8,Introducing Engineering in Elementary Educatio...,"Abstract#R##N##R##N#Engineering, when integrat...",2015,2,2,2025-12-02,https://university.edu/papers/54d3f8e9-4dcd-4b...
8,9,Research of Dispatching Method in Elevator Gro...,Elevator group control system (EGCS) with mult...,2009,11,4,2025-12-02,https://university.edu/papers/54fc0d1d-ad57-46...
9,10,Developing a review process for online resources,The democratization of content creation via ub...,2008,6,3,2025-12-02,https://university.edu/papers/4caf0a25-a0ea-44...


In [9]:
# ============================================================
# PAPER AUTHOR TABLE — REAL AUTHORS ONLY
# ============================================================

paper_author_list = []

# Build the name -> member_id lookup once instead of filtering member_df for
# every author. keep="first" mirrors the previous `iloc[0]` choice when a
# synthetic member happens to share a name with a real author.
member_id_by_name = (
    member_df.drop_duplicates(subset="name", keep="first")
             .set_index("name")["member_id"]
             .to_dict()
)

for idx, row in df_1500.iterrows():
    pid = idx + 1  # paper_id is the 1-based row position
    for author_name in row['authors']:
        # Member names were stored stripped, so strip here too; otherwise
        # authors with leading/trailing whitespace never match and are lost.
        mid = member_id_by_name.get(author_name.strip())
        if mid is not None:
            paper_author_list.append([pid, mid])

paper_author_df = pd.DataFrame(paper_author_list, columns=['paper_id', 'member_id'])
# The same author can be listed twice on one paper; keep each pair once.
paper_author_df = paper_author_df.drop_duplicates()

In [10]:
# ============================================================
# KEYWORDS + PAPERKEYWORD TABLE
# ============================================================

# Distinct keywords collected from all paper titles (filled in below).
keyword_set = set()

def extract_keywords(title):
    """Return up to five lowercase keywords taken from ``title``.

    A keyword is a run of at least five ASCII letters bounded by word
    boundaries. Keywords come back in order of appearance, and repeats are not
    removed. ``title`` is passed through ``str()`` first.
    """
    long_words = re.findall(r'\b[a-zA-Z]{5,}\b', str(title))
    leading_five = long_words[:5]
    return [word.lower() for word in leading_five]

# Collect the vocabulary: every keyword extracted from any paper title.
for paper_title in paper_df['title']:
    keyword_set.update(extract_keywords(paper_title))

# A keyword's id is its 1-based position in the alphabetically sorted vocabulary.
ordered_keywords = sorted(keyword_set)
kw_map = {word: position for position, word in enumerate(ordered_keywords, start=1)}

keyword_df = pd.DataFrame({
    "keyword_id": range(1, len(ordered_keywords) + 1),
    "keyword": ordered_keywords
})

# Link each paper to the ids of the keywords found in its title.
paper_keyword_list = [
    [paper_row['paper_id'], kw_map[word]]
    for _, paper_row in paper_df.iterrows()
    for word in extract_keywords(paper_row['title'])
    if word in kw_map
]

# A title may repeat a word; keep each (paper, keyword) pair only once.
paper_keyword_df = pd.DataFrame(
    paper_keyword_list,
    columns=['paper_id', 'keyword_id'],
).drop_duplicates()


In [11]:
# ============================================================
# COMMENT TABLE — MANY COMMENTS PER PAPER
# ============================================================

valid_member_ids = member_df['member_id'].tolist()

# Canned comment bodies; one is picked at random for every comment.
comment_texts = [
    "Very interesting work!",
    "Could be improved in methodology.",
    "Excellent contribution.",
    "I don’t agree with the assumptions.",
    "The results are very strong.",
    "Needs more experiments.",
]

comment_list = []
comment_id = 1

# Each paper gets between 1 and 10 comments, written by random members at a
# random point within the last 400 days.
for pid in valid_paper_ids:
    for _ in range(random.randint(1, 10)):
        commenter_id = random.choice(valid_member_ids)
        body = random.choice(comment_texts)
        age_in_days = random.randint(1, 400)
        posted_at = datetime.now() - timedelta(days=age_in_days)

        comment_list.append([comment_id, pid, commenter_id, body, posted_at])
        comment_id += 1

comment_df = pd.DataFrame(
    comment_list,
    columns=['comment_id', 'paper_id', 'member_id', 'comment_text', 'timestamp'],
)


In [12]:
# ============================================================
# REVIEW TABLE — 1–3 REVIEWS PER PAPER, FACULTY ONLY
# ============================================================

# Only members whose role is "Faculty" may act as reviewers.
is_faculty = member_df['role'] == "Faculty"
faculty_ids = member_df.loc[is_faculty, 'member_id'].tolist()

feedback_options = ["Good paper", "Needs improvement", "High impact", "Weak methodology"]

review_list = []
review_id = 1

for pid in valid_paper_ids:
    # 1-3 distinct reviewers per paper (random.sample never repeats a pick).
    reviewers = random.sample(faculty_ids, random.randint(1, 3))

    for fid in reviewers:
        score = random.randint(1, 10)
        feedback = random.choice(feedback_options)
        review_list.append([review_id, pid, fid, score, feedback, datetime.now().date()])
        review_id += 1

review_df = pd.DataFrame(
    review_list,
    columns=['review_id', 'paper_id', 'faculty_id', 'score', 'feedback', 'review_date'],
)


In [13]:
# ============================================================
# SQL EXPORT FUNCTION
# ============================================================

def export_to_sql(df, table_name, escape_backslashes=True):
    """Render every row of ``df`` as one ``INSERT INTO <table> VALUES (...);`` line.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to export; values are emitted in column order.
    table_name : str
        Table name, inserted verbatim into each statement.
    escape_backslashes : bool, default True
        Double every backslash in string values so it survives as a literal
        character instead of acting as an escape (a trailing backslash would
        otherwise swallow the closing quote).
        NOTE(review): this assumes the files are loaded by MySQL/MariaDB in the
        default sql_mode, which the double-quoted literals suggest. Pass False
        for engines that treat backslashes literally.

    Returns
    -------
    list[str]
        One INSERT statement per row, without trailing newline.

    Notes
    -----
    * Missing values (None / NaN / NaT / pd.NA) are written as an unquoted
      ``NULL`` rather than the strings ``"nan"`` / ``"None"``.
    * Rows are read with ``itertuples`` so each value keeps its column's type;
      ``iterrows`` would upcast ints to floats in mixed numeric frames and
      print ``"1.0"`` instead of ``"1"``.
    * Double quotes inside strings are replaced by single quotes, as before,
      so they cannot terminate the literal (this is lossy by design).
    """
    lines = []
    for row in df.itertuples(index=False, name=None):
        vals = []
        for v in row:
            # is_scalar guards pd.isna against list-like cells, for which it
            # would return an array instead of a bool.
            if pd.api.types.is_scalar(v) and pd.isna(v):
                vals.append("NULL")
                continue
            if isinstance(v, str):
                if escape_backslashes:
                    v = v.replace("\\", "\\\\")
                v = v.replace('"', "'")
            vals.append(f'"{v}"')
        lines.append(f"INSERT INTO {table_name} VALUES ({','.join(vals)});")
    return lines

# ============================================================
# WRITE SQL FILES
# ============================================================

# Lookup table for the category_id values (1-4) assigned to papers.
category_df = pd.DataFrame({
    "category_id": [1, 2, 3, 4],
    "category_name": ["Reviewed", "Unreviewed", "Survey", "Conference"]
})

# Output file name -> (frame to export, SQL table name).
tables = {
    "Department.sql": (dept_df, "Department"),
    "Member.sql": (member_df, "Member"),
    "Category.sql": (category_df, "Category"),
    "Paper.sql": (paper_df, "Paper"),
    "PaperAuthor.sql": (paper_author_df, "PaperAuthor"),
    "Keyword.sql": (keyword_df, "Keyword"),
    "PaperKeyword.sql": (paper_keyword_df, "PaperKeyword"),
    "Comment.sql": (comment_df, "Comment"),
    "Review.sql": (review_df, "Review")
}

for fname, (df_out, tname) in tables.items():
    # Write UTF-8 explicitly: names, titles and comments contain non-ASCII
    # characters, and the platform default encoding (e.g. cp1252 on Windows)
    # can raise UnicodeEncodeError or produce files the database misreads.
    with open(fname, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in export_to_sql(df_out, tname))

print("Thje code ensures: ALL SQL FILES GENERATED SUCCESSFULLY!")
print("Authors ⊆ Members")
print("Max 3 reviews per paper")
print("Many keywords per paper")
print("Many comments per paper")
print("No duplicates, no FK issues")

Thje code ensures: ALL SQL FILES GENERATED SUCCESSFULLY!
Authors ⊆ Members
Max 3 reviews per paper
Many keywords per paper
Many comments per paper
No duplicates, no FK issues
