In [None]:
from data

In [30]:
import psycopg2
import time
import statistics
from datasets import load_dataset
from config import load_config

def create_table():
    """Create the ``corpus`` table if it does not already exist.

    The table has an auto-generated integer primary key ``sentence_id`` and a
    non-null ``sentence`` text column. Connection parameters come from
    ``load_config()``.

    Errors are reported on stdout rather than raised (best-effort, as before),
    but the success message is now printed only when the statement actually
    ran and the transaction was committed.
    """
    cmd = """ CREATE TABLE IF NOT EXISTS corpus (
        sentence_id INTEGER PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
        sentence TEXT NOT NULL
        )"""
    conn = None
    try:
        config = load_config()
        conn = psycopg2.connect(**config)
        # In psycopg2, ``with conn`` only scopes a transaction (commit on
        # success, rollback on error); it does NOT close the connection.
        with conn:
            with conn.cursor() as cur:
                cur.execute(cmd)
    except Exception as error:  # psycopg2.DatabaseError is a subclass of Exception
        print(error)
    else:
        print("Table corpus created")
    finally:
        # Explicitly release the connection, which the ``with`` block leaves open.
        if conn is not None:
            conn.close()

def insert_sentences(ds_chunk):
    """Insert the ``"text"`` field of every example in *ds_chunk* into ``corpus``.

    Args:
        ds_chunk: An iterable of mapping-like examples, each providing a
            ``"text"`` entry (here: a slice of the loaded dataset).

    All rows of the chunk are written in a single transaction. Errors are
    printed to stdout and not re-raised (best-effort, as before). The database
    connection is always closed before returning, so repeated calls do not
    accumulate open connections.
    """
    sql = """INSERT INTO corpus (sentence) VALUES (%s)"""
    # One single-element tuple per row, as required for parameter binding.
    rows = [(example["text"],) for example in ds_chunk]
    if not rows:
        # Nothing to insert; avoid opening a connection for no work.
        return

    conn = None
    try:
        config = load_config()
        conn = psycopg2.connect(**config)
        # ``with conn`` commits on success and rolls back on error, but it
        # does not close the connection (psycopg2 semantics).
        with conn:
            with conn.cursor() as cur:
                cur.executemany(sql, rows)
    except Exception as error:  # psycopg2.DatabaseError is a subclass of Exception
        print(error)
    finally:
        # This function is called once per chunk; without an explicit close
        # every call would leave a connection open.
        if conn is not None:
            conn.close()

if __name__ == '__main__':
    # Script entry point: create the table, load the dataset, insert it in
    # fixed-size chunks, and report per-chunk timing statistics.
    create_table()
    times = []
    ds = load_dataset("rojagtap/bookcorpus", split="train")
    chunk_size = 10000
    total_rows = len(ds)  # loop-invariant; compute once
    for i in range(0, total_rows, chunk_size):
        # perf_counter is monotonic and high-resolution, so measured intervals
        # are not affected by system clock adjustments.
        start_time = time.perf_counter()
        chunk = ds.select(range(i, min(i + chunk_size, total_rows)))
        insert_sentences(chunk)
        end_time = time.perf_counter()
        times.append(end_time - start_time)

    print("Finished storing textual data")
    if times:
        min_time = min(times)
        max_time = max(times)
        avg_time = sum(times) / len(times)
        # stdev needs at least two samples.
        std_dev = statistics.stdev(times) if len(times) > 1 else 0
        print(f"Minimum time: {min_time:.6f} seconds")
        print(f"Maximum time: {max_time:.6f} seconds")
        print(f"Average time: {avg_time:.6f} seconds")
        print(f"Standard deviation: {std_dev:.6f} seconds")
    else:
        # An empty dataset produces no chunks; min()/max() and the average
        # would otherwise raise on the empty list.
        print("No chunks were processed; no timing statistics available")
    

Table corpus created
Finished storing textual data
Minimum time: 0.471947 seconds
Maximum time: 1.614666 seconds
Average time: 0.925559 seconds
Standard deviation: 0.070079 seconds
