In [47]:
import os
import json
import psycopg2
from psycopg2.extras import register_default_json
from langdetect import detect, LangDetectException
import google.generativeai as genai
from google.generativeai import types

In [48]:
# Connection settings passed as keyword arguments to psycopg2.connect().
# Each value can be overridden through an environment variable; the literals
# are fallbacks that keep the previous behaviour when nothing is set.
# NOTE: os.getenv only sees variables already present in the process
# environment when this cell runs.
DB_CONFIG = {
    'user': os.getenv('DB_USER', 'admin'),
    'host': os.getenv('DB_HOST', 'raghuserver'),
    'database': os.getenv('DB_NAME', 'SII'),
    # SECURITY: a password literal in a notebook ends up in version control and
    # in shared copies. Set DB_PASSWORD in the environment and remove this
    # fallback (and rotate the password) as soon as possible.
    'password': os.getenv('DB_PASSWORD', 'raghu@123'),
    # Environment values are strings, so convert to keep the port an int.
    'port': int(os.getenv('DB_PORT', '5432')),
}

In [49]:
def connect_db(config):
    """
    Open a PostgreSQL connection and register the default JSON typecaster on it.

    Args:
        config: mapping of keyword arguments forwarded to psycopg2.connect().

    Returns:
        The open connection, or None if connecting or the typecaster
        registration failed (the error is printed, not raised).
    """
    conn = None
    try:
        conn = psycopg2.connect(**config)
        register_default_json(conn)
        return conn
    except Exception as e:
        print(f"Error connecting to database: {e}")
        # If the connection was opened but a later step failed, close it so the
        # caller (who only receives None) does not leak a live session.
        if conn is not None:
            try:
                conn.close()
            except Exception:
                # Best-effort cleanup; the original failure was already reported.
                pass
        return None
    

In [50]:
import re, json

def extract_json(text):
    """
    Extract the first valid JSON object from a string.

    Markdown code fences (``` or ```json) are removed first. Each '{' in the
    remaining text is then tried, left to right, as the start of an object and
    the first one that parses is returned. Text after the object (including
    further braces or a second object) no longer invalidates the result.

    Args:
        text: arbitrary text that may embed a JSON object.

    Returns:
        The substring of the fence-stripped text that forms the object.

    Raises:
        ValueError: "No JSON object found" when there is no '{' followed by a
            '}' at all; "Invalid JSON: ..." when no '{' starts a parseable
            object.
    """
    # Remove any ``` or ```json fences
    text = re.sub(r'```(?:json)?\n?', '', text)

    start = text.find('{')
    # Same "nothing brace-delimited here" condition the regex search used.
    if start == -1 or text.rfind('}') < start:
        raise ValueError("No JSON object found")

    decoder = json.JSONDecoder()
    first_error = None
    while start != -1:
        try:
            # raw_decode parses one document and ignores whatever follows it.
            _, length = decoder.raw_decode(text[start:])
        except json.JSONDecodeError as e:
            if first_error is None:
                first_error = e
            start = text.find('{', start + 1)
        else:
            return text[start:start + length]

    raise ValueError(f"Invalid JSON: {first_error}") from first_error


def _extract_json_array(text):
    """Return the first JSON array embedded in `text` as a list; raise ValueError if none parses."""
    # Remove any ``` or ```json fences
    text = re.sub(r'```(?:json)?\n?', '', text)
    decoder = json.JSONDecoder()
    start = text.find('[')
    while start != -1:
        try:
            # A document starting at '[' always decodes to a list.
            value, _ = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            start = text.find('[', start + 1)
            continue
        return value
    raise ValueError("No JSON array found")


def punctuate_and_split_model(text, model, max_attempts=3):
    """
    Ask the model to punctuate a review and split it into sentences.

    The prompt requests a JSON array, so the reply is scanned for an array
    (not an object); searching only for '{...}' would reject every
    well-formed reply.

    text: string containing a single review
    model: object exposing generate_content(prompt) whose result has a .text
    max_attempts: how many times to query the model before giving up
    Returns: list of sentences
    Raises: ValueError if no attempt yields a parseable JSON array
    """
    prompt = f"""
        You will receive a text. Your task is to:
        1. Insert appropriate punctuation without changing any words.
        2. Split the result into natural sentences.
        3. Return a JSON array of the punctuated sentences.

        RULES:
        • **Output must be valid JSON only**—no extra text, no explanations.
        • If the text cannot be split, return it as a single-element array.
        • Do not reorder, remove, or add any words.

        Input:
        {text}

        Output:
    """

    last_error = None
    for _ in range(max_attempts):
        try:
            response = model.generate_content(prompt)
            return _extract_json_array(response.text)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass, so this also covers
            # decode errors; remember the cause for the final exception.
            last_error = e

    raise ValueError(f"Failed to get valid JSON after {max_attempts} attempts") from last_error


In [51]:
from sentencex import segment

def punctuate_and_split_single(text, model):
    """
    Split one review into sentences with sentencex (English rules).

    text: string containing a single review
    model: accepted for signature parity with the model-based splitter; not used
    Returns: list of sentences, or [text] when segmentation raises
    """
    try:
        pieces = [piece for piece in segment('en', text)]
    except Exception as err:
        # Fall back to treating the whole review as one sentence.
        print(f"Error splitting sentences: {err}")
        return [text]
    else:
        return pieces


In [52]:
from dotenv import load_dotenv
# Run python-dotenv's loader before reading the key so a local .env file,
# if one exists, can supply it.
load_dotenv()

# API key used to configure the Gemini client; falls back to a placeholder
# string when GEMINI_API_KEY is not present in the environment.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "your-gemini-api-key-here")

def setup_gemini_api(model_name='gemini-2.0-flash-lite', api_key=None):
    """
    Configure the google.generativeai client and build a generative model.

    Args:
        model_name: Gemini model identifier; defaults to the model this
            notebook was written against.
        api_key: key to configure the client with; when None the module-level
            GEMINI_API_KEY is used.

    Returns:
        The genai.GenerativeModel instance for `model_name`.
    """
    key = GEMINI_API_KEY if api_key is None else api_key
    # Surface a missing key here instead of as an opaque API error later.
    if not key or key == "your-gemini-api-key-here":
        print("Warning: GEMINI_API_KEY is not set; Gemini requests are expected to be rejected")
    genai.configure(api_key=key)
    return genai.GenerativeModel(model_name)

In [53]:
from langdetect import detect, LangDetectException

def detect_language(text):
    """
    Detect the language of `text` with langdetect.

    Args:
        text: the string to classify.

    Returns:
        The language code reported by langdetect.detect, or 'unknown' when the
        input is empty / not a string or the detector raises
        LangDetectException.
    """
    # Nothing to classify: avoid calling the detector on empty or non-text input.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'

    # langdetect's algorithm is randomised unless a seed is set, so the same
    # short review can be labelled differently between runs. Pin the seed
    # (without overriding one the caller already chose) for reproducibility.
    from langdetect import DetectorFactory
    if DetectorFactory.seed is None:
        DetectorFactory.seed = 0

    try:
        return detect(text)
    except LangDetectException as e:
        print(f"Error detecting language: {e}")
        return 'unknown'

In [None]:
def has_punctuation(text):
    """Report whether `text` contains at least one of '.', '!' or '?'."""
    for mark in ('.', '!', '?'):
        if mark in text:
            return True
    return False

def _mark_review_processed(cur, review_id):
    """Flag one combined_reviews row as processed (caller commits)."""
    cur.execute("""
        UPDATE combined_reviews
        SET processed = TRUE
        WHERE id = %s
    """, (review_id,))


def process_reviews():
    '''
    Connects to the database, fetches unprocessed reviews, splits them into sentences, and stores the results.
    Can be imported and called from other scripts.

    Reviews are visited one at a time in ascending id order. Non-English
    reviews are marked processed without storing sentences. A review whose
    processing raises is rolled back, left unprocessed, and skipped for the
    rest of this run. The cursor and connection are always closed.
    Returns None.
    '''
    print("Connecting to database...")
    conn = connect_db(DB_CONFIG)
    if not conn:
        print("Failed to connect to database")
        return
    try:
        print("Setting up cursor...")
        cur = conn.cursor()
        try:
            processed = 0
            # Highest id already visited in this run. Only fetching ids above it
            # guarantees progress even when a review keeps failing (it stays
            # processed = FALSE and would otherwise be selected forever).
            last_id = None
            while True:
                cur.execute("""
                    SELECT id, body
                    FROM combined_reviews
                    WHERE processed = FALSE
                    AND body IS NOT NULL
                    AND length(trim(body)) > 0
                    AND (%s IS NULL OR id > %s)
                    ORDER BY id
                    LIMIT 1
                """, (last_id, last_id))
                row = cur.fetchone()
                if not row:
                    print(f'No more reviews to process. Total processed: {processed}')
                    break
                review_id, body = row
                last_id = review_id
                print(f'\nProcessing review {review_id}: {body[:50]}...')
                try:
                    lang = detect_language(body)
                    if lang != 'en':
                        print(f'Skipping non-English review (detected: {lang})')
                        _mark_review_processed(cur, review_id)
                        # Commit right away: otherwise the mark is lost on a
                        # later rollback or when the connection is closed.
                        conn.commit()
                        continue
                    sentences = punctuate_and_split_single(body, None)
                    print(f'Split into {len(sentences)} sentences')
                    for idx, sent in enumerate(sentences, start=1):
                        cur.execute("""
                            INSERT INTO review_sentences (review_id, sentence_num, sentence)
                            VALUES (%s, %s, %s)
                        """, (review_id, idx, sent))
                    _mark_review_processed(cur, review_id)
                    # Sentences and the processed flag land in one transaction.
                    conn.commit()
                    processed += 1
                    print(f'Successfully processed review {review_id}')
                except Exception as e:
                    print(f'Error processing review {review_id}: {e}')
                    conn.rollback()
            print(f'\nFinished processing. Total reviews processed: {processed}')
        finally:
            cur.close()
    finally:
        conn.close()

Connecting to database...
Setting up cursor...

Processing review 59945: Happy new owner,After a couple winters using the K...
Split into 6 sentences
Successfully processed review 59945

Processing review 59946: I loved it,I loved it...
Split into 1 sentences
Successfully processed review 59946

Processing review 59947: All season bike,Good quality build and easy to use...
Split into 3 sentences
Successfully processed review 59947

Processing review 59948: Wahoo Kickr Bike,Wahoo Kickr Bike...
Skipping non-English review (detected: so)

Processing review 59949: Great bike! Very smooth riding.,Great bike! Very s...
Split into 5 sentences
Successfully processed review 59949

Processing review 59950: Pricey but great quality,Excellent bit of kit. Pre...
Split into 2 sentences
Successfully processed review 59950

Processing review 59951: A very good bike,I’ve been using the kickr bike fo...
Split into 4 sentences
Successfully processed review 59951

Processing review 59952: Indoor training 