<a href="https://colab.research.google.com/github/Sumitk99/122ec0011/blob/master/winePredictor_(3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade weaviate-client rapidfuzz openai tqdm pandas -q

!pip install -q transformers accelerate


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.0/433.0 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.1/755.1 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import weaviate, pandas as pd, time, re
from rapidfuzz import process
from google.colab import files, userdata
from weaviate.auth import AuthApiKey
from openai import OpenAI
from tqdm import tqdm


# Credentials are read from the Colab secret store (google.colab.userdata).
# Both keys are stripped so stray whitespace pasted into the secret value
# does not end up inside an auth header.
WEAVIATE_API_KEY = userdata.get("Weaviate").strip()
WEAVIATE_URL     = "https://5yinmh2nt6oomas2p5uxaw.c0.asia-southeast1.gcp.weaviate.cloud"
OPENAI_API_KEY   = userdata.get("OpenAI").strip()


# One OpenAI client for embeddings/chat, one Weaviate client for storage.
# The OpenAI key is also forwarded to Weaviate through a request header.
client_openai = OpenAI(api_key=OPENAI_API_KEY)
client = weaviate.connect_to_weaviate_cloud(
    cluster_url      = WEAVIATE_URL,
    auth_credentials = AuthApiKey(WEAVIATE_API_KEY),
    headers          = {"X-OpenAI-Api-Key": OPENAI_API_KEY},
)
collection = client.collections.get("LWIN")
print("✅ Connected to Weaviate")

print("📥 Upload your CSV …")
uploaded = files.upload()
if not uploaded:
    # An empty upload would otherwise surface as a bare StopIteration below.
    raise ValueError("No file was uploaded.")
csv_path = next(iter(uploaded))

# The prompt asks for a CSV, but spreadsheets are uploaded too: pick the
# pandas reader that matches the file extension instead of always assuming
# an Excel workbook.
if csv_path.lower().endswith(".csv"):
    df = pd.read_csv(csv_path)
else:
    df = pd.read_excel(csv_path)

def to_graphql_name(col):
    """Turn a column header into an identifier made of ``[0-9a-zA-Z_]`` only.

    The header is first coerced to ``str`` so that non-string headers (for
    example an integer or date header read from a spreadsheet) do not raise.
    Surrounding whitespace is removed, each run of whitespace and/or hyphens
    becomes a single underscore, and every remaining character outside
    ``[0-9a-zA-Z_]`` is dropped.  If the result does not start with a letter
    or underscore (this includes the empty string) an underscore is prefixed.

    Args:
        col: The original column header; any object with a ``str()`` form.

    Returns:
        The sanitised name as a ``str``.
    """
    name = str(col).strip()
    name = re.sub(r'[\s\-]+', '_', name)
    name = re.sub(r'[^0-9a-zA-Z_]', '', name)
    if not re.match(r'^[_A-Za-z]', name):
        name = '_' + name
    return name

# Rewrite every header through to_graphql_name before the columns are used
# as property keys further down.
df.columns = list(map(to_graphql_name, df.columns))
def infer_column_type(series):
    """Classify *series* as ``"number"`` or ``"string"``.

    The non-null values are handed to ``pd.to_numeric``; if that conversion
    succeeds the column counts as numeric, and any exception raised along
    the way marks it as a string column.
    """
    try:
        present = series.dropna()
        pd.to_numeric(present)
    except Exception:
        kind = "string"
    else:
        kind = "number"
    return kind

# Classify each column once as "number" or "string".
column_types = {name: infer_column_type(df[name]) for name in df.columns}

# Fill the gaps: missing numeric cells become 0, missing string cells ''.
for name in df.columns:
    if column_types[name] != "number":
        df[name] = df[name].fillna('')
    else:
        df[name] = df[name].where(df[name].notnull(), 0)

wine_data = []
texts = []

# Build one property dict per row, plus a flat "key: value | ..." summary
# that is stored under 'text' and later sent to the embedding model.
for _, record in df.iterrows():
    props = {}
    for name, cell in record.items():
        if column_types[name] != "number":
            props[name] = str(cell)
            continue
        try:
            props[name] = None if cell == '' else float(cell)
        except Exception:
            props[name] = None
    summary = " | ".join(f"{key}: {val}" for key, val in props.items())
    props['text'] = summary
    wine_data.append(props)
    texts.append(summary)

print(f"📝 Prepared {len(wine_data)} rows")

# Number of rows embedded per OpenAI request.
BATCH = 500
# Rows that still failed after all insert attempts.
skipped_rows = 0

for start in tqdm(range(0, len(wine_data), BATCH), desc="🔄 Uploading"):
    batch_objs = wine_data[start:start+BATCH]
    batch_txts = texts[start:start+BATCH]

    # One embedding request per batch; vectors are paired with the rows
    # by position.
    resp    = client_openai.embeddings.create(
        input=batch_txts, model="text-embedding-3-small"
    )
    vectors = [d.embedding for d in resp.data]

    for obj, vec in zip(batch_objs, vectors):
        # Up to three attempts per row, pausing one second between tries.
        for attempt in range(3):
            try:
                collection.data.insert(properties=obj, vector=vec)
                break
            except Exception as e:
                if attempt == 2:
                    # Column names were sanitised earlier, so a hyphenated
                    # key such as 'wine-name' can never be present; label the
                    # row with the first non-empty name-like property.
                    label = next(
                        (v for k, v in obj.items() if 'name' in k.lower() and v),
                        'Unknown',
                    )
                    skipped_rows += 1
                    print(f"❌ skipped {label} → {e}")
                else:
                    time.sleep(1)

# Only claim full success when nothing was skipped.
if skipped_rows:
    print(
        f"⚠️  Pushed {len(wine_data) - skipped_rows} of {len(wine_data)} rows "
        f"to Weaviate ({skipped_rows} skipped)"
    )
else:
    print("✅ All data pushed to Weaviate")

# The first column whose (case-folded) header contains "name" is taken as
# the wine-name column; without one the fuzzy corrector has nothing to use.
wine_name_col = next(
    (header for header in df.columns if 'name' in header.lower()),
    None,
)
if wine_name_col is None:
    raise ValueError("No column found for wine names!")

# Distinct, non-null values of that column: the fuzzy-match vocabulary.
wine_names = df[wine_name_col].dropna().unique().tolist()

def fuzzy_correct(token: str) -> str | None:
    """Return the best fuzzy match for *token* among ``wine_names``.

    ``process.extractOne`` is called with ``score_cutoff=85``; when it
    yields no result, ``None`` is returned instead of a name.
    """
    best = process.extractOne(token, wine_names, score_cutoff=85)
    if not best:
        return None
    # The first element of the result is the matched choice itself.
    return best[0]

def gpt_correct(sentence: str) -> str:
    """Ask the chat model to fix the spelling of the wine term in *sentence*.

    Args:
        sentence: The user's query, sent verbatim as the user message.

    Returns:
        The model's reply with surrounding whitespace removed.  When the
        reply carries no text (``content`` is ``None`` or blank) the original
        *sentence* is returned unchanged, so callers never receive an empty
        query and never hit an ``AttributeError`` on ``None``.
    """
    resp = client_openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",
             "content": "You are a spell corrector for wine names. "
                        "Only fix the wine term; keep the rest unchanged."},
            {"role": "user", "content": sentence}
        ]
    )
    content = resp.choices[0].message.content
    if content is None or not content.strip():
        return sentence
    return content.strip()

def smart_correct(user_q: str) -> str:
    """Correct wine-name typos in *user_q*.

    Each token (a run of ASCII letters, hyphens and apostrophes) is passed to
    ``fuzzy_correct``.  A token is replaced when a match is found that differs
    from the token ignoring case.  If no token was replaced, the whole query
    is handed to ``gpt_correct`` instead.

    The substitution is done in a single pass over the original query with a
    callback.  That way a matched name is inserted literally (backslashes in
    it are not treated as regex escapes), and text that was just inserted is
    never scanned again for further replacements.

    Args:
        user_q: The raw query typed by the user.

    Returns:
        The corrected query string.
    """
    changed = False

    def _swap(found):
        # Return the fuzzy match for this token, or the token itself.
        nonlocal changed
        token = found.group(0)
        candidate = fuzzy_correct(token)
        if candidate and candidate.lower() != token.lower():
            changed = True
            return candidate
        return token

    corrected = re.sub(r"[A-Za-z\-']+", _swap, user_q)
    return corrected if changed else gpt_correct(user_q)

def embed(q: str):
    """Return the ``text-embedding-3-small`` embedding of *q*.

    The query is sent as a one-element batch and the embedding of that
    single item is returned.
    """
    response = client_openai.embeddings.create(
        model="text-embedding-3-small",
        input=[q],
    )
    only_item = response.data[0]
    return only_item.embedding

# ── 🔍 INTERACTIVE LOOP ────────────────────────────────────────────
# Filter builder of the v4 Python client, needed by the exact-match fallback.
from weaviate.classes.query import Filter


def _join_context(objects):
    """Join the 'text' property of each object, skipping objects without one."""
    return "\n".join(
        text for o in objects if (text := o.properties.get("text"))
    )


print("\n🔍 Ask anything about wines. Type 'exit' to quit.\n")
while True:
    try:
        user_q = input("🍷 User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Stopping the cell / closing stdin ends the session without a traceback.
        print("\n👋 Exiting. Enjoy your wine!")
        break

    if user_q.lower() in {"exit", "quit"}:
        print("👋 Exiting. Enjoy your wine!")
        break
    if not user_q:
        # Nothing to correct, embed or search for.
        continue

    try:
        corrected_q = smart_correct(user_q)
        print("🔍 Corrected Query:", corrected_q)

        q_vec = embed(corrected_q)

        # Hybrid search: keyword query plus our own embedding vector.
        res = collection.query.hybrid(
            query=corrected_q,
            vector=q_vec,
            alpha=0.6,
            limit=10
        )

        context = _join_context(res.objects)
        if not context:
            print("⚠️  No context found, trying exact LWIN fallback…")
            # A 7-digit number in the query is treated as an LWIN7 code.
            id_match = re.search(r"\d{7}", corrected_q)
            if id_match:
                lwin_id = id_match.group(0)
                # v4 collections have no `with_where`; filtered reads go
                # through fetch_objects with a Filter.
                # NOTE(review): assumes `lwin7` is stored as text — confirm
                # against the collection schema.
                res = collection.query.fetch_objects(
                    filters=Filter.by_property("lwin7").equal(lwin_id),
                    limit=1,
                )
                context = _join_context(res.objects)

        if not context:
            context = "No relevant context found."

        messages = [
            {"role": "system",
             "content": "You are a wine expert. Answer precisely using the context. Do all the filtering according to user's query."},
            {"role": "user",
             "content": f"Context:\n{context}\n\nQuestion: {corrected_q}"}
        ]

        reply = client_openai.chat.completions.create(
            model="gpt-4",
            messages=messages
        ).choices[0].message.content
        # `content` may be None; print an empty answer rather than crash.
        answer = (reply or "").strip()

        print("🍷 Answer:", answer, "\n")

    except Exception as e:
        # Keep the session alive: report the failure and prompt again.
        print("❌ Error:", e, "\n")

✅ Connected to Weaviate
📥 Upload your CSV …


Saving wine-list (1).xlsx to wine-list (1) (13).xlsx
📝 Prepared 48 rows


🔄 Uploading: 100%|██████████| 1/1 [00:13<00:00, 13.93s/it]


✅ All data pushed to Weaviate

🔍 Ask anything about wines. Type 'exit' to quit.

🔍 Corrected Query: List Bordeaux wines from the 2010s
🍷 Answer: Here are the Bordeaux wines from the 2010s:

1. Wine Name: Chateau Petit-Village Pomerol 2010 (12x75cl)
   - Original Name: Petit Village  2010
   - Vintage: 2010
   - Case Size: 12x75cl
   - Country: France
   - Region: Bordeaux
   - Sub-Region: Pomerol
   - Producer: Chateau Petit-Village

2. Wine Name: Chateau Haut-Bailly Cru Classe Pessac-Leognan 2010 (12x75cl)
   - Original Name: Haut Bailly Cru Classe Graves 2010
   - Vintage: 2010
   - Case Size: 12x75cl
   - Country: France
   - Region: Bordeaux
   - Sub-Region: Pessac-Leognan
   - Producer: Chateau Haut-Bailly 

🔍 Corrected Query: List Bordeaux **wines** from the 2010s to 2015
🍷 Answer: Given the context provided, here are the Bordeaux wines from the 2010s:

1. Chateau Petit-Village Pomerol 2010 by Chateau Petit-Village. This is a Pomerol wine from the Bordeaux region of France.

2. C

KeyboardInterrupt: Interrupted by user