In [7]:
import os
import time
import random
import pandas as pd
import requests
import numpy as np
import pickle
from tqdm import tqdm
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from concurrent.futures import ThreadPoolExecutor

# -------------------- CONFIG --------------------
RAW_FILE = "Textile_data2.txt.csv"
CLEANED_FILE = "Enhanced_Textile_Dataset.csv"
PRICING_ANALYSIS_FILE = "Pricing_Analysis_Report.csv"
OPTIMIZED_PRICING_FILE = "Optimized_Pricing_Dataset.csv"
FINAL_ML_PREDICTION_FILE = "Final_ML_Prediction_Dataset.csv"
MODEL_FILE = "trained_model.pkl"

# ✅ Load dataset
if not os.path.exists(RAW_FILE):
    raise FileNotFoundError(f"❌ File '{RAW_FILE}' not found!")

df = pd.read_csv(RAW_FILE)

# ✅ Normalize column names
df.columns = df.columns.str.strip().str.lower()

# ✅ Check for missing columns
required_columns = ["item", "cost price", "sale price", "mrp", "quality", "availability", "season", "location"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"❌ Missing columns: {missing_columns}")

# ✅ Remove missing values
df = df.dropna(subset=["item", "sale price"])

# ✅ Save cleaned dataset
df.to_csv(CLEANED_FILE, index=False)
print(f"✅ Cleaned dataset saved as '{CLEANED_FILE}'")

# -------------------- 1️⃣ PRICE SCRAPING (Amazon & Google) --------------------

def get_headers():
    ua = UserAgent()
    return {"User-Agent": ua.random}

def get_amazon_price(product_name):
    """Scrapes Amazon price"""
    try:
        url = f"https://www.amazon.in/s?k={product_name.replace(' ', '+')}"
        headers = get_headers()
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        price_element = soup.select_one("span.a-price-whole")
        return int(price_element.text.replace(",", "")) if price_element else None
    except:
        return None

def get_google_price(product_name):
    """Scrapes Google for Amazon prices"""
    try:
        search_url = f"https://www.google.com/search?q={product_name.replace(' ', '+')}+price+site:amazon.in"
        headers = get_headers()
        response = requests.get(search_url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        price_element = soup.find("div", class_="BNeawe iBp4i AP7Wnd")
        return int(price_element.text.replace("₹", "").replace(",", "")) if price_element else None
    except:
        return None

def get_price(row):
    row = row[1]  # Extract actual row data (Series)
    return get_amazon_price(row["item"]) or get_google_price(row["item"]) or max(row["sale price"] - 30, 0)

# ✅ Run price scraping in parallel
with ThreadPoolExecutor(max_workers=10) as executor:
    df["competitor price"] = list(tqdm(executor.map(get_price, df.iterrows()), total=len(df), desc="Fetching Competitor Prices"))

# ✅ Save updated dataset
df.to_csv(CLEANED_FILE, index=False)
print(f"✅ Competitor prices added and saved as '{CLEANED_FILE}'")

# -------------------- 2️⃣ PRICING ANALYSIS --------------------

df["price_difference"] = df["sale price"] - df["competitor price"]

def classify_price_status(row):
    if row["price_difference"] > 30:
        return "Overpriced"
    elif row["price_difference"] < -30:
        return "Underpriced"
    else:
        return "Competitive"

df["price_status"] = df.apply(classify_price_status, axis=1)

df.to_csv(PRICING_ANALYSIS_FILE, index=False)
print(f"✅ Pricing Analysis Report saved as {PRICING_ANALYSIS_FILE}")

# -------------------- 3️⃣ PRICE OPTIMIZATION --------------------

def adjust_price(row):
    sale_price = row["sale price"]
    competitor_price = row["competitor price"]
    cost_price = row["cost price"]

    if pd.isna(competitor_price) or competitor_price <= 0:
        return sale_price

    min_profit_margin = cost_price * 1.2

    if sale_price - competitor_price > 30:
        return max(competitor_price + 10, min_profit_margin)
    elif sale_price - competitor_price < -30:
        return min(competitor_price - 10, sale_price * 1.1)
    else:
        return sale_price

df["optimized price"] = df.apply(adjust_price, axis=1)

df.to_csv(OPTIMIZED_PRICING_FILE, index=False)
print(f"✅ Optimized pricing dataset saved as {OPTIMIZED_PRICING_FILE}")

# -------------------- 4️⃣ MACHINE LEARNING PRICE PREDICTION --------------------

def generate_seasonal_sales(row):
    base_sales = np.random.randint(50, 150)
    if "cotton" in row["item"].lower() and "summer" in row["season"].lower():
        return np.random.randint(200, 500)
    elif "sweater" in row["item"].lower() and "winter" in row["season"].lower():
        return np.random.randint(250, 600)
    return base_sales

df["total_sales"] = df.apply(generate_seasonal_sales, axis=1)

features = ["cost price", "sale price", "competitor price", "total_sales"]
target = "optimized price"

df = df.dropna(subset=features + [target])
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

with open(MODEL_FILE, "wb") as file:
    pickle.dump(model, file)

print(f"✅ Model saved successfully as {MODEL_FILE}!")

df.to_csv(FINAL_ML_PREDICTION_FILE, index=False)
print(f"✅ ML-Predicted pricing dataset saved as {FINAL_ML_PREDICTION_FILE}")


✅ Cleaned dataset saved as 'Enhanced_Textile_Dataset.csv'


Fetching Competitor Prices: 100%|██████████| 493/493 [02:50<00:00,  2.89it/s]

✅ Competitor prices added and saved as 'Enhanced_Textile_Dataset.csv'
✅ Pricing Analysis Report saved as Pricing_Analysis_Report.csv
✅ Optimized pricing dataset saved as Optimized_Pricing_Dataset.csv
✅ Model saved successfully as trained_model.pkl!
✅ ML-Predicted pricing dataset saved as Final_ML_Prediction_Dataset.csv



