In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
import random

class PricingDatasetGenerator:
    """Generate a synthetic resale-pricing dataset for used devices.

    Every sample combines a random device from the catalog, a random
    purchase date 1-4 years before ``current_date``, a random set of defects
    from the defect database, and a price derived from depreciation, defect
    impact, condition and a small amount of noise.

    All randomness comes from the module-level ``random`` generator, so call
    ``random.seed(...)`` before generating if a reproducible dataset is needed.
    """

    def __init__(self, device_catalog_path, defect_db_path):
        """Load the device catalog and the defect database from JSON files.

        Args:
            device_catalog_path: Path to a JSON file with a top-level
                ``devices`` list; each entry has ``brand`` and ``models``.
            defect_db_path: Path to a JSON file with a top-level ``defects``
                list.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(device_catalog_path, 'r', encoding='utf-8') as f:
            self.device_catalog = json.load(f)

        with open(defect_db_path, 'r', encoding='utf-8') as f:
            self.defect_db = json.load(f)['defects']
        # Fixed reference date ("today") used for all age calculations.
        self.current_date = datetime(2025, 1, 10)

        print(f"Loaded {len(self._get_all_devices())} device models")
        print(f" Loaded {len(self.defect_db)} defect types")

    def _get_all_devices(self):
        """Flatten the brand -> models catalog into a list of device dicts."""
        return [
            {
                'brand': brand_data['brand'],
                'model': model['name'],
                'original_price': model['original_price'],
                'release_year': model['release_year'],
            }
            for brand_data in self.device_catalog['devices']
            for model in brand_data['models']
        ]

    def _calculate_age_months(self, release_year, purchase_year):
        """Return the device age in whole months (at least 1).

        The purchase date is the first day of a random month of
        ``purchase_year``; a month is approximated as 30 days.

        NOTE(review): ``release_year`` is accepted but not used, so a sampled
        purchase year can precede the model's release year. Confirm whether
        the purchase year should be clamped to the release year.
        """
        purchase_date = datetime(purchase_year, random.randint(1, 12), 1)
        age = (self.current_date - purchase_date).days / 30
        return max(1, int(age))

    def _calculate_base_depreciation(self, original_price, age_months):
        """Return the price after age-based depreciation.

        Depreciation is linear up to 15% over the first 12 months, then a
        further 10% per additional year, capped at 70% in total.
        """
        if age_months <= 12:
            depreciation_rate = 0.15 * (age_months / 12)
        else:
            additional_years = (age_months - 12) / 12
            depreciation_rate = 0.15 + additional_years * 0.10

        depreciation_rate = min(depreciation_rate, 0.70)

        return original_price * (1 - depreciation_rate)

    def _select_defects(self):
        """Pick a random list of defects for one device.

        Roughly 40% of devices get no defect, 40% one, 15% two and 5% three
        or four. The number of defects is capped at the size of the defect
        database, and an empty database always yields an empty list.
        """
        available = len(self.defect_db)
        if available == 0:
            return []

        if random.random() < 0.40:
            return []

        if random.random() < 0.67:
            return [random.choice(self.defect_db)]

        if random.random() < 0.75:
            return random.sample(self.defect_db, min(2, available))

        # random.sample raises ValueError when asked for more items than
        # exist, so never request more than the database holds.
        return random.sample(self.defect_db, min(random.randint(3, 4), available))

    def _calculate_condition_score(self, defects):
        """Return a condition score in [0, 10].

        Defect-free devices score uniformly in [9, 10]. Otherwise the score
        is 10 minus the mean ``severity_score`` plus uniform jitter of +/-0.5,
        clamped to the valid range.
        """
        if not defects:
            return random.uniform(9.0, 10.0)

        total_severity = sum(d['severity_score'] for d in defects)
        avg_severity = total_severity / len(defects)

        condition = 10 - avg_severity
        condition += random.uniform(-0.5, 0.5)

        return max(0, min(10, condition))

    def _assign_condition_grade(self, condition_score):
        """Map a condition score to a letter grade (A >= 9, B >= 7, C >= 5, D >= 3, else F)."""
        if condition_score >= 9:
            return 'A'
        if condition_score >= 7:
            return 'B'
        if condition_score >= 5:
            return 'C'
        if condition_score >= 3:
            return 'D'
        return 'F'

    def _calculate_final_price(self, base_price, defects, condition_score):
        """Return the final resale price as an int, never below 500.

        Each defect multiplies the price by ``1 + price_impact`` (impacts are
        presumably negative fractions - confirm against the defect database).
        The condition score then scales the price between 70% and 100%, a
        +/-5% noise factor is applied, and the result is rounded to the
        nearest 100.
        """
        price = base_price

        for defect in defects:
            price *= (1 + defect['price_impact'])

        condition_factor = condition_score / 10
        price *= (0.7 + 0.3 * condition_factor)

        price *= random.uniform(0.95, 1.05)

        price = round(price / 100) * 100

        return max(500, int(price))

    def generate_dataset(self, n_samples=2000):
        """Generate ``n_samples`` synthetic rows and return them as a DataFrame.

        Prints progress every 500 samples and a short summary at the end.
        With ``n_samples=0`` an empty DataFrame is returned and the summary
        statistics are skipped.
        """
        devices = self._get_all_devices()
        data = []

        for i in range(n_samples):
            device = random.choice(devices)

            years_old = random.randint(1, 4)
            purchase_year = self.current_date.year - years_old

            age_months = self._calculate_age_months(
                device['release_year'],
                purchase_year
            )

            base_price = self._calculate_base_depreciation(
                device['original_price'],
                age_months
            )

            defects = self._select_defects()

            condition_score = self._calculate_condition_score(defects)
            condition_grade = self._assign_condition_grade(condition_score)

            final_price = self._calculate_final_price(
                base_price,
                defects,
                condition_score
            )

            # str() so that numeric ids/names from the JSON can be joined too.
            defect_ids = [str(d['id']) for d in defects]
            defect_names = [str(d['name']) for d in defects]
            categories = {d['category'] for d in defects}

            total_severity = sum(d['severity_score'] for d in defects)
            avg_severity = total_severity / len(defects) if defects else 0
            total_repair_cost = sum(d['repair_cost'] for d in defects)

            data.append({
                'device_id': f"DEV_{i:05d}",
                'brand': device['brand'],
                'model': device['model'],
                'original_price': device['original_price'],
                'release_year': device['release_year'],
                'purchase_year': purchase_year,
                'age_months': age_months,
                'num_defects': len(defects),
                'defect_ids': ','.join(defect_ids) if defects else 'NONE',
                'defect_names': ','.join(defect_names) if defects else 'NONE',
                'has_screen_damage': int('screen' in categories),
                'has_water_damage': int('water' in categories),
                'has_battery_issue': int('battery' in categories),
                'has_physical_damage': int('physical' in categories),
                'has_critical_defect': int(any(d['critical'] for d in defects)),
                'total_severity_score': total_severity,
                'avg_severity_score': round(avg_severity, 2),
                'total_repair_cost': total_repair_cost,
                'condition_score': round(condition_score, 2),
                'condition_grade': condition_grade,
                'base_price_after_depreciation': int(base_price),
                'resale_price': final_price
            })

            if (i + 1) % 500 == 0:
                print(f"  Generated {i + 1}/{n_samples} samples...")

        df = pd.DataFrame(data)

        print(f"\nDataset generated!")
        print(f"   Shape: {df.shape}")
        # An empty frame has no columns, so the statistics below would fail.
        if not df.empty:
            with_defects = (df['num_defects'] > 0).sum()
            print(f"   Price range: ₹{df['resale_price'].min():,} - ₹{df['resale_price'].max():,}")
            print(f"   Mean price: ₹{df['resale_price'].mean():,.0f}")
            print(f"   Devices with defects: {with_defects} ({with_defects/len(df)*100:.1f}%)")

        return df


if __name__ == "__main__":
    # Script entry point: build the generator from the two JSON inputs,
    # create 2000 synthetic rows, persist them as CSV and show a preview.
    generator = PricingDatasetGenerator(
        'data/device_catalog.json',
        'data/defect_database.json',
    )

    df = generator.generate_dataset(2000)

    df.to_csv('data/pricing_dataset.csv', index=False)
    print(f"\n Saved to data/pricing_dataset.csv")

    # Preview of the ten first rows, restricted to the most telling columns.
    preview = df.head(10)[[
        'brand',
        'model',
        'age_months',
        'num_defects',
        'condition_grade',
        'resale_price',
    ]]
    print("\nSample rows:")
    print(preview)


In [None]:
import sys, os, time
sys.path.append("./CLIP/clip")

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import clip
import torch
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from PIL import Image
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig

# --- LLaVA captioning model -------------------------------------------------
# Hugging Face checkpoint used by llava_caption() to describe the defect.
model_id = "llava-hf/llava-1.5-7b-hf"

# Load the LLaVA weights 4-bit quantized (NF4 with double quantization) and
# run the computation in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Processor that turns (image, prompt) pairs into model inputs and decodes
# generated token ids back into text.
processor = AutoProcessor.from_pretrained(model_id)

# Named `lmodel` so it does not clash with the CLIP `model` defined below.
# device_map="auto" leaves device placement to the loader.
lmodel = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)


# --- CLIP zero-shot classifier ----------------------------------------------
# Device for CLIP and its tensors: GPU when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# `model` encodes images/text, `preprocess` is the matching image transform.
model, preprocess = clip.load("ViT-B/32", device=device)
# Candidate labels for zero-shot classification; clip_predict() returns one of
# these strings verbatim, so they end up as `defect_type` values in the CSV.
# "normal device" is presumably the no-defect class - confirm with the data.
defect_classes = [
    "cracked screen",
    "screen scratch",
    "broken hinge",
    "keyboard malfunction",
    "physical dent",
    "water damage",
    "charging port issue",
    "normal device",
    "battery drain",
    "battery swelling",
    "back panel damaging",
    "overheating",
    "audio issues",
    "camera defect",
    "button damage",
    "display flickering"
]
# Tokenize the labels once; clip_predict() reuses these tokens for every image.
text_tokens = clip.tokenize(defect_classes).to(device)
def clip_predict(image_path):
    """Classify an image into one of ``defect_classes`` with zero-shot CLIP.

    Args:
        image_path: Path of the image file to classify.

    Returns:
        A ``(label, confidence)`` tuple: the best matching entry of
        ``defect_classes`` and its softmax probability.

    Raises:
        OSError: If the image cannot be opened or decoded.
    """
    # Open inside a context manager so the file handle is released right away;
    # convert() returns an independent, fully loaded image.
    with Image.open(image_path) as raw:
        image = preprocess(raw.convert("RGB")).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text_tokens)

        # L2-normalize so the dot product below is a cosine similarity.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Cosine similarities lie in [-1, 1]; without the x100 scaling used in
        # CLIP's zero-shot recipe the softmax is almost uniform and the
        # returned confidence carries no information. The argmax (and thus
        # the predicted label) is unaffected by the scaling.
        probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    idx = probs.argmax().item()
    return defect_classes[idx], probs[0][idx].item()
def llava_caption(image_path, defect_label):
    """Generate a short LLaVA description of the defect visible in an image.

    Args:
        image_path: Path of the image file to describe.
        defect_label: Defect type (e.g. from ``clip_predict``) that is
            inserted into the prompt.

    Returns:
        The generated caption. If the image cannot be opened, a generic
        sentence built from ``defect_label`` is returned instead.
    """
    try:
        # Context manager releases the file handle; convert() yields an
        # independent, fully loaded copy.
        with Image.open(image_path) as raw:
            image = raw.convert("RGB")
    except Exception:
        # Best-effort fallback for unreadable images. `except Exception`
        # rather than a bare `except` so Ctrl-C and SystemExit still propagate.
        return f"The device shows {defect_label} with no other visible defects."

    # NOTE(review): the LLaVA-1.5 model card describes a
    # "USER: <image>\n... ASSISTANT:" chat format; this prompt does not use
    # it. Left unchanged so captions stay comparable with existing data.
    prompt = f"""<image>
Describe this device's defected part of the image clearly for training a machine learning model.
The defect type is: {defect_label}.
Be concise and factual.
"""

    inputs = processor(images=image, text=prompt, return_tensors="pt").to(lmodel.device)

    with torch.no_grad():
        output = lmodel.generate(
            **inputs,
            max_new_tokens=80,
            do_sample=False
        )

    full_text = processor.decode(output[0], skip_special_tokens=True)
    # The decoded text echoes the prompt; keep only what follows its last line.
    caption = full_text.split("Be concise and factual.")[-1].strip()
    return caption



def build_dataset(image_dir, output_csv, device_type="phone"):
    """Label and caption every image in a folder and write the result as CSV.

    For each ``.jpg``/``.jpeg``/``.png`` file in ``image_dir`` the defect type
    is predicted with ``clip_predict`` and a caption is generated with
    ``llava_caption``.

    Args:
        image_dir: Directory containing the images (not searched recursively).
        output_csv: Path of the CSV file to write.
        device_type: Value stored in the ``device_type`` column of every row.
            Defaults to ``"phone"``, the value that used to be hard-coded.
    """
    columns = ["image_path", "device_type", "defect_type", "text_prompt"]
    dataset = []

    # sorted(): os.listdir() order is arbitrary; sorting makes runs repeatable.
    for img in tqdm(sorted(os.listdir(image_dir))):
        if not img.lower().endswith((".jpg", ".png", ".jpeg")):
            continue

        path = os.path.join(image_dir, img)

        try:
            # The confidence is not stored in the dataset.
            label, _ = clip_predict(path)
        except OSError as exc:
            # One corrupt file must not abort a long captioning run.
            print(f"Skipping unreadable image {path}: {exc}")
            continue

        caption = llava_caption(path, label)

        dataset.append({
            "image_path": path,
            "device_type": device_type,
            "defect_type": label,
            "text_prompt": caption
        })

    # Explicit columns keep the header even when no image was processed.
    pd.DataFrame(dataset, columns=columns).to_csv(output_csv, index=False)
    print(f"Saved to {output_csv}")

# Build the phone training CSV from the Kaggle input folder of phone images.
build_dataset("/kaggle/input/phone-s/phone_screen_defects", "training_data_phone2.csv")


In [None]:
import pandas as pd
import os

# Re-point every image path of the laptop training CSV at the local copy of
# the dataset folder, keeping only the original file name.
df = pd.read_csv(r"E:\fortransferee\mlproject7\aws_cust\data\laptop\training_data_laptop21.csv")

df["image_path"] = df["image_path"].apply(
    lambda x: os.path.join(
        # Raw string: "\L" is not a valid escape sequence. It only works by
        # accident today, triggers a warning, and is slated to become an error.
        r"data\Laptop-damage-detection-testing.v1i.coco",
        os.path.basename(x)
    )
)

df.to_csv("training_data_laptop213.csv", index=False)

print(df.head())


                                          image_path device_type  \
0  data\Laptop-damage-detection-testing.v1i.coco\...      laptop   
1  data\Laptop-damage-detection-testing.v1i.coco\...      laptop   
2  data\Laptop-damage-detection-testing.v1i.coco\...      laptop   
3  data\Laptop-damage-detection-testing.v1i.coco\...      laptop   
4  data\Laptop-damage-detection-testing.v1i.coco\...      laptop   

           defect_type                                        text_prompt  
0   display flickering  The image is blurry and has a flickering display.  
1  back panel damaging           The back panel of the laptop is damaged.  
2   display flickering  The image shows a display with a flickering sc...  
3  back panel damaging           The back panel of the device is damaged.  
4  back panel damaging           The back panel of the laptop is damaged.  


In [None]:
import torch
from transformers import CLIPModel, CLIPProcessor
from PIL import Image, ImageFile
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
import json

ImageFile.LOAD_TRUNCATED_IMAGES = True


class CLIPEvaluator:
    """Evaluate CLIP checkpoints on a defect dataset and compare two of them.

    The test data is a CSV with the columns ``image_path``, ``defect_type``
    and ``text_prompt``. Classification is zero-shot: every class is
    represented by the ``text_prompt`` of its first row in the test set, and
    an image is assigned to the class whose prompt it matches best.
    """

    def __init__(self):
        self.results = {}

    def load_model(self, model_path, model_name="Model"):
        """Load a CLIP model and its processor.

        Args:
            model_path: Hub id or local directory of the checkpoint.
            model_name: Human-readable name used in log messages.

        Returns:
            ``(model, processor)``, or ``(None, None)`` if loading failed.
        """
        print(f"\nLoading {model_name}...")
        print(f"   Path: {model_path}")

        try:
            model = CLIPModel.from_pretrained(model_path)
            processor = CLIPProcessor.from_pretrained(model_path)
            model.eval()
            print(f" {model_name} loaded successfully")
            return model, processor
        except Exception as e:
            # Deliberately broad: callers check for None instead of handling
            # the many error types a failed download/load can raise.
            print(f"Failed to load {model_name}: {e}")
            return None, None

    def prepare_test_data(self, csv_path, max_samples=None):
        """Load the test CSV and keep only rows whose image file exists.

        Args:
            csv_path: Path of the CSV file.
            max_samples: If truthy, randomly sample at most this many rows
                (fixed ``random_state=42``) before the existence check.

        Returns:
            A DataFrame with the surviving rows. The columns are preserved
            even when no image exists.
        """
        print(f"\nLoading test data from {csv_path}...")

        df = pd.read_csv(csv_path)

        if max_samples:
            df = df.sample(n=min(max_samples, len(df)), random_state=42)

        # A boolean mask keeps the column layout even if every row is dropped;
        # rebuilding the frame from an empty row list would lose the columns
        # and make the 'defect_type' lookup below fail.
        exists_mask = df['image_path'].map(os.path.exists).astype(bool)
        df = df.loc[exists_mask].copy()

        print(f"Loaded {len(df)} test samples")
        print(f"   Unique defects: {df['defect_type'].nunique()}")

        return df

    def predict_batch(self, model, processor, images, texts, device='cpu'):
        """Return image-to-text match probabilities.

        Args:
            model: CLIP model.
            processor: Matching CLIP processor.
            images: List of PIL images.
            texts: List of candidate texts.
            device: Device to run on.

        Returns:
            A numpy array of shape ``(len(images), len(texts))``; each row is
            a softmax over the texts for one image.
        """
        model = model.to(device)

        inputs = processor(
            text=texts,
            images=images,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=77
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image
            probs = logits_per_image.softmax(dim=1)

        return probs.cpu().numpy()

    def evaluate_model(
        self,
        model,
        processor,
        test_df,
        model_name="Model",
        batch_size=32,
        device='cpu'
    ):
        """Run zero-shot classification on ``test_df`` and compute metrics.

        Args:
            model: CLIP model to evaluate.
            processor: Matching CLIP processor.
            test_df: DataFrame with ``image_path``, ``defect_type`` and
                ``text_prompt`` columns.
            model_name: Name used in log output and stored in the result.
            batch_size: Number of images classified per forward pass.
            device: Device to run on.

        Returns:
            A dict with overall metrics (``accuracy``, weighted ``precision``,
            ``recall``, ``f1``, ``avg_confidence``), the raw ``predictions``,
            ``true_labels`` and ``confidences``, the ``confusion_matrix`` and
            ``per_class_metrics`` aligned with the sorted class list.

        Raises:
            ValueError: If not a single image could be loaded and evaluated.
        """
        print(f"\n Evaluating {model_name}...")

        unique_defects = sorted(test_df['defect_type'].unique())
        defect_to_idx = {defect: idx for idx, defect in enumerate(unique_defects)}
        label_ids = list(range(len(unique_defects)))

        print(f"   Defect classes: {len(unique_defects)}")

        # One prompt per class: the text of the first row of that class.
        # Computed once because it does not depend on the image.
        class_prompts = [
            str(test_df.loc[test_df['defect_type'] == defect, 'text_prompt'].iloc[0])
            for defect in unique_defects
        ]

        all_predictions = []
        all_true_labels = []
        all_confidences = []
        skipped = 0

        for start_idx in tqdm(range(0, len(test_df), batch_size), desc=f"Evaluating {model_name}"):
            batch_df = test_df.iloc[start_idx:start_idx + batch_size]

            images = []
            true_labels = []

            for _, row in batch_df.iterrows():
                try:
                    # convert() returns a loaded copy, so the file can be
                    # closed immediately.
                    with Image.open(row['image_path']) as raw:
                        img = raw.convert('RGB')
                except Exception:
                    # Best effort: unreadable images are skipped, but counted.
                    skipped += 1
                    continue
                images.append(img)
                true_labels.append(defect_to_idx[row['defect_type']])

            if not images:
                continue

            # One forward pass per batch; row i holds the class probabilities
            # of image i.
            probs = self.predict_batch(
                model,
                processor,
                images,
                class_prompts,
                device=device
            )

            all_predictions.extend(int(p) for p in probs.argmax(axis=1))
            all_true_labels.extend(true_labels)
            all_confidences.extend(float(c) for c in probs.max(axis=1))

        if skipped:
            print(f"   Skipped {skipped} unreadable image(s)")

        if not all_true_labels:
            raise ValueError(
                f"No test images could be evaluated for {model_name}; "
                "check the image paths in the test set."
            )

        accuracy = accuracy_score(all_true_labels, all_predictions)
        # labels=label_ids keeps every metric array and the confusion matrix
        # aligned with unique_defects, even if a class never shows up among
        # the evaluated labels or predictions.
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_true_labels,
            all_predictions,
            labels=label_ids,
            average='weighted',
            zero_division=0
        )

        per_class_precision, per_class_recall, per_class_f1, support = precision_recall_fscore_support(
            all_true_labels,
            all_predictions,
            labels=label_ids,
            average=None,
            zero_division=0
        )

        cm = confusion_matrix(all_true_labels, all_predictions, labels=label_ids)

        avg_confidence = np.mean(all_confidences)

        results = {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'avg_confidence': avg_confidence,
            'predictions': all_predictions,
            'true_labels': all_true_labels,
            'confidences': all_confidences,
            'confusion_matrix': cm,
            'per_class_metrics': {
                'defects': unique_defects,
                'precision': per_class_precision.tolist(),
                'recall': per_class_recall.tolist(),
                'f1': per_class_f1.tolist(),
                'support': support.tolist()
            }
        }

        print(f"\n{model_name} Results:")
        print(f"   Accuracy:       {accuracy:.2%}")
        print(f"   Precision:      {precision:.2%}")
        print(f"   Recall:         {recall:.2%}")
        print(f"   F1-Score:       {f1:.2%}")
        print(f"   Avg Confidence: {avg_confidence:.2%}")

        return results

    @staticmethod
    def _relative_improvement(baseline, candidate):
        """Return the change from ``baseline`` to ``candidate`` in percent.

        A zero baseline yields 0.0 when nothing changed and +/-infinity
        otherwise, instead of dividing by zero.
        """
        if baseline == 0:
            if candidate == 0:
                return 0.0
            return float('inf') if candidate > 0 else float('-inf')
        return ((candidate - baseline) / baseline) * 100

    @staticmethod
    def _json_summary(results):
        """Return the JSON-serializable part of an ``evaluate_model`` result."""
        bulky_keys = ('predictions', 'true_labels', 'confidences', 'confusion_matrix')
        return {
            k: float(v) if isinstance(v, (np.floating, float)) else v
            for k, v in results.items()
            if k not in bulky_keys
        }

    def compare_models(self, results_pretrained, results_finetuned, save_dir='evaluation'):
        """Compare two evaluation results, save a JSON report and plots.

        Args:
            results_pretrained: Result dict of the baseline model.
            results_finetuned: Result dict of the fine-tuned model.
            save_dir: Output directory (created if missing).

        Returns:
            Dict mapping each metric name to its relative improvement in
            percent (see ``_relative_improvement`` for the zero-baseline case).
        """
        os.makedirs(save_dir, exist_ok=True)

        metrics = ['accuracy', 'precision', 'recall', 'f1', 'avg_confidence']

        print(f"\n{'Metric':<20} {'Pre-trained':<15} {'Fine-tuned':<15} {'Improvement':<15}")
        print("-" * 70)

        improvements = {}
        for metric in metrics:
            pretrained_val = results_pretrained[metric]
            finetuned_val = results_finetuned[metric]
            improvement = self._relative_improvement(pretrained_val, finetuned_val)
            improvements[metric] = improvement

            print(f"{metric.replace('_', ' ').title():<20} "
                  f"{pretrained_val:>13.2%}  "
                  f"{finetuned_val:>13.2%}  "
                  f"{improvement:>+13.1f}%")

        comparison = {
            'pre_trained': self._json_summary(results_pretrained),
            'fine_tuned': self._json_summary(results_finetuned),
            # Infinity/NaN are not valid JSON, so store them as null.
            'improvements': {
                k: float(v) if np.isfinite(v) else None
                for k, v in improvements.items()
            }
        }

        with open(f'{save_dir}/comparison.json', 'w') as f:
            json.dump(comparison, f, indent=2)

        print(f"\n Comparison saved to {save_dir}/comparison.json")

        self.plot_comparison(results_pretrained, results_finetuned, save_dir)

        return improvements

    def plot_comparison(self, results_pretrained, results_finetuned, save_dir):
        """Write three comparison plots (PNG, 300 dpi) into ``save_dir``.

        * ``metrics_comparison.png`` - grouped bars for the four main metrics
        * ``confusion_matrices.png`` - both confusion matrices side by side
        * ``confidence_distribution.png`` - overlaid confidence histograms
        """
        # --- 1. Grouped bar chart of the headline metrics -------------------
        fig, ax = plt.subplots(figsize=(12, 6))

        metrics = ['accuracy', 'precision', 'recall', 'f1']
        pretrained_vals = [results_pretrained[m] for m in metrics]
        finetuned_vals = [results_finetuned[m] for m in metrics]

        x = np.arange(len(metrics))
        width = 0.35

        bars1 = ax.bar(x - width/2, pretrained_vals, width, label='Pre-trained', color='#3498db')
        bars2 = ax.bar(x + width/2, finetuned_vals, width, label='Fine-tuned', color='#2ecc71')

        ax.set_ylabel('Score', fontsize=12)
        ax.set_title('Pre-trained vs Fine-tuned CLIP Performance', fontsize=14, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels([m.replace('_', ' ').title() for m in metrics])
        ax.legend()
        ax.grid(axis='y', alpha=0.3)
        ax.set_ylim([0, 1])

        # Print the value on top of every bar.
        for bars in [bars1, bars2]:
            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height,
                       f'{height:.1%}',
                       ha='center', va='bottom', fontsize=9)

        plt.tight_layout()
        plt.savefig(f'{save_dir}/metrics_comparison.png', dpi=300, bbox_inches='tight')
        print(f" Saved metrics comparison to {save_dir}/metrics_comparison.png")
        plt.close()

        # --- 2. Confusion matrices side by side -----------------------------
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        cm1 = results_pretrained['confusion_matrix']
        sns.heatmap(cm1, annot=True, fmt='d', cmap='Blues', ax=ax1, cbar=False)
        ax1.set_title('Pre-trained CLIP\nConfusion Matrix', fontsize=12, fontweight='bold')
        ax1.set_ylabel('True Label')
        ax1.set_xlabel('Predicted Label')

        cm2 = results_finetuned['confusion_matrix']
        sns.heatmap(cm2, annot=True, fmt='d', cmap='Greens', ax=ax2, cbar=False)
        ax2.set_title('Fine-tuned CLIP\nConfusion Matrix', fontsize=12, fontweight='bold')
        ax2.set_ylabel('True Label')
        ax2.set_xlabel('Predicted Label')

        plt.tight_layout()
        plt.savefig(f'{save_dir}/confusion_matrices.png', dpi=300, bbox_inches='tight')
        print(f"Saved confusion matrices to {save_dir}/confusion_matrices.png")
        plt.close()

        # --- 3. Overlaid confidence histograms ------------------------------
        fig, ax = plt.subplots(figsize=(10, 6))

        ax.hist(results_pretrained['confidences'], bins=30, alpha=0.5,
               label='Pre-trained', color='#3498db', edgecolor='black')
        ax.hist(results_finetuned['confidences'], bins=30, alpha=0.5,
               label='Fine-tuned', color='#2ecc71', edgecolor='black')

        ax.set_xlabel('Confidence Score', fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.set_title('Prediction Confidence Distribution', fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(f'{save_dir}/confidence_distribution.png', dpi=300, bbox_inches='tight')
        print(f" Saved confidence distribution to {save_dir}/confidence_distribution.png")
        plt.close()


def main():
    """Evaluate the pre-trained and the fine-tuned CLIP model and compare them.

    Loads both checkpoints, evaluates each on the same validation split and
    writes the comparison report and plots to ``evaluation/``. Returns early
    with a message if either model cannot be loaded.
    """

    print("CLIP MODEL EVALUATION: PRE-TRAINED VS FINE-TUNED")


    CONFIG = {
        'pretrained_model': 'openai/clip-vit-base-patch32',
        'finetuned_model': 'models/finetuned_clip/best_model',
        'test_csv_phone': 'data/splits/val.csv',
        # NOTE(review): not read anywhere below and identical to the phone CSV.
        'test_csv_laptop': 'data/splits/val.csv',
        'batch_size': 16,
        'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        'max_test_samples': 500
    }

    print(f"\nConfiguration:")
    for key, value in CONFIG.items():
        print(f"   {key}: {value}")

    evaluator = CLIPEvaluator()

    pretrained_model, pretrained_processor = evaluator.load_model(
        CONFIG['pretrained_model'],
        "Pre-trained CLIP"
    )

    # load_model() returns (None, None) on failure; without this check the
    # run would only crash later, inside evaluate_model().
    if pretrained_model is None:
        print("\nPre-trained model could not be loaded!")
        print(f"   Check network access or the model id: {CONFIG['pretrained_model']}")
        return

    finetuned_model, finetuned_processor = evaluator.load_model(
        CONFIG['finetuned_model'],
        "Fine-tuned CLIP"
    )

    if finetuned_model is None:
        print("\nFine-tuned model not found!")
        print("   Please train the model first using: python src/finetune_clip.py")
        return

    test_df = evaluator.prepare_test_data(
        CONFIG['test_csv_phone'],
        max_samples=CONFIG['max_test_samples']
    )

    results_pretrained = evaluator.evaluate_model(
        pretrained_model,
        pretrained_processor,
        test_df,
        model_name="Pre-trained CLIP",
        batch_size=CONFIG['batch_size'],
        device=CONFIG['device']
    )

    results_finetuned = evaluator.evaluate_model(
        finetuned_model,
        finetuned_processor,
        test_df,
        model_name="Fine-tuned CLIP",
        batch_size=CONFIG['batch_size'],
        device=CONFIG['device']
    )

    improvements = evaluator.compare_models(
        results_pretrained,
        results_finetuned,
        save_dir='evaluation'
    )

    print("EVALUATION COMPLETE")
    print(f"\n Results saved to: evaluation/")
    print(f"   • comparison.json")
    print(f"   • metrics_comparison.png")
    print(f"   • confusion_matrices.png")
    print(f"   • confidence_distribution.png")

    print(f" KEY IMPROVEMENT:")
    print(f"   Accuracy: {results_pretrained['accuracy']:.2%} → {results_finetuned['accuracy']:.2%} "
          f"({improvements['accuracy']:+.1f}%)")


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd

# Quick look at the class balance of the phone CSV: row count, value counts
# of the defect and device columns, and the number of distinct defect types.
# NOTE(review): the file name says "training data" while the report calls it
# the validation set - confirm which split this is.
val_df = pd.read_csv(r'E:\fortransferee\mlproject7\aws_cust\data\phone\phone_training_data2134.csv')

print("Validation Set Analysis:")
print(f"Total samples: {len(val_df)}")

# Same report for both categorical columns: a heading, then the counts.
for heading, column in (
    ("\nDefect type distribution:", 'defect_type'),
    ("\nDevice type distribution:", 'device_type'),
):
    print(heading)
    print(val_df[column].value_counts())

n_unique_defects = val_df['defect_type'].nunique()
print(f"\nUnique defects: {n_unique_defects}")


Validation Set Analysis:
Total samples: 209

Defect type distribution:
defect_type
cracked screen         176
back panel damaging     18
battery swelling         9
charging port issue      2
normal device            2
camera defect            1
broken hinge             1
Name: count, dtype: int64

Device type distribution:
device_type
phone    209
Name: count, dtype: int64

Unique defects: 7
