In [2]:
# model_preparation.py
import pandas as pd
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os

def _split_notes(raw_notes):
    """Split a comma-separated note string into clean, lowercase note names.

    Blank fragments (e.g. from a trailing or doubled comma) are dropped so
    they do not turn into an empty-string "note" feature. Non-string input,
    or a string with no usable fragment, yields ``['unknown']``.
    """
    if not isinstance(raw_notes, str):
        return ['unknown']
    notes = [part.strip().lower() for part in raw_notes.split(',') if part.strip()]
    return notes or ['unknown']


def prepare_models(data_path='../Data/cierra_updated.csv', models_dir='models'):
    """Build the perfume feature matrix and similarity matrix and pickle them.

    Reads the CSV at ``data_path``, cleans the ``price``, ``top``, ``middle``,
    ``base``, ``description``, ``category`` and ``title`` columns, builds a
    feature matrix (multi-hot notes + one-hot category + min-max scaled
    price), computes the pairwise cosine similarity between rows, and writes
    five pickle files into ``models_dir``:
    ``preprocessed_data.pkl``, ``similarity_matrix.pkl``, ``mlb.pkl``,
    ``label_encoder.pkl`` and ``scaler.pkl``.

    Args:
        data_path: Path of the CSV file to load. Defaults to the location the
            original script used.
        models_dir: Directory the pickle files are written to; created if it
            does not exist.

    Returns:
        None. Progress and summary statistics are printed to stdout.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(models_dir, exist_ok=True)

    print("Loading and preprocessing data...")

    df = pd.read_csv(data_path)

    # Strip the currency marker, thousands separators and whitespace, then
    # coerce: anything that still is not a number becomes NaN and is imputed
    # with the median below instead of aborting the whole run.
    price_text = df['price'].astype(str).str.replace(r'KSh|,|\s', '', regex=True)
    df['price'] = pd.to_numeric(price_text, errors='coerce')
    df['price'] = df['price'].fillna(df['price'].median())

    # 'category' is filled as well: LabelEncoder cannot sort a column that
    # mixes strings with NaN.
    for column in ('top', 'middle', 'base', 'description', 'category'):
        df[column] = df[column].fillna('Unknown')

    # Convert text to lowercase
    df['title'] = df['title'].str.lower()
    df['description'] = df['description'].str.lower()

    print("Processing fragrance notes...")

    df['top_notes'] = df['top'].apply(_split_notes)
    df['middle_notes'] = df['middle'].apply(_split_notes)
    df['base_notes'] = df['base'].apply(_split_notes)

    # Combine all notes (per-row list concatenation)
    df['all_notes'] = df['top_notes'] + df['middle_notes'] + df['base_notes']

    print("Creating feature matrices...")

    # 1. Multi-hot encode notes
    mlb = MultiLabelBinarizer()
    note_features = mlb.fit_transform(df['all_notes'])

    # 2. Encode categories.
    # The label encoder is still fitted and saved, but its integer codes are
    # expanded to one-hot columns before entering the feature matrix. A single
    # ordinal column (0, 1, 2, ...) distorts cosine similarity: class 0 adds
    # nothing to the vector, higher codes add more weight, and two *different*
    # non-zero classes still contribute a positive dot product.
    label_encoder = LabelEncoder()
    category_codes = label_encoder.fit_transform(df['category'])
    n_rows = len(df)
    category_matrix = csr_matrix(
        (np.ones(n_rows), (np.arange(n_rows), category_codes)),
        shape=(n_rows, len(label_encoder.classes_)),
    )

    # 3. Scale prices to [0, 1]
    scaler = MinMaxScaler()
    normalized_prices = scaler.fit_transform(df[['price']])
    price_matrix = csr_matrix(normalized_prices)

    print("Combining features and calculating similarity matrix...")
    final_features = hstack([
        note_features,
        category_matrix,
        price_matrix
    ])

    similarity_matrix = cosine_similarity(final_features)

    print("Saving model files...")

    files_to_save = {
        'preprocessed_data.pkl': df,
        'similarity_matrix.pkl': similarity_matrix,
        'mlb.pkl': mlb,
        'label_encoder.pkl': label_encoder,
        'scaler.pkl': scaler
    }

    for filename, data in files_to_save.items():
        with open(os.path.join(models_dir, filename), 'wb') as f:
            pickle.dump(data, f)
            print(f"Saved {filename}")

    # Sanity check: make sure one of the files just written can be read back.
    print("\nTesting model loading...")
    try:
        with open(os.path.join(models_dir, 'preprocessed_data.pkl'), 'rb') as f:
            pickle.load(f)
    except (OSError, pickle.PickleError, EOFError) as e:
        print(f"Error testing model files: {str(e)}")
        return
    print("Model files successfully saved and loadable!")

    # Print some basic statistics
    print("\nDataset Statistics:")
    print(f"Total number of perfumes: {len(df)}")
    print(f"Number of unique notes: {note_features.shape[1]}")
    print(f"Price range: KSh{df['price'].min():.2f} - KSh{df['price'].max():.2f}")
    print(f"Categories: {', '.join(label_encoder.classes_)}")

if __name__ == "__main__":
    # Script entry point: run the full preparation pipeline.
    try:
        prepare_models()
    except Exception as e:
        print(f"Error during model preparation: {str(e)}")
        # Re-raise so the traceback is shown and the process exits non-zero;
        # printing alone made a failed run look like a successful one.
        raise

Loading and preprocessing data...
Processing fragrance notes...
Creating feature matrices...
Combining features and calculating similarity matrix...
Saving model files...
Saved preprocessed_data.pkl
Saved similarity_matrix.pkl
Saved mlb.pkl
Saved label_encoder.pkl
Saved scaler.pkl

Testing model loading...
Model files successfully saved and loadable!

Dataset Statistics:
Total number of perfumes: 2479
Number of unique notes: 9391
Price range: KSh3500.00 - KSh97500.00
Categories: Men, Unisex, Women
