In [None]:
# ---
# Title: Fashion Account Segmentation
# Author: Jinji Shen
# Date: 2025-02-16
# Description:
#   This notebook implements a weighted fashion segmentation model using KMeans clustering.
#   It creates segments based on fashion categories, engagement metrics, and follower counts.
# ---

# ### 1. Imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import logging
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import seaborn as sns

# Set global styles for plots
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)

# ### 2. Define FashionSegmenter Class

class FashionSegmenter:
    def __init__(self, posts_df, labels_df, segmentations_df):
        """
        Initialize the FashionSegmenter with data and configuration.
        
        Args:
            posts_df (pd.DataFrame): Posts data with engagement metrics.
            labels_df (pd.DataFrame): Image labels data.
            segmentations_df (pd.DataFrame): Author segmentation data.
        """
        self.posts_df = posts_df
        self.labels_df = labels_df
        self.segmentations_df = segmentations_df
        self.logger = self._setup_logging()
        
        # Define weighted fashion categories
        self.fashion_categories = {
            'luxury': {
                'weight': 2.2,  # Higher weight for luxury brands
                'labels': ['chanel', 'louisvuitton', 'gucci', 'prada', 'dior', 'hermes', 
                           'fendi', 'bottegaveneta', 'saintlaurent', 'burberry', 'celine']
            },
            'sportswear': {
                'weight': 1.8,  # Moderate weight for sportswear
                'labels': ['nike', 'adidas', 'new_balance', 'sportbasketballsneakers',
                           'sneakersrunning', 'trainerrunninsneakers']
            },
            'footwear': {
                'weight': 1.6,  # Moderate weight for footwear
                'labels': ['sneakers', 'boots', 'pumps', 'heels', 'sandals',
                           'sneakerlowtop', 'sneakerhightop', 'bootshiking']
            },
            'accessories': {
                'weight': 1.4,  # Lower weight for accessories
                'labels': ['bag', 'belt', 'eyewear', 'neckwear', 'wristlet', 'handbag']
            },
            'clothing': {
                'weight': 1.2,  # Lower weight for clothing
                'labels': ['dress', 'coat', 'pants', 'top', 'skirt', 'shorts']
            }
        }
        
        # Define minimum segment size
        self.MIN_SEGMENT_SIZE = 200
        
        # Preprocess data
        self._preprocess_data()

    def _setup_logging(self):
        """Set up logging for the class."""
        logger = logging.getLogger('FashionSegmentation')
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        return logger

    def _preprocess_data(self):
        """Preprocess data with improved handling of edge cases."""
        # Convert and clean numeric columns in posts_df
        for col in ['NB_LIKES', 'COMMENT_COUNT']:
            self.posts_df[col] = pd.to_numeric(self.posts_df[col], errors='coerce')
            self.posts_df[col] = self.posts_df[col].clip(lower=0)  # No negative values
        
        # Clean follower counts
        self.segmentations_df['NB_FOLLOWERS'] = pd.to_numeric(
            self.segmentations_df['NB_FOLLOWERS'], errors='coerce'
        )
        # Replace 0 or null followers with median of bottom 10%
        bottom_10_median = self.segmentations_df['NB_FOLLOWERS'].quantile(0.1)
        self.segmentations_df['NB_FOLLOWERS'] = self.segmentations_df['NB_FOLLOWERS'].replace(
            [0, np.nan], bottom_10_median
        )

    def create_feature_matrix(self) -> pd.DataFrame:
        """Create improved feature matrix with balanced category representation."""
        self.logger.info("Creating feature matrix...")
        
        # Merge posts with labels
        author_labels = pd.merge(
            self.posts_df[['AUTHORID', 'IMAGE_ID', 'NB_LIKES', 'COMMENT_COUNT']],
            self.labels_df[['IMAGE_ID', 'LABEL_NAME', 'TYPE']],
            on='IMAGE_ID',
            how='left'
        )
        
        # Calculate weighted category scores
        features = pd.DataFrame()
        
        # Calculate post counts and basic metrics per author
        author_metrics = author_labels.groupby('AUTHORID').agg({
            'IMAGE_ID': 'nunique',
            'NB_LIKES': lambda x: np.log1p(x.mean()),
            'COMMENT_COUNT': lambda x: np.log1p(x.mean())
        }).rename(columns={'IMAGE_ID': 'post_count'})
        
        features = author_metrics
        
        # Calculate category scores
        for category, info in self.fashion_categories.items():
            # Calculate weighted score based on label presence
            category_mask = author_labels['LABEL_NAME'].isin(info['labels'])
            category_posts = author_labels[category_mask].groupby('AUTHORID')['IMAGE_ID'].nunique()
            
            # Calculate category ratio and apply weight
            features[f'{category}_ratio'] = (
                category_posts / features['post_count'].clip(lower=1)
            ) * info['weight']
        
        # Add follower data
        features = features.join(
            self.segmentations_df.set_index('AUTHORID')['NB_FOLLOWERS']
        )
        
        # Calculate engagement rate using log transformation
        features['engagement_rate'] = (
            features['NB_LIKES'] / 
            np.log1p(features['NB_FOLLOWERS'])
        ).clip(0, 10)  # Cap extreme values
        
        # Fill NaN values
        features = features.fillna(0)
        
        return features

    def segment_accounts(self, features: pd.DataFrame, n_clusters: int = 10) -> pd.DataFrame:
        """Create balanced segments with minimum size enforcement."""
        self.logger.info("Creating segments...")
        
        # Select features for clustering
        clustering_features = [
            'engagement_rate'
        ] + [f'{category}_ratio' for category in self.fashion_categories.keys()]
        
        X = features[clustering_features].copy()
        
        # Normalize features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Try different numbers of clusters to find optimal segmentation
        best_clusters = None
        best_score = -np.inf
        
        for k in range(8, 13):  # Try 8-12 clusters
            kmeans = KMeans(n_clusters=k, random_state=42)
            labels = kmeans.fit_predict(X_scaled)
            
            # Calculate minimum segment size
            min_size = pd.Series(labels).value_counts().min()
            
            if min_size >= self.MIN_SEGMENT_SIZE:
                score = kmeans.score(X_scaled)
                if score > best_score:
                    best_score = score
                    best_clusters = labels
        
        if best_clusters is None:
            # If no solution meets minimum size, use original n_clusters
            kmeans = KMeans(n_clusters=n_clusters, random_state=42)
            best_clusters = kmeans.fit_predict(X_scaled)
        
        features['cluster'] = best_clusters
        
        # Create segment labels
        features = self._create_segment_labels(features)
        
        return features

    def _create_segment_labels(self, features: pd.DataFrame) -> pd.DataFrame:
        """Create meaningful and balanced segment labels."""
        # Calculate engagement percentiles
        engagement_percentiles = features['engagement_rate'].quantile([0.33, 0.66])
        
        def get_engagement_level(rate):
            if rate > engagement_percentiles[0.66]:
                return "High"
            elif rate > engagement_percentiles[0.33]:
                return "Medium"
            return "Standard"
        
        def get_influence_tier(followers):
            """More balanced influence tier calculation."""
            if followers > 50000:  # Increased threshold
                return "High-Influence"
            elif followers > 15000:  # Adjusted mid threshold
                return "Mid-Influence"
            return "Micro-Influence"
                
        segment_labels = {}
        min_segment_size = 25  # Minimum segment size
        
        for cluster in features['cluster'].unique():
            cluster_data = features[features['cluster'] == cluster]
            
            # Skip very small clusters
            if len(cluster_data) < min_segment_size:
                continue
                
            # Get influence level
            avg_followers = cluster_data['NB_FOLLOWERS'].mean()
            influence_tier = get_influence_tier(avg_followers)
            
            # Get weighted category ratios
            category_ratios = {
                cat: cluster_data[f'{cat}_ratio'].mean() * info['weight']
                for cat, info in self.fashion_categories.items()
            }
            sorted_cats = sorted(category_ratios.items(), key=lambda x: x[1], reverse=True)
            
            # Create more balanced category combinations
            primary_cat = sorted_cats[0][0].title()
            if len(sorted_cats) > 1 and sorted_cats[1][1] > sorted_cats[0][1] * 0.5:
                secondary_cat = sorted_cats[1][0].title()
                label = f"{primary_cat} & {secondary_cat}"
            else:
                label = f"{primary_cat}"
            
            # Add engagement level
            avg_engagement = cluster_data['engagement_rate'].mean()
            engagement_level = get_engagement_level(avg_engagement)
            engagement_suffix = f" ({engagement_level} Engagement)" if engagement_level == "High" else ""
            
            segment_labels[cluster] = f"{influence_tier} {label}{engagement_suffix}"
        
        features['segment'] = features['cluster'].map(segment_labels)

        # Handle any unlabeled segments
        features = features.assign(segment=features['segment'].fillna(f"{influence_tier} Fashion General"))
       
        return features

# ### 3. Main Function

def main():
    """Main function to run the segmentation pipeline."""
    # Load data
    DATA_DIR = "/Users/jinjishen/Desktop/my_new_project/heuritech-technical-test/data/raw"
    posts_df = pd.read_csv(f"{DATA_DIR}/MART_IMAGES_OF_POSTS.csv", low_memory=False)
    labels_df = pd.read_csv(f"{DATA_DIR}/MART_IMAGES_LABELS.csv", low_memory=False)
    segmentations_df = pd.read_csv(f"{DATA_DIR}/MART_AUTHORS_SEGMENTATIONS.csv", low_memory=False)
    
    # Initialize segmenter
    segmenter = FashionSegmenter(posts_df, labels_df, segmentations_df)
    
    try:
        # Create feature matrix
        features = segmenter.create_feature_matrix()
        
        # Create segments
        segmented_df = segmenter.segment_accounts(features, n_clusters=10)  
        
        # Save results
        segmented_df.to_csv('/Users/jinjishen/Desktop/my_new_project/heuritech-technical-test/data/processed/fashion_segments_final.csv', index=True)
        
        # Print summary
        print("\nSegmentation Summary:")
        print(segmented_df['segment'].value_counts())
        
        # Print detailed analysis
        print("\nDetailed Segment Analysis:")
        for segment in segmented_df['segment'].unique():
            segment_data = segmented_df[segmented_df['segment'] == segment]
            print(f"\n{segment}:")
            print(f"Count: {len(segment_data)}")
            print(f"Avg Followers: {segment_data['NB_FOLLOWERS'].mean():,.0f}")
            print(f"Avg Engagement: {segment_data['engagement_rate'].mean():.4f}")
            
            # Show top categories with weights applied
            weighted_ratios = {
                cat: segment_data[f'{cat}_ratio'].mean()
                for cat in segmenter.fashion_categories.keys()
            }
            top_cats = sorted(weighted_ratios.items(), key=lambda x: x[1], reverse=True)[:3]
            print("Top Categories:", ", ".join(f"{cat.title()}: {val:.3f}" 
                  for cat, val in top_cats))
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

# ### 4. Run the Pipeline
if __name__ == "__main__":
    main()

2025-02-16 21:31:25,644 - FashionSegmentation - INFO - Creating feature matrix...
2025-02-16 21:31:37,395 - FashionSegmentation - INFO - Creating segments...



Segmentation Summary:
segment
High-Influence Accessories & Clothing                      7724
Micro-Influence Clothing & Accessories                     5193
High-Influence Footwear & Clothing (High Engagement)       4143
High-Influence Accessories & Clothing (High Engagement)    4085
Mid-Influence Clothing & Accessories                       3090
High-Influence Clothing (High Engagement)                  2973
High-Influence Footwear                                    1341
Mid-Influence Sportswear & Footwear (High Engagement)       458
High-Influence Luxury & Footwear                             74
Name: count, dtype: int64

Detailed Segment Analysis:

Micro-Influence Clothing & Accessories:
Count: 5193
Avg Followers: 13,238
Avg Engagement: 0.3074
Top Categories: Clothing: 0.194, Accessories: 0.144, Footwear: 0.063

Mid-Influence Clothing & Accessories:
Count: 3090
Avg Followers: 19,229
Avg Engagement: 0.1960
Top Categories: Clothing: 0.785, Accessories: 0.510, Footwear: 0.326

High-I