In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training data (parquet) and the sample submission (CSV) into the
# notebook namespace.
# NOTE(review): the cells visible below re-read both files through
# ConfigurableRecommender.load_data, so these two globals look unused there —
# confirm before removing.
train, sample = pd.read_parquet("train_data.pq"), pd.read_csv("sample_submission.csv")

In [2]:
import pandas as pd
from tqdm import tqdm
import numpy as np

class ConfigurableRecommender:
    """Config-driven, rule-based top-N recommender built on a click log.

    Candidate items are pre-computed per "source" by :meth:`prepare_data` and
    merged per user, without duplicates, in the order listed in
    ``config['priority_order']``.

    Config keys read by this class:
        ``time_window.k``
            Number of days before the last day scanned by ``previous_days``.
        ``sources.<name>.enabled`` / ``sources.<name>.max_candidates``
            Switch and per-user cap for every source.
        ``sources.multiple_clicks_last_day.min_clicks``
            A (user, item) pair qualifies when its last-day click count is
            STRICTLY greater than this value (``min_clicks=1`` means >= 2).
        ``sources.previous_days.thresholds``
            Minimum (inclusive) per-day click count, indexed by day offset;
            the last entry is reused when the list is shorter than ``k``.
        ``sources.global_popular.use_recent_days`` / ``recent_days``
            Popularity window: rows with ``date >= last_day - recent_days``
            when enabled, otherwise the last day only.
        ``priority_order``
            Source names, highest priority first.
        ``top_n`` (optional)
            Recommendations per user; defaults to 20.

    Data expectations visible in the code: the train frame has ``date``,
    ``user_id`` and ``item_id`` columns and ``date`` supports ``date - int``;
    the sample frame has a ``user_id`` column.
    """

    DEFAULT_TOP_N = 20       # list length used when the config has no 'top_n' key
    POPULAR_POOL_SIZE = 100  # number of most frequent items kept in popularity pools

    def __init__(self, config):
        """Store the configuration; data is attached later by :meth:`load_data`."""
        self.config = config
        self.train = None
        self.sample = None
        self.prepared_data = {}
        # (train frame, item list) pair backing _all_time_popular().
        self._all_popular_cache = None

    @property
    def top_n(self):
        """Number of recommendations produced per user (``config['top_n']``, default 20)."""
        return self.config.get('top_n', self.DEFAULT_TOP_N)

    def load_data(self, train_path, sample_path):
        """Read the train parquet file and the sample-submission CSV.

        Returns:
            self, to allow call chaining.
        """
        self.train = pd.read_parquet(train_path)
        self.sample = pd.read_csv(sample_path)
        return self

    def prepare_data(self):
        """Pre-compute candidates for every enabled source named in the config.

        Stores the maximum ``date`` under ``prepared_data['last_day']`` and one
        entry per enabled, known source. Unknown source names are ignored.

        Returns:
            self, to allow call chaining.
        """
        print("–ü–æ–¥–≥–æ—Ç–∞–≤–ª–∏–≤–∞–µ–º –¥–∞–Ω–Ω—ã–µ...")

        last_day = self.train["date"].max()
        self.prepared_data['last_day'] = last_day
        k = self.config['time_window']['k']

        builders = {
            'multiple_clicks_last_day': lambda: self._prepare_multiple_clicks_last_day(last_day),
            'last_day_clicks': lambda: self._prepare_last_day_clicks(last_day),
            'previous_days': lambda: self._prepare_previous_days(last_day, k),
            'global_popular': lambda: self._prepare_global_popular(last_day),
        }

        # Build sources in config order so the progress printout keeps its order.
        for source_name, source_config in self.config['sources'].items():
            if not source_config['enabled']:
                continue
            build = builders.get(source_name)
            if build is not None:
                build()

        return self

    def _rows_for_date(self, date):
        """Return the train rows whose ``date`` equals ``date``."""
        return self.train[self.train["date"] == date]

    def _prepare_multiple_clicks_last_day(self, last_day):
        """Collect, per user, items clicked more than ``min_clicks`` times on the last day.

        The comparison is strict (``count > min_clicks``). Result is stored as
        ``{user_id: [item_id, ...]}`` under ``'multiple_clicks_last_day'``.
        """
        min_clicks = self.config['sources']['multiple_clicks_last_day']['min_clicks']
        pair_counts = self._rows_for_date(last_day).groupby(["user_id", "item_id"]).size()

        user_multiple_clicks = {}
        # Filter in pandas first, then walk only the qualifying (user, item) pairs.
        for user_id, item_id in pair_counts[pair_counts > min_clicks].index:
            user_multiple_clicks.setdefault(user_id, []).append(item_id)

        self.prepared_data['multiple_clicks_last_day'] = user_multiple_clicks
        print(f"  multiple_clicks_last_day: {len(user_multiple_clicks)} –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π")

    def _prepare_last_day_clicks(self, last_day):
        """Collect, per user, every item clicked on the last day (row order, duplicates kept)."""
        last_day_data = self._rows_for_date(last_day)
        user_last_clicks = last_day_data.groupby("user_id")["item_id"].apply(list).to_dict()
        self.prepared_data['last_day_clicks'] = user_last_clicks
        print(f"  last_day_clicks: {len(user_last_clicks)} –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π")

    def _prepare_previous_days(self, last_day, k):
        """Collect per-user candidates from the ``k`` days preceding ``last_day``.

        For day offset ``d`` (1 = the day before ``last_day``) a (user, item)
        pair qualifies when its click count on that day is at least
        ``thresholds[d - 1]``; when the list is shorter than ``d`` its last
        entry is reused. Items are appended nearest day first, so the same item
        may appear more than once in a user's list.

        Raises:
            ValueError: if a non-empty day needs a threshold but the configured
                ``thresholds`` list is empty.
        """
        thresholds = self.config['sources']['previous_days']['thresholds']
        previous_days_candidates = {}

        for day_offset in range(1, k + 1):
            day_data = self._rows_for_date(last_day - day_offset)
            if day_data.empty:
                continue

            if day_offset <= len(thresholds):
                threshold = thresholds[day_offset - 1]
            elif thresholds:
                threshold = thresholds[-1]
            else:
                raise ValueError(
                    "config['sources']['previous_days']['thresholds'] must not be empty"
                )

            pair_counts = day_data.groupby(['user_id', 'item_id']).size()
            # Iterating the filtered index avoids building one Series per row (iterrows).
            for user_id, item_id in pair_counts[pair_counts >= threshold].index:
                previous_days_candidates.setdefault(user_id, []).append(item_id)

        self.prepared_data['previous_days'] = previous_days_candidates
        print(f"  previous_days: {len(previous_days_candidates)} –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π")

    def _prepare_global_popular(self, last_day):
        """Store the most frequently clicked items of the configured recent window.

        With ``use_recent_days`` the window is ``date >= last_day - recent_days``;
        otherwise only rows of ``last_day`` are counted. At most
        ``POPULAR_POOL_SIZE`` items are kept, most frequent first.
        """
        popular_config = self.config['sources']['global_popular']
        if popular_config['use_recent_days']:
            window = self.train[self.train['date'] >= (last_day - popular_config['recent_days'])]
        else:
            window = self._rows_for_date(last_day)

        global_popular = (
            window['item_id'].value_counts().head(self.POPULAR_POOL_SIZE).index.tolist()
        )

        self.prepared_data['global_popular'] = global_popular
        print(f"  global_popular: {len(global_popular)} —Ç–æ–≤–∞—Ä–æ–≤")

    def recommend_for_user(self, uid):
        """Build the recommendation list for one user.

        Walks the sources in priority order, taking at most ``max_candidates``
        new (not yet recommended) items from each, then pads with popular items.

        Returns:
            A list of at most ``top_n`` distinct item ids; shorter only when the
            popularity fallbacks cannot supply enough unseen items.
        """
        top_n = self.top_n
        recommendations = []
        used_items = set()

        for source_name in self.config['priority_order']:
            source_config = self.config['sources'][source_name]
            if not source_config['enabled'] or len(recommendations) >= top_n:
                continue

            # Slots this source may fill: bounded by its own cap and by what is left.
            budget = min(top_n - len(recommendations), source_config['max_candidates'])
            for item in self._get_candidates_from_source(uid, source_name):
                if budget <= 0:
                    break
                if item in used_items:
                    continue
                recommendations.append(item)
                used_items.add(item)
                budget -= 1

        if len(recommendations) < top_n:
            self._fill_remaining_slots(recommendations, used_items)

        return recommendations[:top_n]

    def _get_candidates_from_source(self, uid, source_name):
        """Return the prepared candidate list of ``source_name`` for ``uid``.

        Returns an empty list for unknown sources, sources that were not
        prepared, or users without candidates.
        """
        if source_name == 'global_popular':
            # The popularity pool is shared by all users.
            return self.prepared_data.get('global_popular', [])

        if source_name in ('multiple_clicks_last_day', 'last_day_clicks', 'previous_days'):
            return self.prepared_data.get(source_name, {}).get(uid, [])

        return []

    @staticmethod
    def _append_unseen(recommendations, used_items, pool, limit):
        """Append unseen items from ``pool`` in place until ``limit`` items are held.

        Returns:
            True when ``recommendations`` holds at least ``limit`` items afterwards.
        """
        for item in pool:
            if len(recommendations) >= limit:
                break
            if item not in used_items:
                recommendations.append(item)
                used_items.add(item)
        return len(recommendations) >= limit

    def _all_time_popular(self):
        """Return the most frequent items of the whole train frame (cached).

        The list is recomputed only when ``self.train`` is rebound to another
        object; in-place edits of the frame do not invalidate the cache.
        """
        cached = getattr(self, '_all_popular_cache', None)
        if cached is None or cached[0] is not self.train:
            top_items = (
                self.train['item_id'].value_counts().head(self.POPULAR_POOL_SIZE).index.tolist()
            )
            cached = (self.train, top_items)
            self._all_popular_cache = cached
        return cached[1]

    def _fill_remaining_slots(self, recommendations, used_items):
        """Pad ``recommendations`` in place up to ``top_n`` with popular items.

        Uses the prepared ``global_popular`` pool first and falls back to the
        most frequent items of the whole train frame only if still short.
        ``used_items`` is updated alongside.
        """
        top_n = self.top_n
        global_popular = self.prepared_data.get('global_popular', [])
        if self._append_unseen(recommendations, used_items, global_popular, top_n):
            return
        self._append_unseen(recommendations, used_items, self._all_time_popular(), top_n)

    def generate_submission(self, output_path):
        """Recommend for every user of the sample file and write the result as CSV.

        Returns:
            DataFrame with columns ``user_id`` and ``item_id``, one row per
            recommendation, users in order of first appearance in the sample.
        """
        print("–ì–µ–Ω–µ—Ä–∞—Ü–∏—è —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π...")

        top_n = self.top_n
        test_users = self.sample["user_id"].unique()
        recs = [
            (uid, item)
            for uid in tqdm(test_users)
            for item in self.recommend_for_user(uid)
        ]

        submission = pd.DataFrame(recs, columns=["user_id", "item_id"])

        # Each user contributes at most top_n rows, so the total matches only when
        # every user has exactly top_n; unlike value_counts() this also catches
        # users that ended up with no rows at all.
        if len(submission) != top_n * len(test_users):
            print(f"–ò—Å–ø—Ä–∞–≤–ª—è–µ–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π –±–µ–∑ {top_n} —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π...")
            submission = self._fix_submission(submission)

        submission.to_csv(output_path, index=False)
        print(f"‚úÖ –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω: {output_path}")
        print(f"   –í—Å–µ–≥–æ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π: {len(submission)}")
        print(f"   –£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π: {submission['user_id'].nunique()}")
        print(f"   –£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Ç–æ–≤–∞—Ä–æ–≤: {submission['item_id'].nunique()}")

        return submission

    def _fix_submission(self, submission):
        """Rebuild ``submission`` so each sample user has up to ``top_n`` rows.

        Existing items keep their order; short lists are padded via
        :meth:`_fill_remaining_slots` and long ones are truncated.

        Returns:
            A new DataFrame with columns ``user_id`` and ``item_id``.
        """
        top_n = self.top_n

        # One pass over the rows instead of one boolean-mask scan per user.
        items_by_user = {}
        for uid, item in zip(submission['user_id'].tolist(), submission['item_id'].tolist()):
            items_by_user.setdefault(uid, []).append(item)

        fixed_recs = []
        for uid in self.sample["user_id"].unique():
            user_items = items_by_user.get(uid, [])
            if len(user_items) < top_n:
                self._fill_remaining_slots(user_items, set(user_items))
            fixed_recs.extend((uid, item) for item in user_items[:top_n])

        return pd.DataFrame(fixed_recs, columns=["user_id", "item_id"])

In [15]:
# Config 3: balanced (recommended)
CONFIG_BALANCED = {
    'time_window': {
        # Number of days before the last day scanned by the `previous_days` source.
        'k': 12
    },
    'sources': {
        'multiple_clicks_last_day': {
            'enabled': True,
            # ConfigurableRecommender compares with a strict `>`, so 1 means
            # "clicked at least twice on the last day".
            'min_clicks': 1,
            'max_candidates': 20
        },
        'last_day_clicks': {
            'enabled': True,
            'max_candidates': 20
        },
        'previous_days': {
            'enabled': True,
            # One inclusive threshold per day of the window (length == time_window.k).
            # The list used to hold 11 entries and relied on the recommender reusing
            # the last one for day 12; the resulting thresholds are the same.
            'thresholds': [1] * 12,
            'max_candidates': 20
        },
        'global_popular': {
            'enabled': True,
            'use_recent_days': True,
            # Popularity is counted over rows with date >= last_day - recent_days.
            'recent_days': 1,
            'max_candidates': 20
        }
    },
    # Sources are consulted in this order; earlier entries win the available slots.
    'priority_order': [
        'multiple_clicks_last_day',
        'last_day_clicks',
        'previous_days',
        'global_popular'
    ]
}

In [16]:
# ==================== USAGE ====================

if __name__ == "__main__":
    # Pick the configuration for this experiment.
    CURRENT_CONFIG = CONFIG_BALANCED

    # Build the recommender, read the inputs and pre-compute every enabled source
    # (load_data and prepare_data both return the recommender, so they chain).
    recommender = ConfigurableRecommender(CURRENT_CONFIG)
    recommender.load_data("train_data.pq", "sample_submission.csv").prepare_data()

    # Produce the recommendations and write them to disk.
    submission = recommender.generate_submission("bilibert.csv")

    # Echo the enabled sources, in priority order, with their per-source caps.
    print("\nüéØ –ö–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—è –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∞:")
    for source in CURRENT_CONFIG['priority_order']:
        if not CURRENT_CONFIG['sources'][source]['enabled']:
            continue
        config = CURRENT_CONFIG['sources'][source]
        print(f"   - {source}: max_{config['max_candidates']}")

–ü–æ–¥–≥–æ—Ç–∞–≤–ª–∏–≤–∞–µ–º –¥–∞–Ω–Ω—ã–µ...
  multiple_clicks_last_day: 0 –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π
  last_day_clicks: 98917 –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π
  previous_days: 962600 –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π
  global_popular: 100 —Ç–æ–≤–∞—Ä–æ–≤
–ì–µ–Ω–µ—Ä–∞—Ü–∏—è —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 293230/293230 [00:03<00:00, 73509.32it/s]


‚úÖ –°–∞–±–º–∏—Ç —Å–æ—Ö—Ä–∞–Ω–µ–Ω: bilibert.csv
   –í—Å–µ–≥–æ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π: 5864600
   –£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π: 293230
   –£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Ç–æ–≤–∞—Ä–æ–≤: 135243

üéØ –ö–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—è –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∞:
   - multiple_clicks_last_day: max_20
   - last_day_clicks: max_20
   - previous_days: max_20
   - global_popular: max_20


In [23]:
# Load a previously saved submission and pull the item_id columns of both
# submissions into plain lists (f = freshly generated, s = saved file).
check = pd.read_csv('da_bilo_x2.csv')
f, s = (list(frame['item_id']) for frame in (submission, check))

In [24]:
# Count the positions at which both item_id lists hold the same value.
# Indexing follows `f`, so `s` must be at least as long as `f`.
eq = sum(1 for i in range(len(f)) if f[i] == s[i])
print(eq)

5407810


In [22]:
# Length of `f`, for comparison with the match count printed by the cell above.
# NOTE(review): this cell's execution count (22) is lower than that of the cell
# defining `f` (23), so the displayed value may stem from an earlier run —
# re-run the notebook top to bottom to confirm.
len(f)

5864600