In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


# Users for whom predictions are produced at the end of the notebook.
test_users = pd.read_csv('test_users.csv')

# Review log and the two tables it is joined with right after loading.
# low_memory=False parses reviews.csv in one pass instead of in chunks, which
# avoids mixed-dtype inference on this large file.
reviews = pd.read_csv('reviews.csv', low_memory=False)
users = pd.read_csv('users.csv')
orgs = pd.read_csv('organisations.csv')

# Reference tables; loaded here but not used by the cells shown below.
rubrics = pd.read_csv('rubrics.csv')
features = pd.read_csv('features.csv')
aspects = pd.read_csv('aspects.csv')


# Enrich every review with the reviewer's city and the organisation's city and
# overall rating, then keep only positive reviews of well-rated organisations.
#
# Both joins are inner joins (the pandas default), so reviews whose user or
# organisation is missing from the lookup table are dropped. `reviews` and
# `orgs` each have a `rating` column, so the second join suffixes them:
# rating_x is the rating given in the review, rating_y is the organisation's.
reviews = (
    reviews
    .merge(
        users[['user_id', 'city']].rename(columns={'city': 'user_city'}),
        on='user_id',
    )
    .merge(
        orgs[['org_id', 'city', 'rating']].rename(columns={'city': 'org_city'}),
        on='org_id',
    )
    .loc[lambda df: (df['rating_x'] >= 4.0) & (df['rating_y'] >= 4.0)]
    # The aspects column is not needed for anything that follows.
    .drop(columns='aspects')
)

In [2]:
# Preview the first rows of the test-user table.
test_users.head()

Unnamed: 0,user_id
0,3545210947248911048
1,15271987121288045390
2,15016858616184265932
3,12457244142928722989
4,13339684649926251468


In [3]:
# Show five randomly drawn rows of the joined and filtered reviews. No
# random_state is passed, so a re-run displays different rows.
reviews.sample(5)

Unnamed: 0,user_id,org_id,rating_x,ts,user_city,org_city,rating_y
623270,915817146845801972,1135274990644701922,5.0,461,msk,msk,4.420231
2141217,16626175088275833957,6425522008634357838,5.0,1108,msk,msk,4.656613
449020,1894891031476493552,3783475571739324296,5.0,1187,spb,spb,4.091766
3045877,540932745278269946,12685264777658042487,5.0,1021,spb,spb,4.344828
1337368,3566495472361475380,9575991199022647715,4.0,212,msk,msk,4.36443


In [4]:
# Count missing values per column of the joined reviews frame.
reviews.isna().sum()

user_id      0
org_id       0
rating_x     0
ts           0
user_city    0
org_city     0
rating_y     0
dtype: int64

In [5]:
# Sort the ts column to see its range (0 to 1216 in the recorded output).
# NOTE(review): ts looks like an integer time index such as a day number — confirm.
reviews.ts.sort_values()

1215352       0
2032903       0
386646        0
2402566       0
2681506       0
           ... 
979310     1216
243938     1216
2521420    1216
1601868    1216
353143     1216
Name: ts, Length: 2591266, dtype: int64

In [6]:
# Keep only reviews that satisfy all three conditions:
#   * ts is above 1116,
#   * the organisation's rating (rating_y) is strictly above 4,
#   * the reviewer's city differs from the organisation's city.
# NOTE(review): inside DataFrame.query, `&` is expected to bind like `and`, so
# the unparenthesised comparisons combine as listed — confirm against the
# pandas query/eval documentation.
# NOTE(review): 1116 is a magic cutoff; with a maximum ts of 1216 in the
# recorded output it looks like "the most recent 100 time units" — confirm.
reviews = reviews.query('ts > 1116 & rating_y > 4 & user_city != org_city')

In [7]:
# rcount = number of rows each organisation has in the current filtered frame,
# repeated on every row of that organisation.
# value_counts() is computed once and looked up with map(); the previous
# per-row lambda recomputed the full count table for every single row.
# assign() returns a new frame, so no column is written into the frame
# produced by the preceding filter (avoids SettingWithCopyWarning).
reviews = reviews.assign(
    rcount=reviews['org_id'].map(reviews['org_id'].value_counts())
)

In [8]:
# Drop reviews of organisations that have 10 or fewer rows in the filtered
# frame (rcount is the per-organisation row count computed in the cell above).
reviews = reviews.query('rcount > 10')

In [9]:
# Order the remaining reviews from the most-reviewed organisation downwards.
topr = reviews.sort_values('rcount', ascending=False)

In [10]:
# Split the ranked reviews by the organisation's city code: rows with
# org_city == 'msk' go to topr_m, rows with org_city == 'spb' go to topr_s.
topr_m, topr_s = (
    topr.loc[topr['org_city'].eq(city_code)] for city_code in ('msk', 'spb')
)

In [11]:
# Up to 20 'msk' organisations with the highest mean rcount, best first.
msk_orgs = (
    topr_m
    .groupby('org_id')['rcount']
    .mean()
    .sort_values(ascending=False)
    .head(20)
    .index
    .to_list()
)
print(len(msk_orgs))
# Collapse the ids into one space-separated string, the form stored in the
# `target` column of the predictions.
msk_orgs = ' '.join(str(org_id) for org_id in msk_orgs)


20


In [12]:
# Up to 20 'spb' organisations with the highest mean rcount, best first.
spb_orgs = (
    topr_s
    .groupby('org_id')['rcount']
    .mean()
    .sort_values(ascending=False)
    .head(20)
    .index
    .to_list()
)
print(len(spb_orgs))
# Collapse the ids into one space-separated string, the form stored in the
# `target` column of the predictions.
spb_orgs = ' '.join(str(org_id) for org_id in spb_orgs)


20


In [13]:
# Attach each test user's city.
# A left join on a de-duplicated lookup keeps exactly one output row per
# test-user row, in the original order, so the result can be assigned by
# position. The earlier inner join + index-aligned assignment shifted cities
# onto the wrong rows whenever a test user was missing from (or duplicated in)
# `users`. Joining only the user_id column also makes the cell safe to re-run:
# a `city` column already present on test_users no longer collides.
user_cities = users[['user_id', 'city']].drop_duplicates('user_id')
test_users['city'] = (
    test_users[['user_id']]
    .merge(user_cities, on='user_id', how='left')['city']
    .to_numpy()
)


def choose(x):
    """Pick the recommendation string for one test-user row.

    Users whose city is 'msk' receive the 'spb' organisation list; every other
    user, including one whose city is missing, receives the 'msk' list.
    """
    return spb_orgs if x['city'] == 'msk' else msk_orgs


target = test_users.apply(choose, axis=1)

# assign() builds a fresh frame instead of writing a column into a slice of
# test_users (which triggers SettingWithCopyWarning).
predictions = test_users[['user_id']].assign(target=target)

predictions.head()

Unnamed: 0,user_id,target
0,3545210947248911048,12046097390037935713 14814427257061788801 2070...
1,15271987121288045390,12046097390037935713 14814427257061788801 2070...
2,15016858616184265932,12046097390037935713 14814427257061788801 2070...
3,12457244142928722989,12046097390037935713 14814427257061788801 2070...
4,13339684649926251468,15250345250621165867 15684663803879321952 9104...


In [14]:
# Write the predictions without the row index. `index` is documented as a
# bool, so pass False explicitly rather than relying on None being falsy.
predictions.to_csv('answers.csv', index=False)