In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
!pip install -U pyarrow

Requirement already up-to-date: pyarrow in /usr/local/lib/python3.6/dist-packages (1.0.1)


In [3]:
from pathlib import Path

# Detect whether the notebook runs inside Google Colab: the import only
# succeeds there.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

if not IN_COLAB:
    # Outside Colab the auxiliary data is looked up in the working directory.
    AUX_DATA_ROOT = Path(".")
else:
    # Inside Colab the data lives on the mounted Google Drive.
    google.colab.drive.mount("/content/drive")

    AUX_DATA_ROOT = Path("/content/drive/My Drive/NEW_HACK_DATA")

    assert AUX_DATA_ROOT.is_dir(), "Have you forgot to 'Add a shortcut to Drive'?"

    # Put the data folder first on the module search path.
    import sys
    sys.path.insert(0, str(AUX_DATA_ROOT))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from IPython.display import display, HTML

def print_df(df):
    """Render ``df`` in the notebook output as an HTML table.

    The frame is converted with ``df.to_html()`` and shown through
    IPython's ``display``; nothing is returned.
    """
    table_markup = df.to_html()
    display(HTML(table_markup))

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [5]:
!pip install dask[dataframe]

import dask.dataframe as dd



In [6]:
# Load the transactions table from the auxiliary data folder.
trans = pd.read_parquet(
    AUX_DATA_ROOT / 't.parquet',
    engine='pyarrow',
    use_threads=True,
)

In [7]:
# Load the three CSV reference tables that sit next to the transactions file.
clients = pd.read_csv(AUX_DATA_ROOT / 'c.csv')
materials = pd.read_csv(AUX_DATA_ROOT / 'm.csv')
plants = pd.read_csv(AUX_DATA_ROOT / 'p.csv')

In [8]:
def age_group_func(x):
    """Map a numeric age to its age-bucket label.

    Buckets are checked in ascending order and each upper bound is
    inclusive: ``x <= 20`` -> '0-20', ``x <= 45`` -> '20-45',
    ``x <= 60`` -> '45-60'. Anything that matches none of the bounds
    (including NaN, which compares False everywhere) yields '60+'.
    """
    for upper_bound, label in ((20, '0-20'), (45, '20-45'), (60, '45-60')):
        if x <= upper_bound:
            return label
    return '60+'

# Reference year used to turn a birth year into an age.
cur_year = 2017

# Fill the 'N' placeholder with the median of the known birth years.
# (Despite its name, `most_common` holds the median, not the mode.)
most_common = (
    clients.loc[clients.birthyear != 'N', 'birthyear']
    .astype(float)
    .astype(int)
    .median()
)
clients['birthyear'] = clients['birthyear'].apply(
    lambda year: most_common if year == 'N' else year
)

# Derive the age bucket through a temporary 'Age' column, then remove it.
clients['Age'] = cur_year - clients['birthyear'].astype(float).astype(int)
clients['Age_group'] = clients['Age'].map(age_group_func)
del clients['Age']

# Keep only the transaction columns used below and attach the client attributes.
trans = trans[['chq_id', 'client_id', 'material', 'sales_count', 'sales_sum', 'is_promo']].merge(
    clients[['client_id', 'gender', 'city', 'Age_group']],
    on='client_id',
    how='left',
)


In [12]:
# Draw a reproducible sample of row *positions* (without replacement).
# A dedicated, seeded RandomState keeps the sample identical across
# "Restart & Run All" without touching NumPy's global random state.
RANDOM_SEED = 42
SAMPLE_FRACTION = 0.05  # share of the transaction rows to keep
random_indices = np.random.RandomState(RANDOM_SEED).choice(
    len(trans), int(SAMPLE_FRACTION * len(trans)), replace=False
)

In [None]:
def _popular_sales_by_segment(frame, segment_cols, item_col, q,
                              count_col='sales_count', sum_col='sales_sum'):
    """Sum ``sum_col`` for the items that are popular within their segment.

    For every combination of ``segment_cols`` the per-item totals of
    ``count_col`` are computed, and the ``q``-quantile of those totals is
    taken as the segment's threshold. An item is kept when its total is
    strictly greater than that threshold.

    Returns a DataFrame indexed by ``segment_cols + [item_col]`` with the
    single column ``sum_col`` holding the summed value of each kept item.

    The totals are aggregated once and the threshold is broadcast per
    segment, instead of re-filtering and re-grouping the whole frame for
    every (segment, item) group.
    """
    segment_cols = list(segment_cols)
    totals = frame.groupby(segment_cols + [item_col])[[count_col, sum_col]].sum()
    if totals.empty:
        # Nothing to rank; return an empty frame of the expected shape.
        return totals[[sum_col]]
    thresholds = totals.groupby(level=segment_cols)[count_col].transform(
        lambda counts: counts.quantile(q)
    )
    return totals.loc[totals[count_col] > thresholds, [sum_col]]


# `random_indices` holds row positions, so select positionally with .iloc
# (label-based .loc only works while the index happens to be 0..n-1).
trans_chunk = trans.iloc[random_indices, :]
popular_df = _popular_sales_by_segment(
    trans_chunk, ['Age_group', 'gender'], 'material', q=0.95
)

# NOTE(review): hard-coded Colab Drive location; the folder must already exist.
path = Path("/content/drive/My Drive/Lenta_hack")
popular_df.to_csv(os.path.join(path, 'popular_df.csv'))

In [None]:
# Multipliers applied to the total sales of the popular materials.
# NOTE(review): the meaning of p_opp, p_acc and the factor 20 is not stated
# in the notebook (presumably opportunity/acceptance rates and a scale-up
# from the 5% sample) - confirm and document.
p_opp = 0.02
p_acc = 0.1

popular_sales_total = popular_df['sales_sum'].sum()
add_income = popular_sales_total * p_opp * p_acc * 20

print("Lenta's estimated additional income:", add_income)

In [None]:
# NOTE(review): prints the literal 5; looks like a leftover scratch cell - consider deleting.
print(5)

## Example of how the above code works

In [None]:
# Toy frame for the walkthrough: one (age, gender) segment, three materials.
df = pd.DataFrame({
    'age': [1] * 9,
    'gender': ['foo'] * 9,
    'mat': [1] * 5 + [2] * 2 + [3] * 2,
    'sales_count': [1, 20, 10, 1, 1, 1, 1, 20, 10],
    'sales_sum': [2.0, 5.0, 6.0, 8.0, 1.0, 2.0, 9.0, 100.0, 100.0],
})
df

In [None]:
# Keep the rows of every material whose total sales_count is above the
# 0.3-quantile of per-material totals inside its (age, gender) segment.
df.groupby(['age', 'gender', 'mat']).filter(
    lambda grp: grp['sales_count'].sum()
    > df[df.age.isin(grp.age) & df.gender.isin(grp.gender)]
    .groupby('mat')[['sales_count']]
    .sum()
    .quantile(0.3)
)

In [None]:
def _above_segment_quantile(grp):
    """True when this material's total sales_count exceeds the 0.3-quantile
    of per-material totals within the group's (age, gender) segment."""
    segment = df[df.age.isin(grp.age) & df.gender.isin(grp.gender)]
    threshold = segment.groupby('mat')[['sales_count']].sum().quantile(0.3)
    return grp['sales_count'].sum() > threshold


# Same filter as before, followed by the per-material sum of sales_sum.
(
    df.groupby(['age', 'gender', 'mat'])
    .filter(_above_segment_quantile)
    .groupby(['age', 'gender', 'mat'])[['sales_sum']]
    .sum()
)