# DATA PREPROCESSING PART

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!unzip -P Skoltech drive/My\ Drive/hack_data.zip 

Archive:  drive/My Drive/hack_data.zip
  inflating: clients.csv             
  inflating: materials.csv           
  inflating: plants.csv              
  inflating: transactions.parquet    


In [1]:
import pandas as pd
!pip install -U pyarrow > /dev/null

# Part 1. Looking at structure date of each file

## clients.csv

In [66]:
df_clients = pd.read_csv('clients.csv')
df_clients.head()

Unnamed: 0,client_id,gender,city,birthyear
0,1a47d62dddacc03fe90c15652f7ae1a4,M,Other,1990.0
1,9cb909f701d25d548e953bff81192b56,F,Other,1969.0
2,d5da0f5b841b4f83383202807995027a,M,Other,1976.0
3,13ed7f16810b17b8cee6de834ac79a48,F,Moscow,1966.0
4,215fe3ea7d5bf0415e5504e2a7f33551,F,Other,1988.0


In [4]:
df_clients.gender.unique()

array(['M', 'F', nan], dtype=object)

In [5]:
df_clients = df_clients.fillna('N')

In [7]:
df_clients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99995 entries, 0 to 99994
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   client_id  99995 non-null  object
 1   gender     99995 non-null  object
 2   city       99995 non-null  object
 3   birthyear  99995 non-null  object
dtypes: object(4)
memory usage: 3.1+ MB


## materials.csv

In [8]:
df_materials = pd.read_csv('materials.csv')
df_materials.head()

Unnamed: 0,material,hier_level_1,hier_level_2,hier_level_3,hier_level_4,vendor,is_private_label,is_alco
0,35cbdf61de9e19c8b417327aaef14c88,NONFOOD,ea5d2f1c4608232e07d3aa3d998e5135,99cad265a1768cc2dd013f0e740300ae,9eed45f71360b4b1e2590637467220e5,212a38db0ddcd009f1e164cc8483485c,0,0
1,c0b0bf24d4ec71da3d304f761ec555d8,NONFOOD,ea5d2f1c4608232e07d3aa3d998e5135,99cad265a1768cc2dd013f0e740300ae,de2eb747e0896c050905a6b635ab800a,9b0b6c7d55413ad3b67761b7b125b534,0,0
2,f0fc5e654a81a7c4b8ba8d7c26546e14,NONFOOD,ea5d2f1c4608232e07d3aa3d998e5135,99cad265a1768cc2dd013f0e740300ae,8a969031832c535daf96e0c2aed8e814,9b0b6c7d55413ad3b67761b7b125b534,0,0
3,353693e64fb5f9e2d29746d7fe6edf1e,NONFOOD,ea5d2f1c4608232e07d3aa3d998e5135,b58f7d184743106a8a66028b7a28937c,4c0dc012ebb679a18b244c53c6f59b5a,a3c8be149d718771e892619bd310b961,0,0
4,5d9e0b4302ce95448cae72165ff4cf5b,NONFOOD,ea5d2f1c4608232e07d3aa3d998e5135,b58f7d184743106a8a66028b7a28937c,4c0dc012ebb679a18b244c53c6f59b5a,a3c8be149d718771e892619bd310b961,0,0


In [9]:
df_materials.isna().sum()

material            0
hier_level_1        0
hier_level_2        0
hier_level_3        0
hier_level_4        0
vendor              0
is_private_label    0
is_alco             0
dtype: int64

In [10]:
len(df_materials['hier_level_4'].unique())

2006

In [11]:
df_materials.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105609 entries, 0 to 105608
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   material          105609 non-null  object
 1   hier_level_1      105609 non-null  object
 2   hier_level_2      105609 non-null  object
 3   hier_level_3      105609 non-null  object
 4   hier_level_4      105609 non-null  object
 5   vendor            105609 non-null  object
 6   is_private_label  105609 non-null  int64 
 7   is_alco           105609 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 6.4+ MB


## transactions.parquet

In [12]:
df_transactions = pd.read_parquet('transactions.parquet', engine='pyarrow', use_threads=True)

In [13]:
df_transactions.shape, df_transactions.isna().sum()

((32109414, 9), chq_id          0
 plant           0
 chq_date        0
 chq_position    0
 client_id       0
 material        0
 sales_count     0
 sales_sum       0
 is_promo        0
 dtype: int64)

In [14]:
df_transactions.head()

Unnamed: 0,chq_id,plant,chq_date,chq_position,client_id,material,sales_count,sales_sum,is_promo
0,ce5b3d1c531348c5d0d8e859b6d0cf91,7cd86ecb09aa48c6e620b340f6a74592,2016-11-01,4,90fca68123e692a005a4edeadd94b2f2,232cfe8b28970434f685b737737b26d4,2.0,146.98,0
1,ce5b3d1c531348c5d0d8e859b6d0cf91,7cd86ecb09aa48c6e620b340f6a74592,2016-11-01,3,90fca68123e692a005a4edeadd94b2f2,12ec64451b8e426db16f3f05ea770901,1.0,249.99,1
2,ce5b3d1c531348c5d0d8e859b6d0cf91,7cd86ecb09aa48c6e620b340f6a74592,2016-11-01,12,90fca68123e692a005a4edeadd94b2f2,1212ff8ecf53aebef093ca8ae216db68,1.0,47.99,1
3,ce5b3d1c531348c5d0d8e859b6d0cf91,7cd86ecb09aa48c6e620b340f6a74592,2016-11-01,13,90fca68123e692a005a4edeadd94b2f2,1212ff8ecf53aebef093ca8ae216db68,1.0,47.99,1
4,ce5b3d1c531348c5d0d8e859b6d0cf91,7cd86ecb09aa48c6e620b340f6a74592,2016-11-01,11,90fca68123e692a005a4edeadd94b2f2,571c6d3b559db8445e05265649eb30ab,2.0,53.98,1


In [15]:
df_transactions.chq_date.min(), df_transactions.chq_date.max()

(Timestamp('2016-10-04 00:00:00'), Timestamp('2017-10-04 00:00:00'))

In [16]:
(df_transactions.client_id.unique()).shape

(100000,)

In [17]:
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32109414 entries, 0 to 32109413
Data columns (total 9 columns):
 #   Column        Dtype         
---  ------        -----         
 0   chq_id        object        
 1   plant         object        
 2   chq_date      datetime64[ns]
 3   chq_position  object        
 4   client_id     object        
 5   material      object        
 6   sales_count   float64       
 7   sales_sum     float64       
 8   is_promo      int64         
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 2.2+ GB


## plants.csv

In [18]:
df_plants = pd.read_csv('plants.csv')
df_plants.head()

Unnamed: 0,plant,plant_type,city
0,95b09698fda1f64af16708ffb859eab9,HM,St. Petersburg
1,926abae84a4bd33c834bc6b981b8cf30,HM,St. Petersburg
2,ae2bac2e4b4da805d01b2952d7e35ba4,HM,St. Petersburg
3,0e7e3cf0ded4d9db8b376b317c007f99,HM,St. Petersburg
4,540bd55a2cf295b8ea9cd78650e89d03,HM,St. Petersburg


In [19]:
df_plants.isna().sum()

plant         0
plant_type    0
city          0
dtype: int64

# Part 2. Encoding data

In [None]:
!pip install dask[dataframe] > /dev/null
import dask.dataframe as dd

from sklearn.preprocessing import LabelEncoder

In [22]:
enc_client = LabelEncoder()
df_transactions['client_id'] = enc_client.fit_transform(df_transactions['client_id'])
df_clients['client_id'] = enc_client.transform(df_clients['client_id'])

In [23]:
enc_store = LabelEncoder()
df_plants['plant'] = enc_store.fit_transform(df_plants['plant'])
df_transactions['plant'] = enc_store.transform(df_transactions['plant'])

In [24]:
enc_material = LabelEncoder()
df_transactions['material'] = enc_material.fit_transform(df_transactions['material'])
df_materials['material'] = enc_material.transform(df_materials['material'])

In [25]:
enc_vendor = LabelEncoder()
df_materials['vendor'] = enc_vendor.fit_transform(df_materials['vendor'])
enc_vendor = LabelEncoder()
df_materials['hier_level_2'] = enc_vendor.fit_transform(df_materials['hier_level_2'])
enc_vendor = LabelEncoder()
df_materials['hier_level_3'] = enc_vendor.fit_transform(df_materials['hier_level_3'])
enc_vendor = LabelEncoder()
df_materials['hier_level_4'] = enc_vendor.fit_transform(df_materials['hier_level_4'])

In [29]:
df_transactions.head()

Unnamed: 0,chq_id,plant,chq_date,chq_position,client_id,material,sales_count,sales_sum,is_promo
0,ce5b3d1c531348c5d0d8e859b6d0cf91,179,2016-11-01,4,56437,14213,2.0,146.98,0
1,ce5b3d1c531348c5d0d8e859b6d0cf91,179,2016-11-01,3,56437,7629,1.0,249.99,1
2,ce5b3d1c531348c5d0d8e859b6d0cf91,179,2016-11-01,12,56437,7264,1.0,47.99,1
3,ce5b3d1c531348c5d0d8e859b6d0cf91,179,2016-11-01,13,56437,7264,1.0,47.99,1
4,ce5b3d1c531348c5d0d8e859b6d0cf91,179,2016-11-01,11,56437,35784,2.0,53.98,1


In [32]:
enc_city = LabelEncoder()
df_clients['city'] = enc_city.fit_transform(df_clients['city'])
df_plants['city'] = enc_city.transform(df_plants['city'])

In [None]:
test = LabelEncoder()
df_plants['plant_type'] = enc_gender1.fit_transform(df_plants['plant_type'])

In [None]:
new_trans = df_transactions.merge(frequency_df, on='Customer ID')

In [35]:
enc_vendor = LabelEncoder()
df_clients['gender'] = enc_vendor.fit_transform(df_clients['gender'])

Unnamed: 0,client_id,gender,city,birthyear
0,10144,1,1,1990
1,61042,0,1,1969
2,83456,1,1,1976
3,7725,0,0,1966
4,12977,0,1,1988


In [37]:
enc_chq = LabelEncoder()
df_transactions['chq_id'] = enc_chq.fit_transform(df_transactions['chq_id'])

In [None]:
enc_chq1 = LabelEncoder()
df_materials['hier_level_1'] = enc_chq1.fit_transform(df_materials['hier_level_1'])

In [59]:
cp new_hack_data.zip drive/My\ Drive/

In [58]:
!zip -r new_hack_data.zip c.csv p.csv m.csv t.parquet

  adding: c.csv (deflated 67%)
  adding: p.csv (deflated 64%)
  adding: m.csv (deflated 72%)
  adding: t.parquet (deflated 16%)


# Encoded and compressed data you can find here: 
### https://drive.google.com/file/d/1041MtZcK-DuiXZXYsx_s0cx8TE__I7Y3/view?usp=sharing