# Example usage

To use `cat2cat` in a project:

### Load data

In [16]:
# cat2cat datasets
from cat2cat.datasets import load_trans, load_occup
# `trans` feeds the mapping helpers below; `occup` is the survey frame that is
# filtered by its `year` and `code` columns in the following cells.
trans, occup = load_trans(), load_occup()

### Low-level functions

In [17]:

# Low-level functions
from cat2cat.mappings import get_mappings, get_freqs, cat_apply_freq


# Mapping dictionaries derived from the transition table; only the "to_new"
# direction is used in this cell.
mappings = get_mappings(trans)

# Category codes observed in the 2010 period, converted to strings.
codes_new = occup.loc[occup["year"] == 2010, "code"].map(str).values

# Frequencies of the 2010 codes, then applied to the "to_new" mappings.
freqs = get_freqs(codes_new)
mapp_new_p = cat_apply_freq(mappings["to_new"], freqs)

# For one example code: its candidate codes under "to_new" and the
# frequency-based probabilities attached to them.
example_code = "3481"
mappings["to_new"][example_code], mapp_new_p[example_code]

(['441401', '441402', '441403', '441490'], [0.0, 0.6, 0.0, 0.4])

### cat2cat procedure

In [18]:
from cat2cat import cat2cat
from cat2cat.dataclass import cat2cat_data, cat2cat_mappings, cat2cat_ml

from pandas import DataFrame, concat

# One independent frame per survey period; `.copy()` detaches each frame from
# `occup` so later processing cannot touch the original data.
o_2006, o_2008, o_2010, o_2012 = (
    occup.loc[occup.year == year, :].copy() for year in (2006, 2008, 2010, 2012)
)
# 2008 and 2010 are the two periods passed to the procedure as old/new.
o_old, o_new = o_2008, o_2010

# Both periods keep their categories in "code"; "year" identifies the period.
data = cat2cat_data(o_old, o_new, "code", "code", "year")
mappings = cat2cat_mappings(trans, "backward")

# The result exposes the processed frames under the "old" and "new" keys.
res = cat2cat(data, mappings)
data_final = concat([res["old"], res["new"]])

sub_cols = [
    "id", "edu", "code", "year", "index_c2c",
    "g_new_c2c", "rep_c2c", "wei_naive_c2c", "wei_freq_c2c",
]
# Fixed seed: without it the five rows shown per year change on every run,
# so the notebook output could never be reproduced.
data_final.groupby(["year"]).sample(5, random_state=42).loc[:, sub_cols]

Unnamed: 0,id,edu,code,year,index_c2c,g_new_c2c,rep_c2c,wei_naive_c2c,wei_freq_c2c
161573,28717,4,2241,2008,12176,222222,24,0.041667,0.0
126576,25978,8,5159,2008,9437,541906,23,0.043478,0.022599
40770,19514,1,3422,2008,2973,333106,9,0.111111,0.0
216833,32883,6,9132,2008,16342,524601,18,0.055556,0.001294
69883,21681,1,2321,2008,5140,233008,34,0.029412,0.104824
43838,43839,2,816014,2010,10075,816014,1,1.0,1.0
38496,38497,2,222101,2010,4733,222101,1,1.0,1.0
35620,35621,2,334201,2010,1857,334201,1,1.0,1.0
42113,42114,1,233025,2010,8350,233025,1,1.0,1.0
44535,44536,2,213203,2010,10772,213203,1,1.0,1.0


### With ML

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# ML settings: the 2010 frame, the "code" column, three feature columns and a
# single scikit-learn classifier.
# NOTE(review): argument roles are read off their position here - confirm
# against the cat2cat_ml signature.
ml = cat2cat_ml(
    o_new,
    "code",
    ["salary", "age", "edu"],
    [KNeighborsClassifier()],
)

# `data` and `mappings` are the objects built in the previous cell; passing
# `ml` as the third argument adds the model-based weights.
c2c = cat2cat(data, mappings, ml)
data_final = concat([c2c["old"], c2c["new"]])

# The classifier gets its own weight column, named after the model class.
sub_cols = [
    "id", "year", "wei_naive_c2c", "wei_freq_c2c",
    "wei_KNeighborsClassifier_c2c",
]
# Fixed seed: without it the five rows shown per year change on every run,
# so the notebook output could never be reproduced.
data_final.groupby(["year"]).sample(5, random_state=42).loc[:, sub_cols]

Unnamed: 0,id,year,wei_naive_c2c,wei_freq_c2c,wei_KNeighborsClassifier_c2c
210201,32382,2008,0.052632,0.001908,0.0
182212,30242,2008,0.055556,0.010349,0.2
116904,25196,2008,0.0625,0.03125,0.0
196231,31256,2008,0.029412,0.007421,0.0
89141,22981,2008,0.021739,0.009259,0.0
43880,43881,2010,1.0,1.0,1.0
36729,36730,2010,1.0,1.0,1.0
34284,34285,2010,1.0,1.0,1.0
42774,42775,2010,1.0,1.0,1.0
35291,35292,2010,1.0,1.0,1.0


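With four periods, one mapping table, and the backward direction, the procedure is applied twice: first to the 2008–2010 pair, then to the 2006–2008 pair using the already processed 2008 data: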

In [20]:
from cat2cat.cat2cat_utils import dummy_c2c

# One independent frame per survey period; `.copy()` detaches each frame from
# `occup`.
o_2006, o_2008, o_2010, o_2012 = (
    occup.loc[occup.year == year, :].copy() for year in (2006, 2008, 2010, 2012)
)

# A single mapping table is reused for both steps.
mappings = cat2cat_mappings(trans, "backward")

# Step 1: the 2008/2010 pair, both using the "code" column.
data = cat2cat_data(o_2008, o_2010, "code", "code", "year")
occup_back_2008_2010 = cat2cat(data, mappings)

# Step 2: the 2006/2008 pair. The 2008 side is the frame produced by step 1,
# and its "g_new_c2c" column is passed in place of "code".
data = cat2cat_data(
    o_2006, occup_back_2008_2010["old"],
    "code", "g_new_c2c", "year"
)
occup_back_2006_2008 = cat2cat(data, mappings)

o_2006_n = occup_back_2006_2008["old"]
o_2008_n = occup_back_2006_2008["new"]  # or occup_back_2008_2010["old"]
o_2010_n = occup_back_2008_2010["new"]
# 2012 is not run through cat2cat; presumably dummy_c2c adds the `*_c2c`
# columns so the frame can be stacked with the others - TODO confirm.
o_2012_n = dummy_c2c(o_2012, "code")

data_final = concat([o_2006_n, o_2008_n, o_2010_n, o_2012_n])

sub_cols = [
    "id", "edu", "code", "year", "index_c2c",
    "g_new_c2c", "rep_c2c", "wei_naive_c2c", "wei_freq_c2c",
]
# Fixed seed: without it the five rows shown per year change on every run,
# so the notebook output could never be reproduced.
data_final.groupby(["year"]).sample(5, random_state=42).loc[:, sub_cols]

Unnamed: 0,id,edu,code,year,index_c2c,g_new_c2c,rep_c2c,wei_naive_c2c,wei_freq_c2c
115409,8962,1,2331,2006,8961,234118,19,0.052632,0.052632
49379,4303,1,2231,2006,4302,221237,70,0.014286,0.014286
132594,10325,6,5221,2006,10324,522304,19,0.052632,0.051803
195190,15210,1,2321,2006,15209,233003,34,0.029412,0.029412
17886,1663,6,8323,2006,1662,833202,3,0.333333,0.333333
23218,18139,5,3431,2008,1598,334201,11,0.090909,0.220994
163223,28822,1,2231,2008,12281,221259,70,0.014286,0.021429
16100,17622,1,2231,2008,1081,221253,70,0.014286,0.007143
8695,17085,4,2241,2008,544,222206,24,0.041667,0.0
129947,26220,1,2321,2008,9679,233001,34,0.029412,0.039889
