# Example usage of `OptimizedUnivariateEncoding`

This example shows how to fit an `OptimizedUnivariateEncoding` model and inspect the resulting statistics for both numerical and categorical variables.

In [2]:
import pandas
import kuplift
# Encoder that is fitted and inspected in the cells below.
ue = kuplift.OptimizedUnivariateEncoding()

# Load the data and force `object` dtype on the categorical columns that
# pandas does not detect as such on its own.
categorical_dtypes = {"VAR2": object, "CIBLE": object}
df = pandas.read_csv("../data/data_uplift_missing.csv").astype(categorical_dtypes)
df

Unnamed: 0,VAR1,VAR2,VAR3,VAR4,TRAITEMENT,CIBLE
0,,0,11.076782,0.985791,T1,1
1,,0,11.980937,0.606120,T1,1
2,,0,11.800886,0.809460,T1,1
3,7.233434,7,24.466868,0.990217,T1,0
4,6.477910,6,22.955819,0.091321,T1,1
...,...,...,...,...,...,...
1995,6.100756,6,22.201511,0.400879,T0,0
1996,6.108354,6,22.216709,0.896717,T0,0
1997,3.531834,3,17.063668,0.785622,T0,1
1998,9.805668,9,29.611336,0.853881,T0,1


In [3]:
# The last two columns hold the treatment ("TRAITEMENT") and the target ("CIBLE");
# every column before them is used as an explanatory variable.
feature_columns = df.columns[:-2]
ue.fit(df[feature_columns], df["TRAITEMENT"], df["CIBLE"])

In [4]:
# List the level obtained for each fitted variable as (variable name, level) pairs.
ue.get_levels()

[('VAR2', 0.973101), ('VAR1', 0.883716), ('VAR3', 0.881227), ('VAR4', 0)]

In [5]:
# Print the text representation of the partition for the numerical variable VAR1
# (an interval partition in the output below).
print(ue.get_partition("VAR1"))

Interval partition
    10 intervals:
      - []
      - [-inf, 2.0004[
      - [2.0004, 3.00025[
      - [3.00025, 4[
      - [4, 4.9983[
      - [4.9983, 5.991[
      - [5.991, 7.0002[
      - [7.0002, 8.0009[
      - [8.0009, 9[
      - [9, inf[


In [6]:
# Print the text representation of the partition for the categorical variable VAR2
# (a value group partition in the output below).
print(ue.get_partition("VAR2"))

Value group partition
    2 groups ("*" indicates the default group):
        - {8, 6, 0, 4, 2}
      * - {3, 7, 1, 5, 9}


## Statistics on numerical variable `VAR1`

In [7]:
# Target frequencies for each part of the VAR1 partition, one column per
# (target value, treatment) pair.
# NOTE(review): the output columns are labelled "P(target|treatment)" although the
# values shown are integer counts — confirm the labels against the library.
ue.get_target_frequencies("VAR1")

Unnamed: 0,Part,P(1|T1),P(1|T0),P(0|T1),P(0|T0)
0,[],110,0,0,100
1,"[-inf, 2.0004[",0,92,100,0
2,"[2.0004, 3.00025[",95,0,0,97
3,"[3.00025, 4[",0,108,107,0
4,"[4, 4.9983[",97,0,0,106
5,"[4.9983, 5.991[",0,93,97,1
6,"[5.991, 7.0002[",106,0,0,113
7,"[7.0002, 8.0009[",1,91,103,0
8,"[8.0009, 9[",101,0,0,118
9,"[9, inf[",0,81,83,0


In [8]:
# Target probabilities for each part of the VAR1 partition, one column per
# (target value, treatment) pair, as labelled in the output header.
ue.get_target_probabilities("VAR1")

Unnamed: 0,Part,P(1|T1),P(1|T0),P(0|T1),P(0|T0)
0,[],0.055,0.0,0.0,0.05
1,"[-inf, 2.0004[",0.0,0.046,0.05,0.0
2,"[2.0004, 3.00025[",0.0475,0.0,0.0,0.0485
3,"[3.00025, 4[",0.0,0.054,0.0535,0.0
4,"[4, 4.9983[",0.0485,0.0,0.0,0.053
5,"[4.9983, 5.991[",0.0,0.0465,0.0485,0.0005
6,"[5.991, 7.0002[",0.053,0.0,0.0,0.0565
7,"[7.0002, 8.0009[",0.0005,0.0455,0.0515,0.0
8,"[8.0009, 9[",0.0505,0.0,0.0,0.059
9,"[9, inf[",0.0,0.0405,0.0415,0.0


In [9]:
# Uplift on target value 1 for each part of the VAR1 partition, passing "T0" as the
# treatment argument. In the output below the "Uplift 1 T1" values match
# P(1|T1) - P(1|T0) from the probabilities table, so T0 appears to act as the
# reference treatment — confirm against the kuplift documentation.
ue.get_uplift(1, "T0", "VAR1")

Unnamed: 0,Part,Uplift 1 T1
0,[],0.055
1,"[-inf, 2.0004[",-0.046
2,"[2.0004, 3.00025[",0.0475
3,"[3.00025, 4[",-0.054
4,"[4, 4.9983[",0.0485
5,"[4.9983, 5.991[",-0.0465
6,"[5.991, 7.0002[",0.053
7,"[7.0002, 8.0009[",-0.045
8,"[8.0009, 9[",0.0505
9,"[9, inf[",-0.0405


## Statistics on categorical variable `VAR2`

In [10]:
# Target frequencies for each value group of the VAR2 partition, one column per
# (target value, treatment) pair.
# NOTE(review): the output columns are labelled "P(target|treatment)" although the
# values shown are integer counts — confirm the labels against the library.
ue.get_target_frequencies("VAR2")

Unnamed: 0,Part,P(1|T1),P(1|T0),P(0|T1),P(0|T0)
0,"[8, 6, 0, 4, 2]",510,0,0,535
1,"[3, 7, 1, 5, 9]",0,465,490,0


In [11]:
# Target probabilities for each value group of the VAR2 partition, one column per
# (target value, treatment) pair, as labelled in the output header.
ue.get_target_probabilities("VAR2")

Unnamed: 0,Part,P(1|T1),P(1|T0),P(0|T1),P(0|T0)
0,"[8, 6, 0, 4, 2]",0.255,0.0,0.0,0.2675
1,"[3, 7, 1, 5, 9]",0.0,0.2325,0.245,0.0


In [12]:
# Uplift on target value 1 for each value group of the VAR2 partition, passing "T0"
# as the treatment argument. In the output below the "Uplift 1 T1" values match
# P(1|T1) - P(1|T0) from the probabilities table, so T0 appears to act as the
# reference treatment — confirm against the kuplift documentation.
ue.get_uplift(1, "T0", "VAR2")

Unnamed: 0,Part,Uplift 1 T1
0,"[8, 6, 0, 4, 2]",0.255
1,"[3, 7, 1, 5, 9]",-0.2325
