# 1. Settings

## 1.1 User variables

In [1]:
# Star-import of the project configuration. TARGET (used below) is not defined
# in this notebook, so it presumably comes from here — as does MIN_SUPPORT,
# which a later cell reads. NOTE(review): confirm against config.py.
# Any name assigned below overrides a same-named value from config.
from config import *

# Number of learning runs performed when USE_GULA is False (PRIDE branch).
NB_RUNS = 30

# Variables the learned rules must explain (rule heads).
TARGETS = [TARGET]
# Heuristics passed to pylfit's fit() through the "heuristics" option.
HEURISTICS = ["try_all_atoms", "max_coverage_static"]  # other values tried: "max_coverage_dynamic", "max_diversity"
# Passed to pylfit's fit() through the "threads" option.
THREADS = 8
# True: missing values are replaced by "?" and declared to pylfit as unknown values.
# False: rows containing missing values are dropped.
USE_NAN = True

# CSV read by the data-loading cell.
DATA_PATH = "tmp/data_discrete.csv"

## 1.2 Dev variables

In [2]:
# Seed applied to both `random` and `numpy.random` in the imports cell.
RANDOM_SEED = 0

# NOTE(review): same path as DATA_PATH; not read by any of the visible cells.
INPUT_DATA_DISCRETE = "tmp/data_discrete.csv"
# Destination CSV for the learned rules.
OUTPUT_CSV = "tmp/lfit_output.csv"

# NOTE(review): not read by any of the visible cells (fit() is called with "verbose": 0).
VERBOSE = True

# True: learn once with the "gula" algorithm.
# False: run "pride" NB_RUNS times on shuffled column orders and merge the rules.
USE_GULA = True

## 1.3 Imports

In [3]:
import pandas as pd
import numpy
import random
import pylfit
import os

# Seed both random generators so that the column shuffles performed with
# `random.shuffle` (and any numpy-based randomness) repeat across kernel restarts.
random.seed(RANDOM_SEED)
numpy.random.seed(RANDOM_SEED)

## 1.4 Constants

In [4]:
# Constants
# NOTE(review): DEBUG is not read by any of the visible cells.
DEBUG = False

# 2. Data Loading

## 2.1 Training dataset preview

In [5]:
# Load the discretized training data. `df` is the frame every later cell works on,
# so this cell must run even when the preview itself is not needed.
print("Training dataset:")
df = pd.read_csv(DATA_PATH)

display(df)

# Class balance of every target, listed in order of first appearance.
for target in TARGETS:
    target_values = df[target]
    for val in target_values.unique():
        # NaN never compares equal to itself, so `== val` would always report
        # 0 rows for missing values; count those with isna() instead.
        matches = target_values.isna() if pd.isna(val) else target_values == val
        print(val, int(matches.sum()))

Training dataset:


Unnamed: 0,Picea,Pinus,Alnus,Betula,Quercus,Ambrosia,Chenopodiaceae,Synth_0,Synth_1,Synth_2,Synth_3,Synth_4,Synth_5,Synth_6,Synth_7,Synth_8,Synth_9
0,1,-1,1,-1,1,-1,1,1,-1,-1,-1,-1,-1,-1,1,1,-1
1,-1,1,1,1,-1,1,-1,1,1,-1,-1,-1,-1,1,1,1,-1
2,1,1,-1,-1,-1,1,-1,1,-1,1,1,1,1,-1,1,1,1
3,-1,-1,1,1,1,-1,1,-1,1,-1,-1,1,1,1,1,-1,-1
4,1,-1,-1,-1,1,-1,-1,1,-1,-1,-1,1,1,-1,-1,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,-1,1,1,-1,-1,-1,1,-1,-1,1,1,1,-1,-1,-1,-1,1
57,1,-1,1,1,1,1,-1,-1,1,-1,1,-1,-1,1,-1,-1,-1
58,1,-1,-1,1,-1,-1,-1,1,-1,-1,-1,-1,-1,-1,-1,1,1
59,-1,1,1,1,-1,1,1,1,1,-1,-1,-1,-1,1,-1,1,-1


-1 34
1 27


In [6]:
# Learn logic rules for TARGETS from `df` with pylfit and collect them in `df_output`.

# Scratch CSV used to hand the data frame over to pylfit's CSV loader.
LFIT_INPUT_CSV = "tmp/lfit_input.csv"
# Column layout of the rule table.
RULE_COLUMNS = ["head_variable", "head_value", "nb_supports", "rule", "supports", "rule_str"]


def load_lfit_dataset(frame, path=LFIT_INPUT_CSV):
    """Write ``frame`` to ``path`` and load it back as a pylfit transitions dataset.

    Every column of ``frame`` that is not listed in TARGETS is used as a feature,
    in the column order of ``frame``; "?" is declared as the unknown value.
    Writing and reading are done together so the dataset can never be built
    from a missing or stale file.
    """
    feature_names = [c for c in frame.columns if c not in TARGETS]
    frame.to_csv(path)
    return pylfit.preprocessing.discrete_state_transitions_dataset_from_csv(
        path=path, feature_names=feature_names, target_names=TARGETS, unknown_values=["?"]
    )


def fit_rules(train_dataset, support_dataset, algorithm, options):
    """Fit a DMVLP on ``train_dataset`` and tabulate the learned rules.

    Args:
        train_dataset: pylfit dataset the model is fitted on.
        support_dataset: pylfit dataset on which rule supports are computed.
        algorithm: algorithm name given to ``model.compile`` ("gula" or "pride").
        options: options dict forwarded to ``model.fit``.

    Returns:
        ``(model, rules)`` where ``rules`` is a DataFrame with RULE_COLUMNS,
        one row per rule; ``supports`` holds the supporting transitions as
        pairs of tuples so they are hashable.
    """
    model = pylfit.models.DMVLP(features=train_dataset.features, targets=train_dataset.targets)
    model.compile(algorithm=algorithm)
    model.fit(dataset=train_dataset, options=options)
    rules_supports = model.supports(support_dataset)

    rows = [
        [
            r.head.variable,
            r.head.value,
            len(rules_supports[r]),
            r,
            [(tuple(s1), tuple(s2)) for s1, s2 in rules_supports[r]],
            r.to_string(),
        ]
        for r in model.rules
    ]
    return model, pd.DataFrame(rows, columns=RULE_COLUMNS)


# Encode numeric columns as strings ("1", "-1", ...); going through the
# nullable Int64 dtype keeps missing values missing and avoids a "1.0" rendering.
cols = list(df.select_dtypes(include=["number"]).columns)
df[cols] = df[cols].astype('Int64').astype('string')

# Missing values are either kept as the "?" unknown value or their rows removed.
df = df.fillna("?") if USE_NAN else df.dropna()

col_order = list(df.columns)

if USE_GULA:
    FEATURES = [c for c in df.columns if c not in TARGETS]
    dataset = load_lfit_dataset(df)

    # Single GULA run; supports are computed on the training data itself.
    model, df_output = fit_rules(
        dataset,
        dataset,
        algorithm="gula",
        options={"heuristics": HEURISTICS, "verbose": 0, "threads": THREADS, "supported_only": True},
    )

    print("rules found:", len(df_output))

    df_output.to_csv(OUTPUT_CSV, index=False)

else:
    # Reference dataset in the original column order, used to compute supports
    # for every run. The frame is written right before being read back:
    # previously the CSV was read before this branch had written it.
    original_dataset = load_lfit_dataset(df)

    df_output = pd.DataFrame(columns=RULE_COLUMNS)

    for run_id in range(NB_RUNS):
        # Shuffle the variable order for this run; the shuffled frame is kept
        # local so the shared `df` is not reordered as a side effect.
        random.shuffle(col_order)
        df_shuffled = df[col_order]

        FEATURES = [c for c in df_shuffled.columns if c not in TARGETS]
        dataset = load_lfit_dataset(df_shuffled)

        model, df_rules = fit_rules(
            dataset,
            original_dataset,
            algorithm="pride",
            options={"heuristics": HEURISTICS, "verbose": 0, "threads": THREADS},
        )

        nb_rules_of_run = len(df_rules)
        nb_rules_total_before = len(df_output)

        # Merge with the rules of earlier runs, keeping the first occurrence of each rule.
        df_output = (
            pd.concat([df_output, df_rules])
            .reset_index(drop=True)
            .drop_duplicates(subset=["rule_str"])
        )

        nb_new_rules = len(df_output) - nb_rules_total_before
        nb_duplicate_rules = nb_rules_of_run - nb_new_rules

        print("Run", f"{run_id + 1}/{NB_RUNS}", "- new rules found:", nb_new_rules, "- total:", len(df_output), "- duplicate:", nb_duplicate_rules)

    # `rule_str` was only needed for de-duplication in this branch.
    df_output = df_output[["head_variable", "head_value", "nb_supports", "rule", "supports"]]
    df_output.to_csv(OUTPUT_CSV, index=False)

df_output = df_output.sort_values(["nb_supports"], ascending=False)

display(df_output[df_output["head_value"] == "1"])
display(df_output[df_output["head_value"] == "-1"])

rules found: 3519


Unnamed: 0,head_variable,head_value,nb_supports,rule,supports,rule_str
2175,Synth_3,1,16,"Synth_3(1) :- Betula(-1), Pinus(1), Synth_4(1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Betula(-1), Pinus(1), Synth_4(1)."
1823,Synth_3,1,16,"Synth_3(1) :- Pinus(1), Synth_1(-1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Pinus(1), Synth_1(-1)."
2101,Synth_3,1,16,"Synth_3(1) :- Pinus(1), Synth_4(1), Synth_6(-1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Pinus(1), Synth_4(1), Synth_6(-1)."
1733,Synth_3,1,15,"Synth_3(1) :- Synth_4(1), Synth_9(1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Synth_4(1), Synth_9(1)."
1743,Synth_3,1,14,"Synth_3(1) :- Synth_2(1), Synth_4(1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Synth_2(1), Synth_4(1)."
...,...,...,...,...,...,...
2384,Synth_3,1,1,"Synth_3(1) :- Alnus(-1), Betula(1), Pinus(-1),...","[((1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1...","Synth_3(1) :- Alnus(-1), Betula(1), Pinus(-1),..."
2372,Synth_3,1,1,"Synth_3(1) :- Betula(1), Chenopodiaceae(1), Qu...","[((1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1...","Synth_3(1) :- Betula(1), Chenopodiaceae(1), Qu..."
2382,Synth_3,1,1,"Synth_3(1) :- Alnus(-1), Betula(-1), Chenopodi...","[((-1, 1, -1, -1, -1, 1, 1, -1, -1, -1, 1, 1, ...","Synth_3(1) :- Alnus(-1), Betula(-1), Chenopodi..."
2379,Synth_3,1,1,"Synth_3(1) :- Alnus(-1), Betula(1), Quercus(1)...","[((1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1...","Synth_3(1) :- Alnus(-1), Betula(1), Quercus(1)..."


Unnamed: 0,head_variable,head_value,nb_supports,rule,supports,rule_str
76,Synth_3,-1,23,"Synth_3(-1) :- Synth_0(1), Synth_4(-1).","[((1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, ...","Synth_3(-1) :- Synth_0(1), Synth_4(-1)."
1119,Synth_3,-1,20,"Synth_3(-1) :- Synth_2(-1), Synth_4(-1), Synth...","[((1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, ...","Synth_3(-1) :- Synth_2(-1), Synth_4(-1), Synth..."
1118,Synth_3,-1,19,"Synth_3(-1) :- Synth_1(1), Synth_2(-1), Synth_...","[((-1, 1, 1, 1, -1, 1, -1, 1, 1, -1, -1, -1, 1...","Synth_3(-1) :- Synth_1(1), Synth_2(-1), Synth_..."
1121,Synth_3,-1,18,"Synth_3(-1) :- Synth_2(-1), Synth_6(1), Synth_...","[((-1, 1, 1, 1, -1, 1, -1, 1, 1, -1, -1, -1, 1...","Synth_3(-1) :- Synth_2(-1), Synth_6(1), Synth_..."
1141,Synth_3,-1,16,"Synth_3(-1) :- Synth_4(-1), Synth_8(1), Synth_...","[((1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, ...","Synth_3(-1) :- Synth_4(-1), Synth_8(1), Synth_..."
...,...,...,...,...,...,...
941,Synth_3,-1,1,"Synth_3(-1) :- Pinus(1), Synth_4(1), Synth_6(1...","[((-1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, 1...","Synth_3(-1) :- Pinus(1), Synth_4(1), Synth_6(1..."
943,Synth_3,-1,1,"Synth_3(-1) :- Alnus(1), Betula(-1), Quercus(-...","[((-1, 1, 1, -1, -1, -1, -1, 1, 1, -1, -1, 1, ...","Synth_3(-1) :- Alnus(1), Betula(-1), Quercus(-..."
946,Synth_3,-1,1,"Synth_3(-1) :- Betula(-1), Picea(-1), Quercus(...","[((-1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, ...","Synth_3(-1) :- Betula(-1), Picea(-1), Quercus(..."
953,Synth_3,-1,1,"Synth_3(-1) :- Quercus(-1), Synth_0(-1), Synth...","[((-1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, ...","Synth_3(-1) :- Quercus(-1), Synth_0(-1), Synth..."


# 3. Influence Graph extraction

In [7]:
def collect_influences(rules_up, rules_down, variables):
    """Gather, per variable, the transitions supporting a positive or negative influence.

    A body condition ``feature("1")`` in a rule whose head value is "1" counts as
    a positive influence of ``feature``; with ``feature("-1")`` it counts as
    negative. For rules whose head value is "-1" the signs are swapped.
    Conditions with any other value are ignored.

    Args:
        rules_up: rule table (columns "rule" and "supports") for head value "1".
        rules_down: rule table (columns "rule" and "supports") for head value "-1".
        variables: variable names to create entries for.

    Returns:
        dict ``{variable: {1: set_of_supports, -1: set_of_supports}}``.
    """
    influences = {var: {-1: set(), 1: set()} for var in variables}

    for rules, influence_type in [(rules_up, 1), (rules_down, -1)]:
        for rule, supports in zip(rules["rule"], rules["supports"]):
            # Iterating the rule body yields the variables it conditions on.
            for feature in rule.body:
                value = rule.get_condition(feature).value
                if value == "1":
                    influences[feature][influence_type].update(supports)
                elif value == "-1":
                    influences[feature][-influence_type].update(supports)

    return influences


total_rules = len(df_output)
print("Total rules:", total_rules)
# Keep only rules backed by at least MIN_SUPPORT transitions.
df_tmp = df_output[df_output["nb_supports"] >= MIN_SUPPORT]
print("With min support", len(df_tmp))

df_up = df_tmp[df_tmp["head_value"] == "1"]
df_down = df_tmp[df_tmp["head_value"] == "-1"]
display(df_up)
display(df_down)

influences = collect_influences(df_up, df_down, df.columns)

# One row per variable: number of distinct supporting transitions for each sign.
data = [[var, len(signs[1]), len(signs[-1])] for var, signs in influences.items()]

df_influences = pd.DataFrame(data, columns=["Feature", "positive_influence", "negative_influence"])
# Net influence: positive minus negative support counts.
df_influences["influence"] = df_influences["positive_influence"] - df_influences["negative_influence"]
df_influences = df_influences.sort_values(by=["influence"], ascending=False)
display(df_influences)

Total rules: 3519
With min support 3519


Unnamed: 0,head_variable,head_value,nb_supports,rule,supports,rule_str
2175,Synth_3,1,16,"Synth_3(1) :- Betula(-1), Pinus(1), Synth_4(1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Betula(-1), Pinus(1), Synth_4(1)."
1823,Synth_3,1,16,"Synth_3(1) :- Pinus(1), Synth_1(-1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Pinus(1), Synth_1(-1)."
2101,Synth_3,1,16,"Synth_3(1) :- Pinus(1), Synth_4(1), Synth_6(-1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Pinus(1), Synth_4(1), Synth_6(-1)."
1733,Synth_3,1,15,"Synth_3(1) :- Synth_4(1), Synth_9(1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Synth_4(1), Synth_9(1)."
1743,Synth_3,1,14,"Synth_3(1) :- Synth_2(1), Synth_4(1).","[((1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1...","Synth_3(1) :- Synth_2(1), Synth_4(1)."
...,...,...,...,...,...,...
2384,Synth_3,1,1,"Synth_3(1) :- Alnus(-1), Betula(1), Pinus(-1),...","[((1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1...","Synth_3(1) :- Alnus(-1), Betula(1), Pinus(-1),..."
2372,Synth_3,1,1,"Synth_3(1) :- Betula(1), Chenopodiaceae(1), Qu...","[((1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1...","Synth_3(1) :- Betula(1), Chenopodiaceae(1), Qu..."
2382,Synth_3,1,1,"Synth_3(1) :- Alnus(-1), Betula(-1), Chenopodi...","[((-1, 1, -1, -1, -1, 1, 1, -1, -1, -1, 1, 1, ...","Synth_3(1) :- Alnus(-1), Betula(-1), Chenopodi..."
2379,Synth_3,1,1,"Synth_3(1) :- Alnus(-1), Betula(1), Quercus(1)...","[((1, -1, -1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1...","Synth_3(1) :- Alnus(-1), Betula(1), Quercus(1)..."


Unnamed: 0,head_variable,head_value,nb_supports,rule,supports,rule_str
76,Synth_3,-1,23,"Synth_3(-1) :- Synth_0(1), Synth_4(-1).","[((1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, ...","Synth_3(-1) :- Synth_0(1), Synth_4(-1)."
1119,Synth_3,-1,20,"Synth_3(-1) :- Synth_2(-1), Synth_4(-1), Synth...","[((1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, ...","Synth_3(-1) :- Synth_2(-1), Synth_4(-1), Synth..."
1118,Synth_3,-1,19,"Synth_3(-1) :- Synth_1(1), Synth_2(-1), Synth_...","[((-1, 1, 1, 1, -1, 1, -1, 1, 1, -1, -1, -1, 1...","Synth_3(-1) :- Synth_1(1), Synth_2(-1), Synth_..."
1121,Synth_3,-1,18,"Synth_3(-1) :- Synth_2(-1), Synth_6(1), Synth_...","[((-1, 1, 1, 1, -1, 1, -1, 1, 1, -1, -1, -1, 1...","Synth_3(-1) :- Synth_2(-1), Synth_6(1), Synth_..."
1141,Synth_3,-1,16,"Synth_3(-1) :- Synth_4(-1), Synth_8(1), Synth_...","[((1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, ...","Synth_3(-1) :- Synth_4(-1), Synth_8(1), Synth_..."
...,...,...,...,...,...,...
941,Synth_3,-1,1,"Synth_3(-1) :- Pinus(1), Synth_4(1), Synth_6(1...","[((-1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, 1...","Synth_3(-1) :- Pinus(1), Synth_4(1), Synth_6(1..."
943,Synth_3,-1,1,"Synth_3(-1) :- Alnus(1), Betula(-1), Quercus(-...","[((-1, 1, 1, -1, -1, -1, -1, 1, 1, -1, -1, 1, ...","Synth_3(-1) :- Alnus(1), Betula(-1), Quercus(-..."
946,Synth_3,-1,1,"Synth_3(-1) :- Betula(-1), Picea(-1), Quercus(...","[((-1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, ...","Synth_3(-1) :- Betula(-1), Picea(-1), Quercus(..."
953,Synth_3,-1,1,"Synth_3(-1) :- Quercus(-1), Synth_0(-1), Synth...","[((-1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, ...","Synth_3(-1) :- Quercus(-1), Synth_0(-1), Synth..."


Unnamed: 0,Feature,positive_influence,negative_influence,influence
11,Synth_4,51,10,41
9,Synth_2,46,15,31
16,Synth_9,43,18,25
1,Pinus,42,19,23
14,Synth_7,33,28,5
0,Picea,32,29,3
12,Synth_5,31,30,1
10,Synth_3,0,0,0
6,Chenopodiaceae,30,31,-1
4,Quercus,29,32,-3


In [8]:
import networkx as nx
import matplotlib.pyplot as plt

# Minimum absolute net influence a feature needs to appear in the graph.
MIN_INFLUENCE = 20

# Link TARGET to every feature whose net influence magnitude reaches the threshold;
# the edge label carries the signed influence score.
strong_influences = df_influences[df_influences["influence"].abs() >= MIN_INFLUENCE]

G = nx.Graph()
for _, influence_row in strong_influences.iterrows():
    G.add_edge(influence_row["Feature"], TARGET, label=influence_row["influence"])

# Fixed layout seed so the figure is identical from one run to the next.
pos = nx.spring_layout(G, seed=42, k=0.9)
labels = nx.get_edge_attributes(G, "label")

plt.figure(figsize=(12, 10))
nx.draw(
    G,
    pos,
    with_labels=True,
    font_size=10,
    node_size=700,
    node_color="lightblue",
    edge_color="gray",
    alpha=0.6,
)
nx.draw_networkx_edge_labels(
    G,
    pos,
    edge_labels=labels,
    font_size=8,
    label_pos=0.3,
    verticalalignment="baseline",
)
plt.title("Knowledge Graph")
plt.show()

ModuleNotFoundError: No module named 'networkx'

# TODO