In [1]:
! pip install bamt

Collecting catboost>=1.0.6 (from bamt)
  Using cached catboost-1.2.7-cp312-cp312-macosx_11_0_universal2.whl.metadata (1.2 kB)
Using cached catboost-1.2.7-cp312-cp312-macosx_11_0_universal2.whl (27.0 MB)
Installing collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import Self
from uuid import UUID, uuid4

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from bamt.networks import HybridBN
from bamt.preprocessors import Preprocessor
from scipy import stats
from sklearn import preprocessing as pp


class Sex(Enum):
    """Sex of an agent; values are the consecutive integers 1 and 2."""

    male = 1
    female = 2


class MaritalStatus(Enum):
    """Marital status of an agent.

    The canonical member name ``maried`` is misspelled; it is kept so that
    existing references keep working. ``married`` is an alias for the same
    member (same value), so iteration still yields only ``single`` and
    ``maried``.
    """

    single = 1
    maried = 2
    # Correctly spelled alias; resolves to the very same member as `maried`.
    married = maried


class Education(Enum):
    """Highest education level of an agent.

    The canonical member name ``secondary_proffesional`` is misspelled; it is
    kept for backward compatibility and ``secondary_professional`` is provided
    as an alias for the same member.
    """

    secondary = 1  # general secondary education
    secondary_proffesional = 2
    # Correctly spelled alias; resolves to the same member as above.
    secondary_professional = secondary_proffesional
    higher = 3


@dataclass
class Agent:
    """One person in the synthetic population.

    Constructor arguments (unchanged): ``age``, ``sex``, ``salary``,
    ``education`` and the optional ``mortgage_dept``, ``marital_status``,
    ``children`` and ``parents``.

    ``id`` is generated automatically per instance. It is not a constructor
    argument and takes no part in ``repr`` or ``==``, so two agents with the
    same attributes still compare equal and print as before.
    """

    # Previously `id = uuid4()` was an un-annotated class attribute, i.e. it
    # was evaluated once at class creation and shared by every Agent.
    # default_factory draws a fresh UUID for each instance; init=False keeps
    # the __init__ signature (and the field-ordering rules) untouched.
    id: UUID = field(default_factory=uuid4, init=False, repr=False, compare=False)
    age: int
    sex: Sex
    salary: int
    education: Education | str
    # NOTE(review): "dept" is presumably a typo for "debt"; the name is kept
    # because it is a constructor keyword.
    mortgage_dept: int | None = None
    marital_status: MaritalStatus | str | None = None
    children: list[Self] | None = None
    parents: list[Self] | None = None

In [3]:
import pandas as pd


df = pd.read_csv("salary.csv")

In [4]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [5]:
def parse_sex(v):
    """Map a raw ``sex`` cell to a :class:`Sex` member.

    Surrounding whitespace is ignored, so both ``" Male"`` (the form the
    original code matched) and ``"Male"`` are accepted.

    Args:
        v: Raw cell value; anything that is not a string (e.g. a missing
            value) is treated as unknown.

    Returns:
        ``Sex.male`` / ``Sex.female``, or ``None`` when the value is not
        recognised.
    """
    if not isinstance(v, str):
        return None
    label = v.strip()
    if label == "Male":
        return Sex.male
    if label == "Female":
        return Sex.female
    return None


# Build one Agent per row of `df`.
# Iterating the three needed columns in lockstep avoids DataFrame.iterrows(),
# which materialises a full Series for every row and is much slower.
# NOTE(review): salary is a constant placeholder (1) for every agent; the
# frame's own "salary" column is not used here.
population = [
    Agent(age=age, sex=parse_sex(sex), education=education, salary=1)
    for age, sex, education in zip(df["age"], df["sex"], df["education"])
]

In [6]:
population[23]

Agent(age=43, sex=<Sex.male: 1>, salary=1, education=' 11th', mortgage_dept=None, marital_status=None, children=None, parents=None)

In [26]:
rlms = pd.read_excel("Data_RLMS.xlsx")
rlms.columns

Index(['idind', 'psu', 'status', 'age', 'male', 'industry', 'lnwage', 'public',
       'internet', 'children', 'urban', 'educ', 'id1', 'id2', 'id3', 'id4',
       'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11', 'id12', 'id13',
       'id14', 'id15', 'id16', 'id17', 'id18', 'id19', 'id20', 'id21', 'id22',
       'id23', 'id24', 'id25', 'id26', 'id27', 'id28', 'id29', 'id30', 'id31',
       'id32', 'id33', 'id34', 'id35', 'id36', 'id37', 'id38', 'id39', 'id40',
       'id41', 'id42', 'id43', 'id44', 'id45', 'id46', 'id47', 'id48', 'id49',
       'id50', 'id51', 'id52', 'id53', 'id54', 'id55', 'id56', 'id57', 'id58',
       'id59', 'id60', 'id61', 'id62', 'id63', 'id64', 'id65', 'id66', 'id67',
       'id68', 'id69', 'id70', 'id71', 'id72', 'id73', 'id74', 'id75'],
      dtype='object')

In [60]:
# Variables fed to the Bayesian network: two numeric columns taken as-is plus
# `male` and `educ` cast to str (object dtype).
# `assign` builds a new frame, so nothing is written into a slice of `rlms`
# (the previous column assignments did that and can trigger pandas'
# SettingWithCopyWarning).
# Other RLMS columns that were tried earlier and are currently left out:
# "industry", "public", "children", "urban".
data = rlms[["age", "lnwage"]].assign(
    male=rlms.male.astype(str),
    educ=rlms.educ.astype(str),
)

In [61]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3871 entries, 0 to 3870
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     3871 non-null   int64  
 1   lnwage  3871 non-null   float64
 2   male    3871 non-null   object 
 3   educ    3871 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 121.1+ KB


In [72]:
data

Unnamed: 0,age,lnwage,male,educ
0,59,8.954980,1,1
1,40,10.628960,0,3
2,53,10.341270,0,1
3,47,9.753487,1,1
4,54,10.158950,0,3
...,...,...,...,...
3866,34,10.015850,0,1
3867,39,11.139780,1,2
3868,32,11.362930,1,3
3869,19,12.461540,1,2


In [62]:
# data.industry.unique()

In [63]:
# Preprocessing steps: a label encoder and a 5-bin, uniform-width, ordinal
# discretizer, handed to the BAMT Preprocessor as named steps.
discretizer = pp.KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
encoder = pp.LabelEncoder()
pipeline_steps = [("encoder", encoder), ("discretizer", discretizer)]
p = Preprocessor(pipeline_steps)

# Run the pipeline on `data`; the discretized frame is what structure
# learning consumes, `est` is the second value returned by `apply`.
discretized_data, est = p.apply(data)

# Node-type / sign description collected by the preprocessor.
info = p.info

In [64]:
info

{'types': {'age': 'disc_num',
  'lnwage': 'cont',
  'male': 'disc',
  'educ': 'disc'},
 'signs': {'lnwage': 'pos'}}

In [65]:
# Initialize the hybrid network object.
bn = HybridBN(use_mixture=True, has_logit=True)

# Register the nodes described by the preprocessor's `info`.
bn.add_nodes(info)

# Learn the structure from the discretized data, once. The cell used to call
# add_edges twice with identical arguments, which only repeated the search.
# No scoring_function is passed, so the library default applies; pass e.g.
# scoring_function=("MI",) to request mutual information explicitly.
bn.add_edges(discretized_data)

# Fit the node parameters on the original (non-discretized) data.
bn.fit_parameters(data)

  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [71]:
# bn.plot('rlms_bn.html')

In [67]:
bn.validate(info)

True

In [68]:
bn.save("pop.json")

True

In [69]:
sampled_data = bn.sample(10_000, progress_bar=False)
# Drop rows whose sampled age is missing. The previous filter compared against
# the literal string "nan" only; to_numeric(errors="coerce") turns both that
# string and real NaN values into NaN, so either representation is removed.
# .copy() makes the result an independent frame that later cells can modify.
sampled_data = sampled_data[
    pd.to_numeric(sampled_data.age, errors="coerce").notna()
].copy()

In [15]:
# Target dtype per sampled column. Only columns actually present are cast, so
# the cell also works when the network was fitted on a subset of variables
# (attribute access such as `sampled_data.children` raised AttributeError then).
# astype returns a new frame instead of assigning columns through attributes.
_sampled_dtypes = {
    "children": int,
    "public": int,
    "male": int,
    "age": int,
    "industry": str,
}
sampled_data = sampled_data.astype(
    {
        column: dtype
        for column, dtype in _sampled_dtypes.items()
        if column in sampled_data.columns
    }
)

In [16]:
sampled_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9766 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   industry  9766 non-null   object 
 1   public    9766 non-null   int64  
 2   male      9766 non-null   int64  
 3   lnwage    9766 non-null   float64
 4   children  9766 non-null   int64  
 5   age       9766 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 534.1+ KB


In [17]:
data.age.info()

<class 'pandas.core.series.Series'>
RangeIndex: 3871 entries, 0 to 3870
Series name: age
Non-Null Count  Dtype
--------------  -----
3871 non-null   int64
dtypes: int64(1)
memory usage: 30.4 KB


In [18]:
# Overlay the age distribution of the sampled population on the observed one.
fig = go.Figure()
fig.update_layout(title="Age")

for trace_name, ages in (("sampled", sampled_data.age), ("original", data.age)):
    fig.add_trace(
        go.Histogram(x=sorted(ages), histnorm="probability density", name=trace_name)
    )

# Last expression of the cell: renders the figure.
fig

In [19]:
# Overlay the sampled distribution of `children` on the observed one.
# Requires the network to have been fitted with a `children` node.
fig = go.Figure()  # was constructed twice; once is enough
fig.update_layout(title="children")
fig.add_trace(
    go.Histogram(
        x=sampled_data.children, histnorm="probability density", name="sampled"
    )
)
# Observed values are read from `rlms` directly: the `data` frame built for
# the network is a column subset that does not necessarily keep `children`.
fig.add_trace(
    go.Histogram(x=rlms.children, histnorm="probability density", name="original")
)

In [None]:
#  git rev-parse HEAD
# b50c5928543775bdca471ac9de2c5e2226efbb61 - hash of git commit

In [69]:
# Load the "superdataset-00" training table.
# NOTE(review): `sd` is rebound by a later cell that reads a different CSV and
# filters it by YEAR, so this frame is only used until that cell runs.
sd = pd.read_csv("migration/training ready/superdataset-00.csv")

In [72]:
# migration/ITMO-2/migforecasting/datasets

In [None]:
# migration/ITMO-2/migforecasting/mig whereabouts/inflow LO.csv
# migration/ITMO-2/popsize/data0.xlsx - распределение по полу и возрасту в каждом районе
# migration/ITMO-2/migforecasting/clustering/superdataset-24 alltime-clust (oktmo+name).csv - характеристики по городу и октмо
# (?) где найти распределение по возрастам и полам для городов/районов ЛО?

In [118]:
# Year of interest; only rows of the clustering dataset for this year are kept.
YEAR = 2022
sd = pd.read_csv(
    "migration/ITMO-2/migforecasting/clustering/superdataset-24 alltime-clust (oktmo+name).csv"
).loc[lambda frame: frame.year == YEAR]

In [119]:
xl = pd.ExcelFile("migration/ITMO-2/popsize/data0.xlsx")

In [120]:
# xl.sheet_names

In [122]:
# Keep rows whose popsize lies strictly between 0.55 and 0.8, then show how
# many remain.
sd = sd[sd.popsize.between(0.55, 0.8, inclusive="neither")]
len(sd)

17

In [123]:
# Mapping: OKTMO code -> popsize value of the selected rows.
popsize_by_oktmo = sd.set_index("oktmo")["popsize"].to_dict()

In [124]:
# Factor that turns a `popsize` value into a number of agents to sample
# (used as int(popsize * POP_MULTIPLIER) when sampling per OKTMO code).
# NOTE(review): presumably a popsize of 0.31 corresponds to 41,000 people -
# confirm and record where these two numbers come from.
POP_MULTIPLIER = 41_000 / 0.31

In [125]:
popsize_by_oktmo

{3605000: 0.7472045820546321,
 3630000: 0.6919191151894504,
 4723000: 0.5508492601257938,
 18720000: 0.6409407189085716,
 22505000: 0.5617118896417611,
 25738000: 0.5979611679985415,
 37705000: 0.5588557017410593,
 60606000: 0.6906885235939352,
 60730000: 0.7817523016620582,
 63611000: 0.631103582388867,
 66605000: 0.5575035702348758,
 70628000: 0.5607699553340828,
 70644000: 0.5820318434565951,
 75649000: 0.5758029230348516,
 90640000: 0.7621539910668166,
 92636000: 0.60108322445383,
 92659000: 0.5603445656467443}

In [126]:
# Draw one synthetic population per OKTMO code; the sample size is the
# popsize value scaled by POP_MULTIPLIER and truncated to an integer.
population_by_oktmo = {}
for oktmo_code, popsize_value in popsize_by_oktmo.items():
    sample_size = int(popsize_value * POP_MULTIPLIER)
    population_by_oktmo[oktmo_code] = bn.sample(sample_size)

100%|██████████| 98823/98823 [00:15<00:00, 6408.73it/s]
100%|██████████| 91511/91511 [00:14<00:00, 6386.13it/s]
100%|██████████| 72854/72854 [00:11<00:00, 6254.33it/s]
100%|██████████| 84769/84769 [00:13<00:00, 6176.40it/s]
100%|██████████| 74290/74290 [00:11<00:00, 6344.87it/s]
100%|██████████| 79085/79085 [00:12<00:00, 6548.01it/s]
100%|██████████| 73913/73913 [00:11<00:00, 6360.18it/s]
100%|██████████| 91349/91349 [00:15<00:00, 6053.65it/s]
100%|██████████| 103393/103393 [00:16<00:00, 6413.13it/s]
100%|██████████| 83468/83468 [00:12<00:00, 6497.30it/s]
100%|██████████| 73734/73734 [00:11<00:00, 6472.73it/s]
100%|██████████| 74166/74166 [00:11<00:00, 6516.99it/s]
100%|██████████| 76978/76978 [00:12<00:00, 6323.11it/s]
100%|██████████| 76154/76154 [00:11<00:00, 6463.91it/s]
100%|██████████| 100801/100801 [00:15<00:00, 6497.38it/s]
100%|██████████| 79498/79498 [00:12<00:00, 6268.40it/s]
100%|██████████| 74110/74110 [00:11<00:00, 6438.20it/s]


In [128]:
population_by_oktmo[3605000]

Unnamed: 0,industry,public,male,lnwage,children,age
0,ЭНЕРГЕТИЧЕСК,0,0,8.841369,2,40
1,СТРОИТЕЛЬСТВ,0,0,9.977330,1,39
2,"ЛЕГКАЯ, ПИЩЕ",1,0,8.057901,2,22
3,"ТОРГОВЛЯ, БЫ",0,1,11.006228,3,54
4,"ТОРГОВЛЯ, БЫ",0,0,9.148642,0,39
...,...,...,...,...,...,...
98818,ДЕРЕВООБРАБА,0,1,10.422251,5,
98819,ОРГАНЫ УПРАВ,1,0,10.348275,1,33
98820,ОБРАЗОВАНИЕ,1,0,9.514918,1,45
98821,"ТОРГОВЛЯ, БЫ",0,1,9.774108,0,21


In [34]:
# sd

In [None]:
# https://drive.google.com/drive/folders/1WxMMqMdb7ZQ0EKelXuF9c7OgFnPBEZ7J - характеристики МО по ОКТМО

### Росстат. Характеристики МО

In [7]:
age_sex_distr = pd.read_excel("migration/ITMO-2/popsize/data0.xlsx", sheet_name=2)

In [20]:
# lo_county_counts = 18
# for i in range(lo_county_counts):
#     print(pd.read_excel("migration/ITMO-2/popsize/data0.xlsx", sheet_name=i).shape)

In [21]:
# Population size per district, keyed by OKTMO code. The original note labels
# these figures "2023", while the rows selected below are those of year 2022.
pop_size_by_lo_county = {
    41603000: 50977,  # Boksitogorsky municipal district
    41606000: 50211,  # Volosovsky municipal district
    41609000: 79417,  # Volkhovsky municipal district
    41612000: 554288,  # Vsevolozhsky municipal district
    41615000: 195374,  # Vyborgsky municipal district
}

# OKTMO -> district name lookup for the districts listed above.
allmun = pd.read_csv("rosstat_allmun/allmuns/livarea (allmun).csv")
is_selected = allmun.oktmo.isin(pop_size_by_lo_county) & (allmun.year == 2022)
allmun = allmun.loc[is_selected, ["oktmo", "name"]].set_index("oktmo")
allmun

Unnamed: 0_level_0,name
oktmo,Unnamed: 1_level_1
41603000,Бокситогорский муниципальный район
41606000,Волосовский муниципальный район
41609000,Волховский муниципальный район
41612000,Всеволожский муниципальный район
41615000,Выборгский муниципальный район


In [22]:
mo_features = allmun

In [37]:
# Start from the OKTMO -> name table and append one column per indicator file.
mo_features = allmun
# Indicator files to join. Currently disabled: "schoolnum" (the required
# districts are missing there), "museums", "theatres", "musartschool",
# "hospitalcap", "pollutionvol".
files = [
    "shoparea",
    "foodseats",
    "roadslen",
    "agrprod",
    "beforeschool",
    "popsize",
    "retailturnover",
    "livarea",
]
for feature in files:
    feature_rows = pd.read_csv(f"rosstat_allmun/allmuns/{feature} (allmun).csv")
    keep = feature_rows.oktmo.isin(pop_size_by_lo_county) & (feature_rows.year == 2022)
    feature_rows = feature_rows[keep]
    if feature_rows.shape[0] == 0:
        # Reported only; the join below still runs for this file.
        print(f"error in {feature}")
    # Only the last column of each file is joined, aligned on the OKTMO index.
    last_column = feature_rows.set_index("oktmo").iloc[:, -1:]
    mo_features = mo_features.join(last_column)

In [38]:
mo_features

Unnamed: 0_level_0,name,shoparea,foodseats,roadslen,agrprod,beforeschool,popsize,retailturnover,livarea
oktmo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
41603000,Бокситогорский муниципальный район,39624.7,717.0,329.4,1201767.0,2150.0,47236.0,5122194.2,30.42
41606000,Волосовский муниципальный район,40372.9,1154.0,100.1,7403017.0,2213.0,51600.0,5324450.3,33.38
41609000,Волховский муниципальный район,74669.0,2960.0,156.5,4515881.0,4871.0,85927.0,26474961.3,29.63
41612000,Всеволожский муниципальный район,459000.7,12238.0,29.7,8094861.0,23293.0,506289.0,152482868.7,32.63
41615000,Выборгский муниципальный район,134401.6,7784.0,22.4,14344493.0,7447.0,193863.0,33774007.6,28.98
