In [1]:
import numpy as np
import pandas as pd
import unidecode
import json
import os

In [2]:
data = pd.read_csv(r"data/total.csv", low_memory=False)

SBD = data['SBD']
diem = data.drop(columns=['SBD'])

info = pd.DataFrame([diem.median(), diem.mode().iloc[0], diem.mean(numeric_only=True).round(2)], index=('Median', 'Mode', 'Mean'))
subjects = tuple(diem.columns)
subjects_lower = tuple([unidecode.unidecode(sj).lower() for sj in subjects])

# Load file geojson chua cac tinh tren ban do VN
with open(r'data/diaphantinh.geojson', encoding='utf8') as f:
    vn_map = json.load(f)
    
# DataFrame chua ma so giao duc va ten tinh
tinh = []

# Chia cac mien:

## Lay ma so giao duc cua cac tinh o moi mien
id_bac = (8, 13, 62, 23, 7, 14, 5, 6, 11, 10, 9, 12, 15, 18, 17, 19, 24, 1, 21, 3, 22, 25, 27, 26, 16)
id_tru = (28, 29, 30, 31, 32, 33, 4, 34, 35, 37, 39, 41, 45, 47, 36, 38, 40, 63, 42)
id_nam = (43, 44, 48, 46, 52, 2, 49, 50, 51, 56, 57, 58, 54, 64, 59, 60, 61, 55, 53)

## Chia cac tinh vao cac mien
bac = []
trung = []
nam = []

for val in vn_map['features']:
    id_ = val['properties']["id"]
    name = val['properties']["ten_tinh"]
    pack = [id_, name]
    
    tinh.append(pack)
    if id_ in id_bac:
        bac.append(pack)
    elif id_ in id_tru:
        trung.append(pack)
    else:
        nam.append(pack)

tinh, bac, trung, nam = map(pd.DataFrame, [tinh, bac, trung, nam])

bac.to_pickle(r'data/map/bac.gz')
trung.to_pickle(r'data/map/trung.gz')
nam.to_pickle(r'data/map/nam.gz')
tinh.to_pickle(r'data/map/tinh.gz')

In [3]:
def diem_theo_tinh(ma_tinh, diem = diem, sbd = SBD):
    return diem.loc[SBD // 1000000 == int(ma_tinh)]

def count_cao_hon_hoac_bang(muc_diem, mon, df_diem = diem, percent = False, ma_sgd = 0):

    if not isinstance(mon, dict):
        if ma_sgd:
            try:
                df_diem[mon] # Kiem tra xem `mon` co dau, viet hoa khong
                mon = subjects_lower[subjects.index(mon)]
            except KeyError: # Trong truong hop nay, `mon` la ten mon hoc (khong dau, viet thuong). Ta chi can load file da tinh san
                pass
            
            df = pd.read_pickle(f'data/{ma_sgd}/{mon}.gz')
            
            temp = df['count'].loc[df['muc_diem'] >= muc_diem].max()
            if not temp: # There might be no row that sastified the condition. In that case we want the function to return 0
                return 0
            return temp / df['count'].max() * 100 if percent else temp

        else:
            try:
                custom_data = df_diem[mon]
            except KeyError:
                custom_data = df_diem[subjects[subjects_lower.index(mon)]]
    else:
        custom_data = sum([(df_diem[subjects[subjects_lower.index(val)]] * mon[val]) for val in mon.keys()])
    res = custom_data.loc[custom_data >= muc_diem].dropna().count()
    if percent:
        np.seterr('ignore')
        return np.nan_to_num(res/custom_data.count()) * 100
    return res

In [4]:
for id_ in tinh[0]:
    # Tao thu muc chua data cua tinh
    path = f"data/{id_}"

    try:
        os.mkdir(path)
    except FileExistsError:
        pass

    diem_tinh = diem_theo_tinh(id_, diem, SBD)

    for subject in diem_tinh.columns:

        # Create a list of tuple to store score and the number of people that have higher score than that
        temp_score_storage = [] # Only temporary cuz we need to serialize it later
        
        for score in diem_tinh[subject].unique():
            temp_score_storage.append((score, count_cao_hon_hoac_bang(muc_diem = score, df_diem=diem_tinh, mon = subject, percent=False)))
        if not temp_score_storage: # No record
            temp_score_storage.append((10, 0))

        pd.DataFrame(temp_score_storage, columns=('muc_diem', 'count')).sort_values(by = 'muc_diem').dropna().to_pickle(path = f'{path}/{subjects_lower[subjects.index(subject)]}.gz')

In [5]:
count_cao_hon_hoac_bang(10, 'ngu van', diem)

5

In [6]:
df = pd.read_pickle('data/22/toan.gz')

In [7]:
df

Unnamed: 0,muc_diem,count
46,0.6,13734
45,1.0,13733
42,1.2,13730
43,1.4,13725
39,1.6,13717
41,1.8,13702
40,2.0,13673
37,2.2,13632
33,2.4,13578
34,2.6,13511


In [16]:
import plotly.graph_objects as go 
import plotly.express as px

def choropleth_map(mon, muc_diem, percent = True, region = 'all'):

    location_str = 'trên cả nước' if region == 'all' else 'tại miền Bắc' if region == 'bac' else 'tại miền Nam' if region == 'nam' else 'tại miền Trung'
    regions = {'bac': bac, 'nam': nam, 'trung': trung, 'all': tinh}
    region = regions[region] # Du lieu dau vao la 1 str

    diem_moi_tinh = pd.concat([region, region[0].map(lambda x: count_cao_hon_hoac_bang(muc_diem = muc_diem, mon = mon, percent=percent, ma_sgd = x))], axis = 1).fillna(0)
    diem_moi_tinh.columns = ['id', 'name', 'value']
    title = f'{"Số lượng" if not percent else "Tỉ lệ"} thí sinh đạt mức điểm cao hơn hoặc bằng {muc_diem:.2f} {f"trong tổ hợp {sjs}" if isinstance(mon, dict) else f"ở môn {mon}" if mon in subjects else f"ở môn {subjects[subjects_lower.index(mon)]}"} {location_str}'
    
    fig = go.Figure(data=
                    go.Choropleth(
                        geojson=vn_map,
                        locations = diem_moi_tinh['id'],
                        featureidkey='properties.id',
                        z=diem_moi_tinh['value'],
                        text=diem_moi_tinh['name'],
                        colorscale='blugrn',
                        colorbar_title = 'Phần trăm (%)' if percent else 'Số lượng (người)',
                        ))
    
    fig.update_geos(fitbounds = 'locations', visible = False)

    if isinstance(mon, dict): # Hiển thị tổ hợp nếu `mon` là 1 dictionary
        if list(mon.values())[0] in subjects:
            sjs = ", ".join([f'{val} (hệ số {mon[val]})' for val in mon.keys()])
        else:
            sjs = ", ".join([f'{subjects[subjects_lower.index(val)]} (hệ số {mon[val]})' for val in mon.keys()])
    fig.update_layout(
        title_text=f'{"Số lượng" if not percent else "Tỉ lệ"} thí sinh đạt mức điểm cao hơn hoặc bằng {muc_diem:.2f} {f"trong tổ hợp {sjs}" if isinstance(mon, dict) else f"ở môn {mon}" if mon in subjects else f"ở môn {subjects[subjects_lower.index(mon)]}"} {location_str}',
        margin=dict(l=0,r=0,b=0,t=50),
        width = 1680,
        height = 1050
    )
    return fig



In [9]:
def get_img(subj):
    try:
        os.mkdir(f'images/{subj}')
    except FileExistsError:
        pass
    for val in np.linspace(0, 10, 1001):
        fig = choropleth_map(mon = subj, muc_diem=val, percent=True, region = 'all')
        fig.write_image(f"images/{subj}/{val:.2f}.png")

In [25]:
from plotly.subplots import make_subplots
def stacked_choro(mon, muc_diem, region = 'all'):

    percent = False

    location_str = 'trên cả nước' if region == 'all' else 'tại miền Bắc' if region == 'bac' else 'tại miền Nam' if region == 'nam' else 'tại miền Trung'
    regions = {'bac': bac, 'nam': nam, 'trung': trung, 'all': tinh}
    region = regions[region] # Du lieu dau vao la 1 str

    diem_moi_tinh = pd.concat([region, region[0].map(lambda x: count_cao_hon_hoac_bang(muc_diem = muc_diem, mon = mon, percent=percent, ma_sgd = x))], axis = 1).fillna(0)
    diem_moi_tinh.columns = ['id', 'name', 'value']
    title = f'{"Số lượng" if not percent else "Tỉ lệ"} thí sinh đạt mức điểm cao hơn hoặc bằng {muc_diem:.2f} {f"trong tổ hợp {sjs}" if isinstance(mon, dict) else f"ở môn {mon}" if mon in subjects else f"ở môn {subjects[subjects_lower.index(mon)]}"} {location_str}'
    
    fig = make_subplots(rows = 1, cols = 2, specs=[[{"type": "mapbox"}, {"type": "mapbox"}]])
    fig.add_trace(go.Choropleth(
                        geojson=vn_map,
                        locations = diem_moi_tinh['id'],
                        featureidkey='properties.id',
                        z=diem_moi_tinh['value'],
                        text=diem_moi_tinh['name'],
                        colorscale='blugrn',
                        colorbar_title = 'Phần trăm (%)' if percent else 'Số lượng (người)',
                        ),
                row = 1, col = 1)
    fig.add_trace(choropleth_map(mon = mon, muc_diem=muc_diem, percent = False),
                row = 1, col = 2)
    fig.show()

In [26]:
stacked_choro(mon = 'toan', muc_diem=8)

ValueError: Trace type 'choropleth' is not compatible with subplot type 'mapbox'
at grid position (1, 1)

See the docstring for the specs argument to plotly.subplots.make_subplots
for more information on subplot types