# Проанализируем данные

In [1]:
import pandas as pd
import polars as pl


def read(n: str, **kwargs):
    return pl.read_csv(f"../data/{n}", **kwargs).to_pandas()


public = read("public/data.csv")
private = read("private/data.csv")

In [2]:
def analayze(data: pd.DataFrame):
    missing_values_count = data.isnull().sum()
    missing_values_percent = round(
        (data.isnull().sum() / len(data)) * 100
    ).astype(int)
    unique_values_count = data.nunique()
    unique_values_percent = round((data.nunique() / len(data)) * 100).astype(
        int
    )

    return pd.DataFrame(
        {
            "Missing Values Count": missing_values_count,
            "Missing Values Percent (%)": missing_values_percent,
            "Unique Values Count": unique_values_count,
            "Unique Values Percent (%)": unique_values_percent,
        }
    )

In [3]:
analayze(public)

Unnamed: 0,Missing Values Count,Missing Values Percent (%),Unique Values Count,Unique Values Percent (%)
blend_id,0,0,341,25
smiles,0,0,53,4
oil_property_param_value,12,1,322,23


In [6]:
analayze(private)

Unnamed: 0,Missing Values Count,Missing Values Percent (%),Unique Values Count,Unique Values Percent (%)
oil_type,588,1,4,0
blend_id,0,0,343,1
oil_property_param_title,0,0,1,0
oil_property_param_value,411,1,324,1
component_name,0,0,109,0
component_class,16775,38,13,0
polymer,16775,38,2,0
component_property_param_title,0,0,43,0
component_property_param_value,28402,64,1015,2
smiles,16775,38,51,0


In [53]:
private_oilppv = private.loc[private["oil_property_param_title"] == "ad7e6027-00b8-4c27-918c-d1561f949ad8"].drop_duplicates().reset_index(drop=True)
display(private_oilppv.shape)
private_oilppv["blend_id"].nunique(), private_oilppv["oil_property_param_value"].nunique(), private_oilppv["smiles"].nunique()

(343, 10)

(343, 324, 11)

In [59]:
public.shape, public["blend_id"].nunique(), public[
    "oil_property_param_value"
].nunique(), public["smiles"].nunique()

((1387, 3), 341, 322, 53)

In [66]:
test = private_oilppv.merge(public, on="blend_id").sort_index(axis=1).drop_duplicates()
display(test.shape)
test

(1387, 12)

Unnamed: 0,blend_id,component_class,component_name,component_property_param_title,component_property_param_value,oil_property_param_title,oil_property_param_value_x,oil_property_param_value_y,oil_type,polymer,smiles_x,smiles_y
0,49743a76-a614-11ee-9529-005056921581,базовое масло 1 гр,615537f6-1f8f-4240-a5e9-8f7be344ecd3,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,ad7e6027-00b8-4c27-918c-d1561f949ad8,103300.0,103300.0,3fa07e0a-415c-496d-b88b-557855cb3e77,no,CCCCC,CCCCC
1,49743a76-a614-11ee-9529-005056921581,базовое масло 1 гр,615537f6-1f8f-4240-a5e9-8f7be344ecd3,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,ad7e6027-00b8-4c27-918c-d1561f949ad8,103300.0,103300.0,3fa07e0a-415c-496d-b88b-557855cb3e77,no,CCCCC,CCCC(C)CCC
2,49743a76-a614-11ee-9529-005056921581,базовое масло 1 гр,615537f6-1f8f-4240-a5e9-8f7be344ecd3,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,ad7e6027-00b8-4c27-918c-d1561f949ad8,103300.0,103300.0,3fa07e0a-415c-496d-b88b-557855cb3e77,no,CCCCC,CCC(C(OC)=O)CC
3,49743a76-a614-11ee-9529-005056921581,базовое масло 1 гр,615537f6-1f8f-4240-a5e9-8f7be344ecd3,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,ad7e6027-00b8-4c27-918c-d1561f949ad8,103300.0,103300.0,3fa07e0a-415c-496d-b88b-557855cb3e77,no,CCCCC,CCCCC(C)C
4,49743a76-a614-11ee-9529-005056921581,базовое масло 1 гр,615537f6-1f8f-4240-a5e9-8f7be344ecd3,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,ad7e6027-00b8-4c27-918c-d1561f949ad8,103300.0,103300.0,3fa07e0a-415c-496d-b88b-557855cb3e77,no,CCCCC,CC(C)(C)CC(C)(C)C
...,...,...,...,...,...,...,...,...,...,...,...,...
1382,6babd070-4bf3-11ee-9c35-005056921581,базовое масло 5 гр,5011713b-7c70-465a-a9d2-8eff131ae671,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,,ad7e6027-00b8-4c27-918c-d1561f949ad8,189150.0,189150.0,60c1209c-8e77-467e-a01f-b6321801a260,yes,CC(C(OCCCC)=O)C(OCCCC)=O,CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC
1383,45f1e44a-9410-11ee-8abf-005056921581,базовое масло 3 гр,6314fabc-8e78-42be-87fb-948ba3bee8f9,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,ad7e6027-00b8-4c27-918c-d1561f949ad8,12510.0,12510.0,60c1209c-8e77-467e-a01f-b6321801a260,no,CCCC(C)CCC,CCCC(C)CCC
1384,45f1e44a-9410-11ee-8abf-005056921581,базовое масло 3 гр,6314fabc-8e78-42be-87fb-948ba3bee8f9,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,ad7e6027-00b8-4c27-918c-d1561f949ad8,12510.0,12510.0,60c1209c-8e77-467e-a01f-b6321801a260,no,CCCC(C)CCC,CCC(C(OC)=O)CC
1385,45f1e44a-9410-11ee-8abf-005056921581,базовое масло 3 гр,6314fabc-8e78-42be-87fb-948ba3bee8f9,02236ee6-5eec-4368-a2e4-6f2e73fb0f96,0.0,ad7e6027-00b8-4c27-918c-d1561f949ad8,12510.0,12510.0,60c1209c-8e77-467e-a01f-b6321801a260,no,CCCC(C)CCC,CCCCCCC


In [62]:
check_public = read("public/check.csv")

check_public.shape, check_public["blend_id"].nunique(), check_public["smiles"].nunique()

((536, 2), 138, 56)

In [64]:
check_private = read("private/check.csv").loc[private["oil_property_param_title"] == "ad7e6027-00b8-4c27-918c-d1561f949ad8"]

check_private.shape, check_private["blend_id"].nunique(), public[
    "oil_property_param_value"
].nunique(), check_private["smiles"].nunique()

((14889, 10), 91, 322, 47)