In [1]:
from pathlib import Path
import sys, sqlite3, pandas as pd
from IPython.display import display, Markdown

# Current working directory of the Jupyter kernel
cwd = Path.cwd()

# If CWD = Project/notebooks, then the project root is its parent
project_root = str(cwd.parent)

# Add the project root to sys.path (at the front, so it takes priority).
# This has to happen before the project-local imports below (config, processing_cpp, label_functions).
if project_root not in sys.path:
    sys.path.insert(0, project_root)

pd.set_option("display.width", 2000)     # maximum display width in characters
pd.set_option("display.max_colwidth", None)

from config import DB_PATH, PARQUETS_DIR
from processing_cpp import normalize_includes
from label_functions import LFS

In [2]:
# Load every active recommendation together with the error code it belongs to,
# ordered by error code and then by recommendation_id.
#
# A sqlite3 connection used as a context manager (`with conn:`) only commits or
# rolls back the transaction; it does NOT close the connection. The connection
# is therefore closed explicitly so that re-running this cell does not leak
# open database handles.
conn = sqlite3.connect(DB_PATH)
try:
    recommendations = pd.read_sql_query("""
                                        SELECT ec.error_code, r.recommendation_id, r.recommendation, r.recommendation_code
                                        FROM error_codes ec
                                        JOIN recommendations r ON r.error_code_id = ec.error_code_id
                                        WHERE r.is_active = 1
                                        ORDER BY ec.error_code, r.recommendation_id
                                        """, conn)
finally:
    conn.close()

In [3]:
# One "default" recommendation per error code: sort by (error_code, recommendation_id)
# and take the first entry of every error_code group.
# NOTE(review): GroupBy.first() takes the first NON-NULL value of each column
# independently, so if the lowest-id recommendation has a NULL in some column the
# resulting row can mix values from different recommendations — confirm this is intended.
default_recommendations = (
    recommendations
    .sort_values(['error_code', 'recommendation_id'])
    .groupby('error_code', as_index=False)
    .first()
)

In [4]:
# Lookup table: error code -> recommendation code of its default recommendation.
code_to_label_code = {
    error_code: label_code
    for error_code, label_code in zip(
        default_recommendations['error_code'],
        default_recommendations['recommendation_code'],
    )
}

In [5]:
# Load the dataset to be (re)labelled from the project's parquet directory.
df = pd.read_parquet(PARQUETS_DIR / "data_marked_after_EOF.parquet")

In [6]:
# Initialise label_code with the default recommendation code of each row's error code.
# Error codes that have no entry in code_to_label_code end up as NaN.
df['label_code'] = df['error_code'].map(code_to_label_code)

In [7]:
def add_line_numbers(code : str) -> str:
    """Prefix every line of ``code`` with its 1-based line number.

    Numbers are right-aligned to the width of the largest line number and
    separated from the line text by `` | ``. Lines are split on ``'\\n'`` only.

    Args:
        code: Source text to number. Any non-string value is returned unchanged.

    Returns:
        The numbered text, or ``code`` itself when it is not a string.
    """
    if not isinstance(code, str):
        return code

    # str.split always yields at least one element, so there is always
    # something to number (an empty string becomes a single empty line).
    source_lines = code.split('\n')
    pad = len(str(len(source_lines)))

    return "\n".join(
        f"{str(number).rjust(pad)} | {text}"
        for number, text in enumerate(source_lines, start=1)
    )

def show_table(df : pd.DataFrame, mask : pd.Series | None = None, rows = None):
    """Render selected rows of ``df`` as a left-aligned, HTML-escaped Styler.

    Prints the number of rows remaining after ``mask`` is applied (before
    ``rows`` is applied). The ``source_code`` column is passed through
    ``normalize_includes``, has literal backslash-n sequences replaced by real
    line breaks, and is line-numbered with ``add_line_numbers``.

    Args:
        df: Frame containing the columns ``label_code``, ``error_text``,
            ``is_in_train``, ``error_line`` and ``source_code``.
        mask: Optional boolean mask applied as ``df[mask]``; ``None`` keeps every row.
        rows: Optional row selection applied after ``mask``. A ``slice`` is
            positional (``iloc``); anything else is used as index labels (``loc``).

    Returns:
        A pandas Styler over the five displayed columns.
    """
    subset = df if mask is None else df[mask]
    print(len(subset))

    cols = ['label_code', 'error_text', 'is_in_train', 'error_line', 'source_code']

    # Select the requested rows first, so the per-row source transformations
    # below only run on rows that will actually be displayed.
    if rows is not None:
        if isinstance(rows, slice):
            subset = subset.iloc[rows]
        else:
            subset = subset.loc[rows]

    # Work on a private copy so the caller's frame is never modified.
    subset = subset.copy()
    subset['label_code'] = subset['label_code'].astype(str)
    subset['is_in_train'] = subset['is_in_train'].astype(str)
    subset['source_code'] = subset['source_code'].apply(normalize_includes)
    # regex=False: replace the literal two-character sequence backslash + 'n'.
    # With regex matching (the default on pandas < 2.0) the pattern would match
    # a real newline instead and the replacement would be a no-op.
    subset['source_code'] = subset['source_code'].str.replace('\\n', '\n', regex=False)
    subset['source_code'] = subset['source_code'].apply(add_line_numbers)

    styler = subset[cols].style.format(escape="html")

    styler = styler.set_table_styles([
        {
            'selector': 'th, td',
            'props': [('text-align', 'left')],
        },
        {
            'selector': 'td',
            'props': [
                ('max-width', 'none'),
                ('overflow', 'visible'),
            ],
        },
    ])
    # Source code keeps its whitespace and is shown in a monospace font.
    styler = styler.set_properties(
        subset=['source_code'],
        **{
            'white-space': 'pre',
            'font-family': 'monospace',
        }
    )
    styler = styler.set_properties(
        subset=['label_code', 'error_text', 'is_in_train', 'error_line'],
        **{
            'font-family': 'sans-serif',
        }
    )
    return styler

def show_row(df : pd.DataFrame, i : int) -> None:
    """Display the row with index label ``i`` as a Markdown card.

    The card shows ``label_code`` (empty when the row has no such field),
    ``is_in_train``, ``error_text`` and ``error_line``, followed by the
    line-numbered source code in a ``cpp`` fenced block.

    Args:
        df: Frame to read the row from (looked up with ``df.loc[i]``).
        i: Index label of the row to display.
    """
    record = df.loc[i]

    # Normalise includes, then turn literal backslash-n sequences into real line breaks.
    normalized = normalize_includes(record['source_code'])
    numbered_source = add_line_numbers(normalized.replace('\\n', '\n')) # type: ignore

    markdown_lines = [
        f"### Row {i}",
        f"**label_code**: `{record.get('label_code', '')}`  ",
        "",
        f"**is_in_train**: `{record['is_in_train']}`  ",
        "",
        "**error_text**:",
        f"`{record['error_text']}`",
        "",
        "**error_line**:",
        f"`{record['error_line']}`",
        "",
        "```cpp",
        f"{numbered_source}",
        "```",
        "",  # keeps the trailing newline after the closing fence
    ]
    display(Markdown("\n".join(markdown_lines)))

In [8]:
def apply_labeling_functions(df : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``label_code`` recomputed by the labeling functions in ``LFS``.

    Every labeling function is called with the row; a ``None`` result counts
    as an abstention. Per row:

    * no votes: the existing ``label_code`` is kept;
    * all votes equal: that label is used;
    * conflicting votes: the vote of the last labeling function in ``LFS``
      that did not abstain wins.

    Args:
        df: Frame with a ``label_code`` column plus whatever the labeling functions read.

    Returns:
        A new frame; the input frame is not modified.
    """
    def resolve_label(row : pd.Series):
        votes = [vote for vote in (lf(row) for lf in LFS) if vote is not None]

        # Every labeling function abstained: keep the current label.
        if not votes:
            return row['label_code']

        # Unanimous -> first vote; on conflict the last voter takes precedence.
        unanimous = len(set(votes)) == 1
        return votes[0] if unanimous else votes[-1]

    labelled = df.copy()
    labelled['label_code'] = labelled.apply(resolve_label, axis=1)
    return labelled

In [9]:
# df_slice = df[(df["error_code"] == "C2679") & df["is_in_train"] == 1].iloc[:100]

In [11]:
# Manually curated index labels of rows to exclude from training
# (their is_in_train is set to 0 further below), grouped by compiler error code.
ids_not_in_train = [
    # C2065
    # unclear main
    430, 431, 434, 437, 442, 443, 444, 459, 460, 464,
    64928, 
    # C2187
    # include the library
    2813,
    # C2143
    97264,
    # C2146
    15971, 77142, 105064, 105065, 121104, 131125,
    # C2059
    158055, 239448,
    # C2440
    368, 369, 370, 128153, 177319, 280170,
    # C2446
    1560, 2004, 7512, 9383, 9767, 15144, 
    # C2676
    116, 18965,
    # C2678
    192940, 
]

# Manually curated index labels of rows whose label_code is forced to
# "FIX_NAME_SPELLING" further below.
ids_FIX_NAME_SPELLING = [
    10, 23, 54, 84, 85, 121, 166, 220, 257, 258, 347, 350, 393, 400, 412, 413, 414, 415, 419, 420, 424, 426, 458, 468, 497, 498, 507, 508
]

In [12]:
# Run the labeling functions over the whole dataset. apply_labeling_functions
# works on a copy, so df itself is left untouched.
new_df = apply_labeling_functions(df)

In [13]:
# Manual overrides applied on top of the labeling functions. .loc selects by
# index label, so the id lists must contain index labels of new_df.
new_df.loc[ids_not_in_train, "is_in_train"] = 0
new_df.loc[ids_FIX_NAME_SPELLING, "label_code"] = "FIX_NAME_SPELLING"

In [14]:
# NOTE: this writes to the same parquet file that df was loaded from, i.e. the
# input file is overwritten with the relabelled data.
new_df.to_parquet(PARQUETS_DIR / "data_marked_after_EOF.parquet")

In [None]:
# mask1 = (df["error_code"] & (df["is_in_train"] == 1))
# # mask2 = (df["error_code"] == "C1083") & ~df["error_text"].str.contains("pch.h")
# show_table(df, mask1, slice(0, 100))

In [None]:
# msk1 = (new_df["error_code"] == "C2679") & (new_df["label_code"] == "FIX_OPERATOR") & (new_df["is_in_train"] == 1)
# # msk2 = (df["is_in_train"] == 0)
# show_table(df, msk1, slice(4, 7069, 100))