In [2]:
from pathlib import Path
import sys, sqlite3, pandas as pd
from IPython.display import display, Markdown

# Current working directory of the Jupyter kernel
cwd = Path.cwd()

# If CWD = Project/notebooks, then the project root is its parent
project_root = cwd.parent

# Add the project root to sys.path (at the front, so that it takes priority)
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

pd.set_option("display.width", 2000)     # maximum width in characters
pd.set_option("display.max_colwidth", None)

from config import DB_PATH, PARQUETS_DIR
from processing_cpp import normalize_includes
from lf_functions import LFS

In [3]:
# Load every active recommendation together with the error code it belongs to.
# The connection is closed explicitly: using a sqlite3 connection as a context
# manager only commits/rolls back the transaction, it does not close the handle.
conn = sqlite3.connect(DB_PATH)
try:
    recommendations = pd.read_sql_query("""
                                        SELECT ec.error_code, r.recommendation_id, r.recommendation, r.recommendation_code
                                        FROM error_codes ec
                                        JOIN recommendations r ON r.error_code_id = ec.error_code_id
                                        WHERE r.is_active = 1
                                        ORDER BY ec.error_code, r.recommendation_id
                                        """, conn)
finally:
    conn.close()

In [4]:
# One "default" recommendation per error code: order by (error_code,
# recommendation_id) and keep the first entry of every error_code group.
# NOTE(review): GroupBy.first() takes the first non-null value of each column,
# so a group's row may mix values from different recommendations if some
# fields are NULL -- confirm this is intended.
default_recommendations = recommendations.sort_values(
    by=['error_code', 'recommendation_id']
).groupby(by='error_code', as_index=False).first()

In [5]:
# Lookup table: error code -> recommendation_code of its default recommendation.
code_to_label_code = {
    error_code: label_code
    for error_code, label_code in zip(
        default_recommendations['error_code'],
        default_recommendations['recommendation_code'],
    )
}

In [6]:
# Load the samples from the parquet file stored under PARQUETS_DIR.
df = pd.read_parquet(PARQUETS_DIR / "data_filtered_error_codes.parquet")

In [7]:
# Seed every sample with the default label code for its error code
# (Series.map leaves NaN where the error code has no entry in the mapping).
df['label_code'] = df['error_code'].map(code_to_label_code)

In [8]:
def add_line_numbers(code : str) -> str:
    """Prefix every line of ``code`` with a right-aligned, 1-based line number.

    Lines are separated on ``'\\n'`` only. Each output line has the form
    ``"<number> | <text>"``, where the number is padded to the width of the
    largest line number.

    Args:
        code: Source text to number.

    Returns:
        The numbered text. A non-string argument (e.g. ``None`` or NaN) is
        returned unchanged so the function can be applied to columns that
        contain missing values.
    """
    if not isinstance(code, str):
        return code

    # str.split always yields at least one element (even for ""), so there is
    # no empty-list case to guard against.
    lines = code.split('\n')
    width = len(str(len(lines)))
    return "\n".join(
        f"{number:>{width}d} | {line}"
        for number, line in enumerate(lines, start=1)
    )

def show_table(df : pd.DataFrame, mask : pd.Series | None = None, rows = None):
    """Build a styled table of selected rows with line-numbered source code.

    Prints the number of rows that match ``mask`` (before ``rows`` is applied).

    Args:
        df: Frame that contains the columns ``label_code``, ``error_text``,
            ``is_in_train``, ``error_line`` and ``source_code``.
        mask: Optional boolean mask applied to ``df``; ``None`` keeps all rows.
        rows: Optional second selection applied to the masked rows. A ``slice``
            is positional (``iloc``); any other value is passed to ``loc``.

    Returns:
        A pandas ``Styler`` over the five displayed columns, with HTML-escaped
        values, left-aligned cells and ``source_code`` rendered as
        preformatted monospace text.
    """
    cols = ['label_code', 'error_text', 'is_in_train', 'error_line', 'source_code']
    meta_cols = [col for col in cols if col != 'source_code']

    subset = df if mask is None else df[mask]
    print(len(subset))

    # Narrow down to the requested rows and columns first, so the per-row
    # source transformations below only run on what is actually displayed.
    if rows is not None:
        subset = subset.iloc[rows] if isinstance(rows, slice) else subset.loc[rows]
    subset = subset[cols].copy()

    subset['label_code'] = subset['label_code'].astype(str)
    subset['is_in_train'] = subset['is_in_train'].astype(str)
    # regex=False: replace the literal two-character sequence backslash + "n",
    # independent of the pandas version's default for `regex` (same semantics
    # as the plain str.replace used in show_row).
    # NOTE(review): this also rewrites "\n" escapes inside C/C++ string
    # literals, which shifts displayed line numbers relative to error_line --
    # confirm how source_code is encoded.
    subset['source_code'] = (
        subset['source_code']
        .apply(normalize_includes)
        .str.replace('\\n', '\n', regex=False)
        .apply(add_line_numbers)
    )

    styler = subset.style.format(escape="html")

    styler = styler.set_table_styles([
        {
            'selector': 'th, td',
            'props': [('text-align', 'left')],
        },
        {
            'selector': 'td',
            'props': [
                ('max-width', 'none'),
                ('overflow', 'visible'),
            ],
        },
    ])
    # Keep the code's own line breaks and alignment.
    styler = styler.set_properties(
        subset=['source_code'],
        **{
            'white-space': 'pre',
            'font-family': 'monospace',
        }
    )
    styler = styler.set_properties(
        subset=meta_cols,
        **{
            'font-family': 'sans-serif',
        }
    )
    return styler

def show_row(df : pd.DataFrame, i : int) -> None:
    """Display the row labelled ``i`` of ``df`` as a Markdown card.

    The card shows the label code, train flag, error text and error line,
    followed by the normalized, line-numbered source in a ``cpp`` code fence.
    """
    row = df.loc[i]
    normalized = normalize_includes(row['source_code'])
    src = add_line_numbers(normalized.replace('\\n', '\n')) # type: ignore

    parts = [
        f"### Row {i}\n",
        f"**label_code**: `{row.get('label_code', '')}`  \n\n",
        f"**is_in_train**: `{row['is_in_train']}`  \n\n",
        f"**error_text**:\n",
        f"`{row['error_text']}`\n\n",
        f"**error_line**:\n",
        f"`{row['error_line']}`\n\n",
        f"```cpp\n",
        f"{src}\n",
        f"```\n",
    ]
    display(Markdown("".join(parts)))

In [9]:
def apply_labeling_functions(df : pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``label_code`` is refined by the functions in ``LFS``.

    Every labeling function is called with the row; a ``None`` result means
    "no opinion". Resolution per row:
      * no opinions        -> the row keeps its current ``label_code``;
      * all opinions agree -> that label is used;
      * opinions conflict  -> the label of the last function in ``LFS`` that
        voted is used.
    """
    def resolve_label(record : pd.Series):
        votes = [
            vote
            for vote in (labeling_fn(record) for labeling_fn in LFS)
            if vote is not None
        ]

        if len(votes) == 0:
            return record['label_code']

        # Unanimous vote -> first label; conflict -> the last voter wins.
        unanimous = len(set(votes)) == 1
        return votes[0] if unanimous else votes[-1]

    labeled = df.copy()
    labeled['label_code'] = labeled.apply(resolve_label, axis=1)
    return labeled

In [10]:
# Rows 300..399 (positional) of the C2146 training samples.
# Each comparison is parenthesised because `&` binds tighter than `==`:
# without the parentheses the expression is evaluated as (A & B) == 1.
df_slice = df[(df["error_code"] == "C2146") & (df["is_in_train"] == 1)].iloc[300:400]

In [11]:
# Re-label a copy of the full frame with the labeling functions; `df` itself is not modified.
new_df = apply_labeling_functions(df)

In [12]:
# show_row(df, 137930)

In [None]:
pattern = r"(?m)^(?!\s*#include\s*<).*?\b(?:[A-Za-z_]\w*(?:::\w+)*)\s*<\s*[A-Za-z_:\w,\s]+>"
# mask1 = df["error_code"] == "C2146"
# Each comparison is parenthesised because `&` binds tighter than `==`:
# without the parentheses the expression is evaluated as (A & B) == 1.
mask2 = (new_df["error_code"] == "C2146") & (new_df["is_in_train"] == 1)
# Show every 100th matching row (positionally) among the first 20000 matches.
show_table(new_df, mask2, slice(0, 20000, 100))

In [16]:
# C2146 training samples labelled INCORRECT_CONSTRUCTION_OR_SYMBOLS.
# Each comparison is parenthesised because `&` binds tighter than `==`:
# without the parentheses the expression is evaluated as (A & B & C) == 1.
msk1 = (
    (new_df["error_code"] == "C2146")
    & (new_df["label_code"] == "INCORRECT_CONSTRUCTION_OR_SYMBOLS")
    & (new_df["is_in_train"] == 1)
)
show_table(new_df, msk1, slice(2, 60))

472


Unnamed: 0,label_code,error_text,is_in_train,error_line,source_code
472,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""j""",1,5,"1 | #include <stdio.h>  2 | 3 | int main(void) {  4 | for (int i = 0; i <= 9; i++) {  5 | for (int j = 0 j <= 9; j++) {  6 | printf(""%dx%d=%d  7 | "", i, j, i*j);  8 | }  9 | } 10 | return 0; 11 | }"
662,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""i""",1,16,1 | #include <iostream>  2 | 3 | int main() {  4 | int heights[16] = { 0 };  5 | int temp_result = 0;  6 | int index = 12;  7 | 8 | for(int i = 0; i < 10; i++) {  9 | std::cin >> heights[i]; 10 | } 11 | 12 | for(int j = 0; j < 3; j++) { 13 | heights[index] = 0; 14 | index = 0; 15 | temp_result = 0; 16 | for(int i = 0; i < 10 i++) { 17 | if(temp_result < heights[i]) { 18 | temp_result = heights[i]; 19 | index = i; 20 | } 21 | } 22 | std::cout << temp_result << std::endl; 23 | } 24 | 25 | 26 | return 0; 27 | }
1174,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""a""",1,15,"1 | #include <iostream>  2 | using namespace std;  3 | int main(){  4 | int a,b,c;  5 | while ( cin >> a >> b){  6 | if(cin.eof()){break;}  7 | c = a+b;  8 | if (c/1000000>=1)  9 | cout << ""7"" << endl; 10 | else if (c/100000>=1) 11 | cout << ""6"" << endl; 12 | else if (c/10000>=1) 13 | cout << ""5"" << endl; 14 | else if (c/1000>=1) 15 | cout << ""4"" a<< endl; 16 | else if (c/100>=1) 17 | cout << ""3"" << endl; 18 | else if (c/10>=1) 19 | cout << ""2"" << endl; 20 | else 21 | cout << ""1"" << endl; 22 | } 23 | }"
1392,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""ss""",1,10,"1 | #include<string>  2 | #include<algorithm>  3 | #include<iostream>  4 | #include<cstdio>  5 | using namespace std;  6 | int main()  7 | {  8 | int a,b;  9 | cin >> a>>b; 10 | string stream ss; 11 | ss << number; 12 | string x=ss.str() 13 | cout<<x.size()<<endl; 14 | return 0; 15 | } 16 | string IntToString(int number) 17 | { 18 | string stream ss; 19 | ss << number; 20 | return ss.str(); 21 | }"
1393,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""ss""",1,10,"1 | #include<string>  2 | #include<algorithm>  3 | #include<iostream>  4 | #include<cstdio>  5 | using namespace std;  6 | int main()  7 | {  8 | int a,b;  9 | cin >> a>>b; 10 | string stream ss; 11 | ss << number; 12 | string x=ss.str(); 13 | cout<<x.size()<<endl; 14 | return 0; 15 | } 16 | string IntToString(int number) 17 | { 18 | string stream ss; 19 | ss << number; 20 | return ss.str(); 21 | }"
1394,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""ss""",1,10,"1 | #include<string>  2 | #include<algorithm>  3 | #include<iostream>  4 | #include<cstdio>  5 | using namespace std;  6 | int main()  7 | {  8 | int a,b;  9 | cin >> a>>b; 10 | string stream ss; 11 | ss << number; 12 | string x=ss.str(); 13 | cout<<x.size()<<endl; 14 | return 0; 15 | }"
1395,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""ss""",1,10,"1 | #include<string>  2 | #include<algorithm>  3 | #include<iostream>  4 | #include<cstdio>  5 | using namespace std;  6 | int main()  7 | {  8 | int a,b;  9 | cin >> a>>b; 10 | string stream ss; 11 | ss << a+b; 12 | string x=ss.str(); 13 | cout<<x.size()<<endl; 14 | return 0; 15 | }"
1396,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""ss""",1,10,"1 | #include<string>  2 | #include<algorithm>  3 | #include<iostream>  4 | #include<cstdio>  5 | using namespace std;  6 | int main()  7 | {  8 | int a,b;  9 | cin >> a>>b; 10 | string stream ss; 11 | string ss; 12 | ss << a+b; 13 | string x=ss.str(); 14 | cout<<x.size()<<endl; 15 | return 0; 16 | }"
1536,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "")"" перед идентификатором ""num""",1,5,"1 | #include <iostream>  2 | 3 | using namespace std;  4 | 5 | int counts(char[] num) {  6 | int count = 0;  7 | for (int i = 0; i < num.length; i++) {  8 | if (num[i] != ""0"") count++;  9 | } 10 | return count; 11 | } 12 | 13 | int main() { 14 | for (int a, b; cin >> a >> b && a && b) { 15 | char[] as = (char[])a; 16 | char[] bs = (char[])b; 17 | cout << counts(as)+counts(bs) << endl; 18 | } 19 | }"
5040,INCORRECT_CONSTRUCTION_OR_SYMBOLS,"C2146: синтаксическая ошибка: отсутствие "";"" перед идентификатором ""max_""",1,6,"1 | #include <iostream>  2 | using namespace std;  3 | 4 | int main(){  5 | int n, a;  6 | long long ing max_, tmp;  7 | while(cin >> n){  8 | max_=0; tmp=0; tmp=0;  9 | for(int i=0; i<n; i++){ 10 | cin >> a; 11 | tmp = max(0,tmp+a); 12 | max_ = max(tmp,max_); 13 | } 14 | cout << max_ << endl; 15 | } 16 | return 0; 17 | }"
