In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
import pickle

In [6]:
df = pd.read_csv('/home/tak/pred-sentencing/resources/data/tagging/drugs/gt/sentence_tagging.csv')
df['text'] = df['text'].astype(str)

df_cleaned = df.drop_duplicates(subset=['text'])
duplicates = df_cleaned.duplicated(subset=['text'])
label_columns = [col for col in df_cleaned.columns if not col.startswith('Unnamed') and col != 'verdict' and col != 'text']



In [7]:
def value_vounts(df):
    verdict_grouped = df.groupby('verdict').size().reset_index(name='total_sentences')
    label_counts = df.groupby('verdict')[label_columns].sum().reset_index()

    combined_df = pd.merge(verdict_grouped, label_counts, on='verdict')

    return combined_df

def balance_dataframe(df):
    
    min_count = df['label'].value_counts().min()
    balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

    return balanced_df

def create_label_dict(df, balance=True, text_column='text', ):    
    
    label_dict = {}
    for label in label_columns:
        label_df = df[[text_column, label]]
        label_df.rename(columns={label: 'label'}, inplace=True)
        if balance:
            label_df = balance_dataframe(label_df)
        label_dict[label] = label_df

    return label_dict



def save_dict_as_pickle(label_dict, file_name='label_dataframes.pkl'):
    with open(file_name, 'wb') as f:
        pickle.dump(label_dict, f)
    print(f"Dictionary saved as {file_name}")
    
def load_dict_from_pickle(file_name='label_dataframes.pkl'):
    with open(file_name, 'rb') as f:
        label_dict = pickle.load(f)
    print(f"Dictionary loaded from {file_name}")
    return label_dict

def save_dfs_pickle(df, save_path, type_, balance=False):
    label_dataframes = create_label_dict(df, balance)
    save_dict_as_pickle(label_dataframes, os.path.join(save_path, type_ + '_label_dataframes.pkl'))
    return label_dataframes

In [8]:
df_cleaned['verdict'].unique()

array(['ME-22-11-1990-79', 'ME-22-12-47987-640', 'ME-23-01-53640-602',
       'ME-23-05-49875-541', 'ME-23-05-52567-993', 'ME-23-12-14553-571',
       'SH-17-10-12320-80', 'SH-19-02-58269-573', 'SH-19-03-50607-561',
       'SH-19-11-6047-641', 'SH-20-06-57459-683', 'ME-20-12-1338-335',
       'SH-20-09-43084-871', 'SH-21-02-39690-282', 'SH-21-08-36546-752',
       'SH-21-09-42879-201', 'ME-21-11-35998-405', 'SH-22-01-33898-590',
       'SH-22-03-40650-599', 'ME-22-04-7867-732', 'SH-22-11-37606-732',
       'ME-22-06-54208-991', 'SH-22-12-20664-221', 'SH-23-05-72468-401',
       'SH-23-08-30009-79', 'SH-24-01-9804-702', 'SH-24-01-26596-264',
       'SH-24-05-46424-585', 'ME-22-07-32175-327', 'ME-22-01-47042-277',
       'ME-22-07-53625-139', 'ME-23-01-46786-206', 'SH-22-05-36033-112',
       'SH-21-03-56850-36', 'SH-22-01-16014-223', 'SH-21-11-9851-17',
       'ME-23-04-8444-574', 'SH-21-02-40908-370', 'SH-20-05-28746-251',
       'SH-19-12-65763-975', 'SH-16-12-19437-616', 'SH-22-01-30

In [11]:
train_verdicts = np.random.choice(df_cleaned['verdict'].unique(), size=35, replace=False)
train_df = df_cleaned[df_cleaned['verdict'].isin(train_verdicts)]

remaining_df = df_cleaned[~df_cleaned['verdict'].isin(train_verdicts)]
remaining_verdicts = remaining_df['verdict'].unique()
test_verdicts, eval_verdicts = train_test_split(remaining_verdicts, test_size=0.1, random_state=42)

test_df = remaining_df[remaining_df['verdict'].isin(test_verdicts)]
eval_df = remaining_df[remaining_df['verdict'].isin(eval_verdicts)]

In [12]:
len(train_df), len(test_df), len(eval_df)

(4226, 3293, 257)

In [13]:
label_dict = create_label_dict(train_df)
label_dict

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = 

{'reject':                                                    text  label
 0     עוד התרשם שירות המבחן, כי הנאשם מבטא מודעות למ...      0
 1     בהמשך לחץ נאשם 1 את ידו ל הסוכן ותוך כדי לחיצת...      0
 2     בגין אישום זה הורשע הנאשם בעבירה של סחר בסם מס...      0
 3     הנאשם קיבל אחריות למעשים וציין כי ביצע את עביר...      0
 4     בהמשך למתואר לעיל, בסמוך לשעה 22:59 הגיע הנאשם...      0
 ...                                                 ...    ...
 2319  בישראל, אין ייצור מקומי משמעותי של סמים סינטטיים.      1
 2320           שירות המבחן המליץ על מאסר בעבודות שירות.      1
 2321  לצורך קביעת העונש המתאים לנאשם בתוך מתחם העניש...      1
 2322  ככלל, נסיבות הקשורות בביצוע העבירה נקבעות על ב...      1
 2323  הוא לא הפר תנאים ונרתם מהטיפול באופן שמסוכנותו...      1
 
 [2324 rows x 2 columns],
 'PUNISHMENT':                                                   text  label
 0    מתקופה זו ינוכו ימי מעצרו של הנאשם במסגרת הליך...      0
 1    רק לאחרונה הוא יצא לעבודה והוא עובד היום בשכר ..

In [13]:
import shutil

save_path = '/home/tak/pred-sentencing/resources/data/trainning/sentence_classification/drugs/35cases_in_train'

# Remove the directory if it exists, then create it
if os.path.exists(save_path):
    shutil.rmtree(save_path)

os.mkdir(save_path)
train_pkl = save_dfs_pickle(train_df, save_path, 'train_balance', balance=True)
test_pkl = save_dfs_pickle(test_df, save_path, 'test')
eval_pkl = save_dfs_pickle(eval_df, save_path, 'eval')


Dictionary saved as /home/tak/pred-sentencing/resources/data/trainning/sentence_classification/drugs/35cases_in_train/train_balance_label_dataframes.pkl
Dictionary saved as /home/tak/pred-sentencing/resources/data/trainning/sentence_classification/drugs/35cases_in_train/test_label_dataframes.pkl
Dictionary saved as /home/tak/pred-sentencing/resources/data/trainning/sentence_classification/drugs/35cases_in_train/eval_label_dataframes.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = 

In [15]:
train_pkl.keys()

dict_keys(['reject', 'PUNISHMENT', 'GENERAL_CIRCUM', 'CIR_TYPE', 'CIR_AMOUNT', 'CIR_ROLE', 'CIR_EQ'])

In [14]:
#load_pickle
path_pickle = '/home/tak/pred-sentencing/resources/data/trainning/sentence_classification/drugs/35cases_in_train/train_balance_label_dataframes.pkl'
with open(path_pickle, 'rb') as f:
    label_dict = pickle.load(f)
path_test_pickle = '/home/tak/pred-sentencing/resources/data/trainning/sentence_classification/drugs/35cases_in_train/test_label_dataframes.pkl'
with open(path_test_pickle, 'rb') as f:
    label_dict_test = pickle.load(f)

label_dict_test['reject']

Unnamed: 0,text,label
0,"פקודת הסמים המסוכנים [נוסח חדש], תשל""ג-1973: ס...",1
1,"חוק העונשין, תשל""ז-1977: סע' 40ד', 40ד'(א)",1
2,"פקודת המבחן [נוסח חדש], תשכ""ט-1969",1
3,הנאשם הורשע במסגרת הסדר דיוני בעובדות כתב אישו...,1
4,"כמפורט בעובדות האישום המתוקן, ביום 11.10.22 יצ...",1
...,...,...
7771,"בשים לב לכל שנאמר בדיון בפניי, בהתחשב בהודאה ש...",1
7772,בנוסף אני גוזר על הנאשם עונש של 9 חודשי מאסר ע...,0
7773,ככל שבמסגרת הליכי מעצר או הליכים אחרים הופקדו ...,1
7774,מזכירות בית המשפט תנפיק פקודת מאסר לאלתר וזאת ...,1
