In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
import pickle

In [6]:
df = pd.read_csv('/home/tak/MOJ/resources/data/tagging/drugs/gt/combined_output.csv')
df['text'] = df['text'].astype(str)

df_cleaned = df.drop_duplicates(subset=['text'])
duplicates = df_cleaned.duplicated(subset=['text'])
label_columns = [col for col in df_cleaned.columns if not col.startswith('Unnamed') and col != 'verdict' and col != 'text']

#drop nan values
# df_cleaned = df_cleaned.dropna(subset=['verdict'])


In [7]:
label_columns

['reject',
 'CIR_PUNISHMENT',
 'CONFESSION',
 'CIR_TYPE',
 'REGRET',
 'CIR_ROLE',
 'GENERAL_CIRCUM',
 'CIR_AMOUNT',
 'RESPO',
 'CIR_EQ',
 'CIRCUM_OFFENSE']

In [9]:
def value_vounts(df):
    verdict_grouped = df.groupby('verdict').size().reset_index(name='total_sentences')
    label_counts = df.groupby('verdict')[label_columns].sum().reset_index()

    combined_df = pd.merge(verdict_grouped, label_counts, on='verdict')

    return combined_df

def balance_dataframe(df):
    
    min_count = df['label'].value_counts().min()
    balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

    return balanced_df

def create_label_dict(df, balance=True, text_column='text', ):    
    
    label_dict = {}
    for label in label_columns:
        label_df = df[[text_column, label]]
        label_df.rename(columns={label: 'label'}, inplace=True)
        if balance:
            label_df = balance_dataframe(label_df)
        label_dict[label] = label_df

    return label_dict



def save_dict_as_pickle(label_dict, file_name='label_dataframes.pkl'):
    with open(file_name, 'wb') as f:
        pickle.dump(label_dict, f)
    print(f"Dictionary saved as {file_name}")
    
def load_dict_from_pickle(file_name='label_dataframes.pkl'):
    with open(file_name, 'rb') as f:
        label_dict = pickle.load(f)
    print(f"Dictionary loaded from {file_name}")
    return label_dict

def save_dfs_pickle(df, save_path, type_, balance=False):
    label_dataframes = create_label_dict(df, balance)
    save_dict_as_pickle(label_dataframes, os.path.join(save_path, type_ + '_label_dataframes.pkl'))
    return label_dataframes

In [10]:
print("DataFrame columns:", df_cleaned.columns)


DataFrame columns: Index(['verdict', 'text', 'reject', 'CIR_PUNISHMENT', 'CONFESSION', 'CIR_TYPE',
       'REGRET', 'CIR_ROLE', 'GENERAL_CIRCUM', 'CIR_AMOUNT', 'RESPO', 'CIR_EQ',
       'CIRCUM_OFFENSE'],
      dtype='object')


In [11]:
%pip install scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl.metadata (6.0 kB)
Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
Installing collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [42]:
from skmultilearn.model_selection import iterative_train_test_split

# Load your CSV file into DataFrame
# df_cleaned = pd.read_csv('your_file.csv')

# Features (you can adjust as needed)
X = df_cleaned.drop(columns=[
    'reject','CIR_PUNISHMENT', 'CONFESSION', 'CIR_TYPE','CIR_ROLE',
    'GENERAL_CIRCUM', 'CIR_AMOUNT', 'REGRET','RESPO', 'CIR_EQ'
]).values

# All your binary label columns
y = df_cleaned[[
    'reject','CIR_PUNISHMENT', 'CONFESSION', 'CIR_TYPE','CIR_ROLE',
    'GENERAL_CIRCUM', 'CIR_AMOUNT', 'REGRET','RESPO', 'CIR_EQ'
]].values

# First split: 60% train, 40% temp (eval + test)
X_train, y_train, X_temp, y_temp = iterative_train_test_split(X, y, test_size=0.5)

# Second split: temp into 20% eval and 20% test (half each)
X_eval, y_eval, X_test, y_test = iterative_train_test_split(X_temp, y_temp, test_size=0.8)

# Convert splits back to DataFrames
feature_columns = df_cleaned.drop(columns=[
    'reject','CIR_PUNISHMENT', 'CONFESSION', 'CIR_TYPE', 'REGRET', 'CIR_ROLE',
    'GENERAL_CIRCUM', 'CIR_AMOUNT', 'REGRET','RESPO', 'CIR_EQ'
]).columns.tolist()

label_columns = [
    'reject', 'CIR_PUNISHMENT', 'CONFESSION', 'CIR_TYPE', 'CIR_ROLE',
    'GENERAL_CIRCUM', 'CIR_AMOUNT', 'REGRET', 'RESPO', 'CIR_EQ'
]


# Train DataFrame
df_train = pd.DataFrame(X_train, columns=feature_columns)
df_train[label_columns] = y_train

# Eval DataFrame
df_eval = pd.DataFrame(X_eval, columns=feature_columns)
df_eval[label_columns] = y_eval

# Test DataFrame
df_test = pd.DataFrame(X_test, columns=feature_columns)
df_test[label_columns] = y_test

# Verify distributions
# Function to verify and collect distribution into a DataFrame
# Define the verify_distribution function
def verify_distribution(df, name):
    dist = df[label_columns].mean()
    dist.name = name
    print(f"--- {name} Distribution ---")
    print(dist, "\n")
    return dist

# Call verify_distribution for each split
# train_dist = verify_distribution(df_train, "Train")
# eval_dist = verify_distribution(df_eval, "Eval")
# test_dist = verify_distribution(df_test, "Test")

# # Combine into one DataFrame
# distribution_df = pd.concat([train_dist, eval_dist, test_dist], axis=1)

# # Save combined distributions to CSV
# distribution_df.to_csv("label_distribution.csv", index=True)

#save the verify_distribution in one csv file


#remove nan
train_df = df_train.dropna()
eval_df = df_eval.dropna()
test_df = df_test.dropna()

# Save DataFrames to CSV
# train_df.to_csv('train_drugs.csv', index=False)
# eval_df.to_csv('eval_drugs.csv', index=False)
# test_df.to_csv('test_drugs.csv', index=False)

In [39]:
train_verdicts = np.random.choice(df_cleaned['verdict'].unique(), size=15, replace=False)
train_df = df_cleaned[df_cleaned['verdict'].isin(train_verdicts)]

remaining_df = df_cleaned[~df_cleaned['verdict'].isin(train_verdicts)]
remaining_verdicts = remaining_df['verdict'].unique()
test_verdicts, eval_verdicts = train_test_split(remaining_verdicts, test_size=0.1, random_state=42)

test_df = remaining_df[remaining_df['verdict'].isin(test_verdicts)]
eval_df = remaining_df[remaining_df['verdict'].isin(eval_verdicts)]

In [24]:
len(train_df), len(test_df), len(eval_df)

(977, 1179, 292)

In [10]:
import shutil

save_path = '/home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/stratify_sentences_50_40_10'

train_df = '/home/tak/MOJ/src/scripts/sentence_classification/train_drugs (1).csv'
test_df = '/home/tak/MOJ/src/scripts/sentence_classification/test_drugs - test_drugs.csv'
eval_df = '/home/tak/MOJ/src/scripts/sentence_classification/eval_drugs - eval_drugs.csv'
columns = ['CIR_TYPE', 'CIR_AMOUNT', 'CIR_ROLE', 'CIR_EQ']
# create new column that if in the columns above there is a value of 1 then the new column will be 1
df = pd.read_csv(train_df)
df['CIRCUM_OFFENSE'] = df[columns].sum(axis=1)
df['CIRCUM_OFFENSE'] = df['CIRCUM_OFFENSE'].apply(lambda x: 1 if x > 0 else 0)
train_df = df

df = pd.read_csv(test_df)
df['CIRCUM_OFFENSE'] = df[columns].sum(axis=1)
df['CIRCUM_OFFENSE'] = df['CIRCUM_OFFENSE'].apply(lambda x: 1 if x > 0 else 0)
test_df = df

df = pd.read_csv(eval_df)
df['CIRCUM_OFFENSE'] = df[columns].sum(axis=1)
df['CIRCUM_OFFENSE'] = df['CIRCUM_OFFENSE'].apply(lambda x: 1 if x > 0 else 0)
eval_df = df

# Remove the directory if it exists, then create it
if os.path.exists(save_path):
    shutil.rmtree(save_path)

os.mkdir(save_path)
train_pkl = save_dfs_pickle(train_df, save_path, 'train_balance', balance=True)
test_pkl = save_dfs_pickle(test_df, save_path, 'test')
eval_pkl = save_dfs_pickle(eval_df, save_path, 'eval')


Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/stratify_sentences_50_40_10/train_balance_label_dataframes.pkl
Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/stratify_sentences_50_40_10/test_label_dataframes.pkl
Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/stratify_sentences_50_40_10/eval_label_dataframes.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = 

In [106]:
train_df.keys()

Index(['text', 'reject', 'CIR_PUNISHMENT', 'CONFESSION', 'CIR_TYPE',
       'CIR_ROLE', 'GENERAL_CIRCUM', 'CIR_AMOUNT', 'REGRET', 'RESPO', 'CIR_EQ',
       'CIRCUM_OFFENSE'],
      dtype='object')

In [12]:
#read pickle file
train_pkl = load_dict_from_pickle('/home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/stratify_sentences_50_40_10/train_balance_label_dataframes.pkl')
train_pkl

Dictionary loaded from /home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/stratify_sentences_50_40_10/train_balance_label_dataframes.pkl


{'reject':                                                    text  label
 0                 הנאשם כבן 32, נשוי ואב לשלושה פעוטות.      0
 1     בנסיבות אלה יש למקם את עונשו מעט מתחת לאמצע מת...      0
 2     על פי עובדות כתב האישום, בתמצית רבה, ביום 31/7...      0
 3     היא סיימה 12 שנות לימוד במסגרות חרדיות ללא בגר...      0
 4     ההגנה הגישה שתי אסמכתאות לכך שהנאשם שולב בהליכ...      0
 ...                                                 ...    ...
 1069  התביעה טענה לפגיעה בערכים המוגנים של פגיעה בבר...      1
 1070  הנאשם השיב לשיחה: "וואלה בסדר אחי עוד שעה אני ...      1
 1071  מזכירות בית המשפט תנפיק פקודת מאסר לאלתר וזאת ...      1
 1072  ככל שקיימות יתרות זכות ואין עיקולים ניתן להשיב...      1
 1073  אשר למדיניות הענישה הנוהגת, זו לרוב מחמירה ומס...      1
 
 [1074 rows x 2 columns],
 'CIR_PUNISHMENT':                                                   text  label
 0    עוד נטען, כי הנאשם צעיר בן 19 בעת ביצוע העבירו...      0
 1    ביחס לנסיבות שאינן קשורות בביצוע העבירה הזכי