In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import os
import pickle
import shutil


# Drugs

In [3]:
expirement_name = 'experiment_1'
save_path = f'/home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/{expirement_name}'

train_df = '/home/tak/MOJ/resources/data/tagging/drugs/gt/data_splits/train_drugs.csv'
test_df = '/home/tak/MOJ/resources/data/tagging/drugs/gt/data_splits/test_drugs.csv'
eval_df = '/home/tak/MOJ/resources/data/tagging/drugs/gt/data_splits/eval_drugs.csv'


#read the data
train_df = pd.read_csv(train_df)
test_df = pd.read_csv(test_df)
eval_df = pd.read_csv(eval_df)


# Remove the directory if it exists, then create it
if os.path.exists(save_path):
    shutil.rmtree(save_path)

os.mkdir(save_path)

In [4]:
label_columns = ['reject', 'CIR_PUNISHMENT', 'CONFESSION', 'CIR_TYPE', 'CIR_ROLE',
                  'GENERAL_CIRCUM', 'CIR_AMOUNT', 'REGRET', 'RESPO', 'CIR_EQ','CIRCUM_OFFENCE']

In [5]:
def value_vounts(df):
    verdict_grouped = df.groupby('verdict').size().reset_index(name='total_sentences')
    label_counts = df.groupby('verdict')[label_columns].sum().reset_index()

    combined_df = pd.merge(verdict_grouped, label_counts, on='verdict')

    return combined_df

def balance_dataframe(df):
    
    min_count = df['label'].value_counts().min()
    balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

    return balanced_df

def create_label_dict(df, balance=True, text_column='text', ):    
    
    label_dict = {}
    for label in label_columns:
        label_df = df[[text_column, label]]
        label_df.rename(columns={label: 'label'}, inplace=True)
        if balance:
            label_df = balance_dataframe(label_df)
        label_dict[label] = label_df

    return label_dict



def save_dict_as_pickle(label_dict, file_name='label_dataframes.pkl'):
    with open(file_name, 'wb') as f:
        pickle.dump(label_dict, f)
    print(f"Dictionary saved as {file_name}")
    
def load_dict_from_pickle(file_name='label_dataframes.pkl'):
    with open(file_name, 'rb') as f:
        label_dict = pickle.load(f)
    print(f"Dictionary loaded from {file_name}")
    return label_dict

def save_dfs_pickle(df, save_path, type_, balance=False):
    label_dataframes = create_label_dict(df, balance)
    save_dict_as_pickle(label_dataframes, os.path.join(save_path, type_ + '_label_dataframes.pkl'))
    return label_dataframes

In [6]:
# for drugs
CIR_COLUMNS = ['CIR_TYPE', 'CIR_ROLE','CIR_AMOUNT','CIR_EQ']
# for each dataframe, create new column as CIRCUM_OFFENCE and put 1 if the was at least one of the CIR_COLUMNS is 1
def create_circum_offence_column(df):
    df['CIRCUM_OFFENCE'] = df[CIR_COLUMNS].any(axis=1).astype(int)
    return df
train_df = create_circum_offence_column(train_df)
test_df = create_circum_offence_column(test_df)
eval_df = create_circum_offence_column(eval_df)

train_pkl = save_dfs_pickle(train_df, save_path, 'train_balance', balance=True)
test_pkl = save_dfs_pickle(test_df, save_path, 'test')
eval_pkl = save_dfs_pickle(eval_df, save_path, 'eval')

Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/experiment_1/train_balance_label_dataframes.pkl
Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/experiment_1/test_label_dataframes.pkl
Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/drugs/experiment_1/eval_label_dataframes.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = 

# Weapon

In [7]:
expirement_name = 'experiment_1'
save_path = f'/home/tak/MOJ/resources/data/trainning/sentence_classification/weapon/{expirement_name}'

train_df = '/home/tak/MOJ/resources/data/tagging/weapon/gt/data_splits/train.csv'
test_df = '/home/tak/MOJ/resources/data/tagging/weapon/gt/data_splits/test.csv'
eval_df = '/home/tak/MOJ/resources/data/tagging/weapon/gt/data_splits/eval.csv'

#read the data
train_df = pd.read_csv(train_df)
test_df = pd.read_csv(test_df)
eval_df = pd.read_csv(eval_df)


# Remove the directory if it exists, then create it
if os.path.exists(save_path):
    shutil.rmtree(save_path)

os.mkdir(save_path)

In [11]:
label_columns = ['reject', 'CONFESSION', 'CIR_TYPE_WEP', 'CIR_HELD_WAY_WEP', 'CIR_AMMU_AMOUNT_WEP','CIR_PURPOSE',
                 'GENERAL_CIRCUM', 'CIR_STATUS_WEP', 'REGRET', 'PUNISHMENT', 'CIR_PLANNING','RESPO','CIR_OBTAIN_WAY_WEP',
                'CIR_USE','CIRCUM_OFFENCE']

In [12]:
def value_vounts(df):
    verdict_grouped = df.groupby('verdict').size().reset_index(name='total_sentences')
    label_counts = df.groupby('verdict')[label_columns].sum().reset_index()

    combined_df = pd.merge(verdict_grouped, label_counts, on='verdict')

    return combined_df

def balance_dataframe(df):
    
    min_count = df['label'].value_counts().min()
    balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)

    return balanced_df

def create_label_dict(df, balance=True, text_column='text', ):    
    
    label_dict = {}
    for label in label_columns:
        label_df = df[[text_column, label]]
        label_df.rename(columns={label: 'label'}, inplace=True)
        if balance:
            label_df = balance_dataframe(label_df)
        label_dict[label] = label_df

    return label_dict



def save_dict_as_pickle(label_dict, file_name='label_dataframes.pkl'):
    with open(file_name, 'wb') as f:
        pickle.dump(label_dict, f)
    print(f"Dictionary saved as {file_name}")
    
def load_dict_from_pickle(file_name='label_dataframes.pkl'):
    with open(file_name, 'rb') as f:
        label_dict = pickle.load(f)
    print(f"Dictionary loaded from {file_name}")
    return label_dict

def save_dfs_pickle(df, save_path, type_, balance=False):
    label_dataframes = create_label_dict(df, balance)
    save_dict_as_pickle(label_dataframes, os.path.join(save_path, type_ + '_label_dataframes.pkl'))
    return label_dataframes

In [13]:
# for drugs
CIR_COLUMNS = ['CIR_TYPE_WEP', 'CIR_HELD_WAY_WEP', 'CIR_AMMU_AMOUNT_WEP','CIR_PURPOSE','CIR_STATUS_WEP',
               'CIR_PLANNING','CIR_OBTAIN_WAY_WEP','CIR_USE']
# for each dataframe, create new column as CIRCUM_OFFENCE and put 1 if the was at least one of the CIR_COLUMNS is 1
def create_circum_offence_column(df):
    df['CIRCUM_OFFENCE'] = df[CIR_COLUMNS].any(axis=1).astype(int)
    return df
train_df = create_circum_offence_column(train_df)
test_df = create_circum_offence_column(test_df)
eval_df = create_circum_offence_column(eval_df)
train_df['CIRCUM_OFFENCE'].value_counts()
train_pkl = save_dfs_pickle(train_df, save_path, 'train_balance', balance=True)
test_pkl = save_dfs_pickle(test_df, save_path, 'test')
eval_pkl = save_dfs_pickle(eval_df, save_path, 'eval')

Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/weapon/experiment_1/train_balance_label_dataframes.pkl
Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/weapon/experiment_1/test_label_dataframes.pkl
Dictionary saved as /home/tak/MOJ/resources/data/trainning/sentence_classification/weapon/experiment_1/eval_label_dataframes.pkl


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = df.groupby('label').apply(lambda x: x.sample(min_count)).reset_index(drop=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df.rename(columns={label: 'label'}, inplace=True)
  balanced_df = 