In [None]:
# Create the data directory tree. `-p` creates missing parents and is a no-op
# when the directories already exist, so this cell can be re-run safely.
!mkdir -p raw_data/gym

In [None]:
# Create a Python virtual environment in ./venv.
!python3 -m venv venv

In [None]:
# NOTE(review): each `!` command runs in its own short-lived subshell, so this
# activation does not carry over to later cells; the `!pip` / `!python` calls
# below still resolve through the notebook's original PATH. Confirm whether the
# venv is actually meant to be used.
!source venv/bin/activate

In [None]:
# Install through `python -m pip` so the packages land in the same interpreter
# that the later `!python ...` cells run (a bare `pip` on PATH may belong to a
# different Python installation).
!python -m pip install datasets==1.4.0 py7zr wget

In [None]:
# Install through `python -m pip` so the package lands in the same interpreter
# that the later `!python ...` cells run.
!python -m pip install 'transformers[torch]'

In [None]:
# Editable install of the project in the current directory, done through
# `python -m pip` so it targets the interpreter used by the `!python ...` cells.
!python -m pip install -e .

In [None]:
# Run the gym2entail_multitask.py dataloader script.
# NOTE(review): presumably this prepares the data under raw_data/gym for
# training; its inputs and outputs are not visible here, so confirm in the script.
!python ./entail2/dataloader/gym2entail_multitask.py

In [None]:
# Launch runner.py in train mode with model_name `efl_no_cl`.
# CUDA_VISIBLE_DEVICES=0 exposes only GPU 0 to the training process.
!CUDA_VISIBLE_DEVICES=0 \
python ./entail2/runner/runner.py \
--learning_rate 1e-5 \
--warmup_ratio 0.06 \
--train_batch_size 32 \
--num_train_epochs 10 \
--bert_name bert \
--model_name efl_no_cl \
--use_sampler \
--mode train;

In [None]:
# Run gen_singletask_test.py for task `custom_dataset_4` under raw_data/gym.
# NOTE(review): what the script writes is not visible here; confirm in the script.
!python ./scripts/gen_singletask_test.py \
--data_dir raw_data/gym \
--task_dir custom_dataset_4

In [None]:
# Run gen_singletask_zeroshot_support.py for task `custom_dataset_4`
# with --shots 1 and --times 1.
!python scripts/gen_singletask_zeroshot_support.py --data_dir raw_data/gym --task_dir custom_dataset_4 --shots 1 --times 1

In [None]:
# Evaluate on task `custom_dataset_4` (runner.py in test mode).
# Every continued line ends with " \" so each option stays a separate argument
# even if the command is pasted into a plain shell, where "10\" followed by a
# newline would fuse with the next option into "10--mode".
!python entail2/runner/runner.py \
--data_dir raw_data/gym \
--task_dir custom_dataset_4 \
--model entail2 \
--test_times 1 \
--test_shots 10 \
--mode test

In [None]:
# Launch runner.py in train mode with model_name `unifew`, using the same
# learning rate, warmup ratio, batch size, epoch count, bert_name and sampler
# flag as the `efl_no_cl` training cell (but without pinning CUDA_VISIBLE_DEVICES).
!python entail2/runner/runner.py --learning_rate 1e-5 --warmup_ratio 0.06 --train_batch_size 32 --num_train_epochs 10 --bert_name bert --model_name unifew --use_sampler --mode train

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from googletrans import Translator
import random

# translate sentences using Google Translate API
def translate_sentences(sentences, target_languages):
    """Translate every sentence into every requested language.

    Args:
        sentences: Iterable of texts to translate.
        target_languages: Iterable of destination language codes passed to
            ``Translator.translate`` as ``dest``.

    Returns:
        dict mapping each language code to the list of translated texts, in the
        same order as ``sentences``. If anything goes wrong while translating
        into a language, the error is printed and that language maps to an
        empty list instead.
    """
    client = Translator()
    results = {}

    for lang in target_languages:
        translated = []
        try:
            for sentence in tqdm(sentences, desc=f'Translating to {lang}'):
                translated.append(client.translate(sentence, dest=lang).text)
        except Exception as e:
            print(f"Translation to {lang} failed. Error: {e}")
            # Discard partial results so the caller never sees a truncated list.
            translated = []
        results[lang] = translated

    return results

# process a folder and translate a percentage of the dataset
def process_folder(folder_path, target_languages, translation_percentage=0.7):
    """Translate a random share of the first column of each train/test file.

    Every regular file directly inside ``folder_path`` whose name contains
    ``'train'`` or ``'test'`` is read as tab-separated data. A random sample of
    its (non-missing) first-column values is split into equal consecutive
    chunks, one per target language, and each chunk is translated into its own
    language only. The result is stored in a ``translated_sentence`` column
    (rows that were not translated keep their original text) and the file is
    overwritten in place.

    Args:
        folder_path: Directory containing the TSV files.
        target_languages: Language codes to distribute the sampled sentences over.
        translation_percentage: Fraction (0..1) of sentences to sample.

    Notes:
        * The chunk size is ``sample_size // len(target_languages)``; sampled
          sentences left over after that division are not translated.
        * Errors while handling a file are printed and the file is skipped.
    """
    files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

    for file in files:
        # Only training / testing files (identified by name) are processed
        if not ('train' in file or 'test' in file):
            continue

        file_path = os.path.join(folder_path, file)
        print("File", file, " Started")
        try:
            df = pd.read_csv(file_path, sep='\t')
            first_column = df.iloc[:, 0]

            # Missing values cannot be translated and would make the whole
            # language batch fail, so sample from real values only.
            candidates = first_column.dropna().tolist()

            # Determine the number of sentences to translate based on the percentage
            num_sentences_to_translate = int(len(candidates) * translation_percentage)

            # Randomly select sentences to translate
            sentences_to_translate = random.sample(candidates, num_sentences_to_translate)

            # Distribute the sampled sentences equally across the languages
            num_sentences_per_language = num_sentences_to_translate // len(target_languages)

            # Map each original sentence to its translation. Each language only
            # translates its own chunk, so sentence i is always paired with the
            # translation of sentence i (and no API calls are wasted).
            translation_mapping = {}
            for i, lang in enumerate(target_languages):
                start_idx = i * num_sentences_per_language
                end_idx = start_idx + num_sentences_per_language
                chunk = sentences_to_translate[start_idx:end_idx]
                if not chunk:
                    continue
                # On failure translate_sentences returns [] for the language, so
                # zip() adds nothing and those rows keep their original text.
                translated = translate_sentences(chunk, [lang])[lang]
                translation_mapping.update(zip(chunk, translated))

            # Apply translations; untranslated rows fall back to the original text
            df['translated_sentence'] = first_column.map(translation_mapping).fillna(first_column)

            # Save the updated DataFrame to the same file in TSV format
            df.to_csv(file_path, sep='\t', index=False)
            print("File", file, " completed")

        except Exception as e:
            print(f"Error processing file {file}: {e}")

if __name__ == "__main__":
    # Root folder; every immediate sub-directory is processed
    dataset_folder = './raw_data/gym'

    # Target languages for translation. Chinese is spelled 'zh-cn' because
    # googletrans defines 'zh-cn' / 'zh-tw' and rejects a bare 'zh', which made
    # every Chinese batch fail and fall back to the untranslated text.
    target_languages = ['fr', 'es', 'de', 'ja', 'zh-cn']

    # Fraction of each file's sentences to translate
    translation_percentage = 0.7

    # Get a list of all folders in the dataset folder
    folders = [f for f in os.listdir(dataset_folder) if os.path.isdir(os.path.join(dataset_folder, f))]

    # Process each folder
    for folder in folders:
        print("Folder", folder, " Started")
        folder_path = os.path.join(dataset_folder, folder)
        process_folder(folder_path, target_languages, translation_percentage)
        print("Folder", folder, " completed")


In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import os

# Path handed to os.walk below.
# NOTE(review): this points at './swear.json', not at the working directory;
# os.walk yields nothing for a path that is not a directory, so confirm the
# intended location.
data_directory = './swear.json'

# Print the full path of every file found beneath data_directory
for root, _, names in os.walk(data_directory):
    for path in (os.path.join(root, name) for name in names):
        print(path)


In [None]:


import pandas as pd

# Input (comma-separated) and output (tab-separated) file names
csv_file = 'spam_data_hi.csv'
tsv_file = 'spam_data_hi.tsv'

# Load the CSV, then write the same table back out with tabs and no index column
df = pd.read_csv(csv_file)
df.to_csv(tsv_file, index=False, sep='\t')


In [None]:
# These packages are imported by the notebook kernel itself, so install them
# with the %pip magic, which targets the kernel's own environment.
%pip install pandas scikit-learn


import os
import pandas as pd
from sklearn.model_selection import train_test_split

# path to your folder containing TSV files
folder_path = './raw_data/gym/custom_dataset'

# Files produced by a previous run of this cell. They are excluded so that
# re-running the cell does not split the splits (foo_train_train.tsv, ...).
split_suffixes = ('_train.tsv', '_test.tsv')

tsv_files = [
    file for file in os.listdir(folder_path)
    if file.endswith('.tsv') and not file.endswith(split_suffixes)
]

# Loop through each source TSV file
for file in tsv_files:
    # Read the TSV file into a DataFrame
    df = pd.read_csv(os.path.join(folder_path, file), sep='\t')

    # train_test_split needs at least one row on each side of the split
    if len(df) < 2:
        print(f"Skipping {file}: not enough rows to split")
        continue

    # Split the data into train and test sets (80% train, 20% test)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Derive the output names from the stem so only the trailing extension is
    # touched (str.replace would rewrite every '.tsv' occurring in the name)
    stem = file[:-len('.tsv')]
    train_file_name = f'{stem}_train.tsv'
    test_file_name = f'{stem}_test.tsv'

    # Save the train and test datasets to new files
    train_df.to_csv(os.path.join(folder_path, train_file_name), sep='\t', index=False)
    test_df.to_csv(os.path.join(folder_path, test_file_name), sep='\t', index=False)


In [None]:
  
import os
import pandas as pd
import emoji
import regex

def remove_emojis(text):
    """
    Strip characters in the hard-coded emoji code point ranges from a string.

    Non-string values (e.g. NaN or numbers coming out of a DataFrame column)
    are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Code point ranges treated as emoji; combined into one character class
    emoji_ranges = (
        u"\U0001F600-\U0001F64F",
        u"\U0001F300-\U0001F5FF",
        u"\U0001F680-\U0001F6FF",
        u"\U0001F700-\U0001F77F",
        u"\U0001F780-\U0001F7FF",
        u"\U0001F800-\U0001F8FF",
        u"\U0001F900-\U0001F9FF",
        u"\U0001FA00-\U0001FA6F",
        u"\U0001FA70-\U0001FAFF",
        u"\U00002702-\U000027B0",
    )
    matcher = regex.compile("[" + "".join(emoji_ranges) + "]+", flags=regex.UNICODE)
    return matcher.sub(r'', text)


def process_files_in_folder(folder_path):
    """
    Remove emojis from the first column of every "train" TSV file in a folder.

    Each file in ``folder_path`` whose name ends with ``.tsv`` and contains
    ``train`` is read as UTF-8 tab-separated data, its first column is passed
    through ``remove_emojis``, and the file is overwritten in place. Files that
    cannot be parsed (or are empty) are reported and skipped.
    """
    tsv_files = [file for file in os.listdir(folder_path) if file.endswith('.tsv') and 'train' in file]

    for file in tsv_files:
        file_path = os.path.join(folder_path, file)

        try:
            # Decode explicitly as UTF-8 (the encoding pandas writes by default)
            # rather than the platform default, which would garble non-ASCII
            # text on some systems before it is written back. Undecodable bytes
            # are replaced instead of aborting the file.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file_content:
                df = pd.read_csv(file_content, sep='\t')

            # Remove emojis from the first column
            df.iloc[:, 0] = df.iloc[:, 0].apply(remove_emojis)

            # Save the DataFrame back to the TSV file
            df.to_csv(file_path, sep='\t', index=False)

        except (pd.errors.ParserError, pd.errors.EmptyDataError) as e:
            # EmptyDataError (raised for an empty file) is not a ParserError
            print(f"Error processing file: {file_path}")
            print(f"Error details: {e}")

# Root folder whose immediate sub-directories are cleaned
main_folder = './raw_data/gym'

# Visit every entry of the root folder and clean the ones that are directories
for entry in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, entry)
    if not os.path.isdir(subfolder_path):
        continue
    process_files_in_folder(subfolder_path)



In [None]:
import zipfile

# Archive to unpack and the directory that receives its contents
zip_file_path = './raw_data/gym/custom_dataset/Test1.zip'
extract_to = './raw_data/gym/custom_dataset'

# Unpack every member of the archive; the file is closed when the block exits
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    archive.extractall(path=extract_to)

print(f'Contents of {zip_file_path} have been extracted to {extract_to}')


In [None]:
import os
import pandas as pd

def convert_csv_to_tsv(input_folder, output_folder):
    """Convert every ``.csv`` file in ``input_folder`` to a ``.tsv`` file.

    Args:
        input_folder: Directory scanned (non-recursively) for names ending in ``.csv``.
        output_folder: Directory receiving the converted files; created if missing.

    Each output keeps the input's base name with only the trailing ``.csv``
    swapped for ``.tsv``, and is written tab-separated without the index
    column. One line is printed per converted file.
    """
    csv_ext = ".csv"
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are handled (and reported) in a stable order
    for file in sorted(os.listdir(input_folder)):
        if not file.endswith(csv_ext):
            continue

        # Swap only the trailing extension; str.replace would also rewrite
        # any ".csv" that appears earlier in the file name.
        input_path = os.path.join(input_folder, file)
        output_path = os.path.join(output_folder, file[:-len(csv_ext)] + ".tsv")

        # Read the CSV file and write it back out tab-separated
        df = pd.read_csv(input_path)
        df.to_csv(output_path, sep='\t', index=False)

        print(f"Converted {file} to {output_path}")

# Convert in place: the TSV files are written next to their CSV sources
input_folder = output_folder = './raw_data/gym/custom_dataset'

convert_csv_to_tsv(input_folder=input_folder, output_folder=output_folder)


In [None]:
import pandas as pd
# File rewritten in place with its first column removed.
# NOTE(review): not idempotent. Every execution of this cell drops one more
# leading column from the same file, so run it exactly once.
file_path = './raw_data/gym/custom_dataset_13/custom_dataset_13_test.tsv'

# Keep every column except the first, then overwrite the source file
df = pd.read_csv(file_path, sep='\t').iloc[:, 1:]
df.to_csv(file_path, index=False, sep='\t')
print("done")