In [None]:
import os
import shutil
import zipfile

def copy_clean_files(extracted_folder, output_folder, bad_words):
    """Copy every ``.txt`` file free of bad words into a mirrored folder tree.

    Walks ``extracted_folder`` recursively and recreates each visited directory
    under ``output_folder``. Every file whose name ends with ``.txt`` is read as
    UTF-8 (a leading BOM is stripped) and checked, case-insensitively, for any
    of the given terms as a plain substring. Files without a match are copied;
    files with a match are left out. Source files are never modified or removed.

    Args:
        extracted_folder: Root directory to scan.
        output_folder: Root directory that receives the clean copies.
        bad_words: Iterable of terms to look for. It is consumed exactly once,
            so a generator is acceptable. Empty entries are ignored because the
            empty string is a substring of every text and would flag all files.

    Returns:
        Total size in bytes of the files that were skipped because they
        contained at least one bad word.

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
        OSError: If a file or directory cannot be read, created or copied.
    """
    # Normalise the search terms once instead of once per (file, term) pair.
    lowered_bad_words = [word.lower() for word in bad_words if word]

    total_deleted_size = 0

    for root, _dirs, files in os.walk(extracted_folder):
        # Mirror the directory layout; for the top-level folder relpath is '.'.
        relative_path = os.path.relpath(root, extracted_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        for text_file in files:
            if not text_file.endswith('.txt'):
                continue

            file_path = os.path.join(root, text_file)

            with open(file_path, 'r', encoding='utf-8-sig') as f:
                # Lower-case the content a single time for all comparisons.
                content_lower = f.read().lower()

            if any(word in content_lower for word in lowered_bad_words):
                # Flagged file: account for its size but do not copy it.
                deleted_file_size = os.path.getsize(file_path)
                total_deleted_size += deleted_file_size
                print(f"File with bad words detected and ignored: {file_path} (Size: {deleted_file_size} bytes)")
            else:
                shutil.copy(file_path, os.path.join(output_subfolder, text_file))
                print(f"Copied {file_path} to {output_subfolder}")

    return total_deleted_size

# Function to extract zip file
def extract_zip(zip_file_path, extract_to):
    """Unpack every member of the archive at ``zip_file_path`` into ``extract_to``.

    Prints a one-line confirmation once extraction has finished and the
    archive has been closed. Returns ``None``.
    """
    archive = zipfile.ZipFile(zip_file_path, mode='r')
    with archive:
        archive.extractall(path=extract_to)

    message = f"Extracted {zip_file_path} to {extract_to}"
    print(message)

# Terms whose presence disqualifies a text file; edit this list to tune the filter.
# NOTE(review): copy_clean_files matches these as case-insensitive substrings, so
# short entries also flag any longer word that contains them, and several longer
# entries already contain a shorter entry from this list — confirm this is intended.
bad_words = [
    'मौगा', 'भतारकटनी', 'जरलाहा', 'मुहझौसा',
    'भकचोनार', 'लबरा', 'चितकाबर', 'छिचलेदार',
    'बुर', 'बरबताह', 'लारचट्टा', 'गरचट्टा',
    'बुरचट्टा', 'भतार', 'वेश्या', 'धीचोदा',
    'चाक', 'गू', 'रंड', 'लार',
    'गार', 'खसवा', 'चितवा', 'थेथर',
]

# Input archive, scratch extraction folder and destination for the clean copies.
# NOTE(review): the archive path is absolute while both folders are relative to
# the current working directory — confirm that this difference is intended.
zip_file_path = "/content/text_folder_file.zip"
extract_to_folder = "content/extracted/folder"
output_folder = "content/clean/folder"

# Step 1: unpack the archive into the scratch folder.
extract_zip(zip_file_path, extract_to_folder)

# Step 2: copy the clean files and collect the byte count of the skipped ones.
total_deleted_size = copy_clean_files(extract_to_folder, output_folder, bad_words)

# Step 3: report the skipped volume in mebibytes (1 MB = 1024 * 1024 bytes here).
total_deleted_size_mb = total_deleted_size / (1024 ** 2)
print(f"Total amount of data deleted: {total_deleted_size_mb:.2f} MB")
print("Task complete.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Copied content/extracted/folder/text_folder/youtube_transcript_1/transcript_yt322.txt to content/clean/folder/text_folder/youtube_transcript_1
Copied content/extracted/folder/text_folder/youtube_transcript_1/transcript_yt46.txt to content/clean/folder/text_folder/youtube_transcript_1
Copied content/extracted/folder/text_folder/youtube_transcript_1/transcript_yt433.txt to content/clean/folder/text_folder/youtube_transcript_1
Copied content/extracted/folder/text_folder/youtube_transcript_1/transcript_yt461.txt to content/clean/folder/text_folder/youtube_transcript_1
Copied content/extracted/folder/text_folder/youtube_transcript_1/transcript_yt17.txt to content/clean/folder/text_folder/youtube_transcript_1
Copied content/extracted/folder/text_folder/youtube_transcript_1/transcript_yt306.txt to content/clean/folder/text_folder/youtube_transcript_1
Copied content/extracted/folder/text_folder/youtube_transcript_1/transcript_yt3

In [None]:
# Colab-only cell: mount Google Drive at /content/drive.
# NOTE(review): nothing in the visible cells reads from or writes to this mount
# point — confirm whether a later cell depends on it.
from google.colab import drive
drive.mount('/content/drive')