In [1]:
!py -m pip install ekphrasis




[notice] A new release of pip is available: 23.3.2 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
cd d:\\Internship\\CEEW

d:\Internship\CEEW


In [28]:
import pandas as pd
from tqdm import tqdm
from ekphrasis.classes.segmenter import Segmenter
import os
import re
import csv
from textblob import TextBlob

# Initialize Ekphrasis segmenter with English Wikipedia corpus
seg_eng = Segmenter(corpus="english")

# Function to segment text using Ekphrasis
def segment_text(text, segmenter):
    try:
        segmented_text = segmenter.segment(text)
        return segmented_text
    except Exception as e:
        print(f"Error processing text: {e}")
        return None

Reading english - 1grams ...
Reading english - 2grams ...


In [29]:
# Function to correct spelling using TextBlob
def correct_spelling(text):
    try:
        return str(TextBlob(text).correct())
    except Exception as e:
        print(f"Error correcting spelling: {e}")
        return text

# Process a single chunk
# Process a single chunk
def process_chunk(chunk):
    if chunk is None or not isinstance(chunk, str):
        return ""
    
    try:
        # Split camel case/Pascal case words
        segmented_text = segment_text(chunk, seg_eng)
        # Correct spelling using TextBlob
        corrected_text = correct_spelling(segmented_text)
        # Segment text using Ekphrasis
        return corrected_text
    except Exception as e:
        print(f"Error processing chunk: {e}")
        return chunk

In [19]:
def process_batch(batch, batch_number, output_checkpoint_base):
    try:
        cleaned_chunks = [process_chunk(chunk) for chunk in tqdm(batch, desc=f"Processing batch {batch_number}") if chunk is not None]
        
        # Save cleaned_chunks to a CSV file
        checkpoint_csv = f"{output_checkpoint_base}{batch_number}.csv"
        batch_df = pd.DataFrame(cleaned_chunks, columns=['Segmented_Document'])
        batch_df.to_csv(checkpoint_csv, index=False)
        
        print(f"Batch {batch_number} saved to {checkpoint_csv}")
    except Exception as e:
        print(f"Error processing batch {batch_number}: {e}")

In [20]:
# Function to merge all batches into a single CSV file
def merge_batches(output_checkpoint_base, output_csv, total_batches):
    try:
        dfs = []
        for batch_number in tqdm(range(1, total_batches + 1), desc="Merging batches"):
            checkpoint_csv = f"{output_checkpoint_base}{batch_number}.csv"
            if os.path.exists(checkpoint_csv):
                df_checkpoint = pd.read_csv(checkpoint_csv)
                dfs.append(df_checkpoint)
        
        merged_df = pd.concat(dfs, ignore_index=True)
        merged_df.to_csv(output_csv, index=False)
        print(f"All batches merged into {output_csv}")
    except Exception as e:
        print(f"Error merging batches: {e}")

In [30]:
# Main processing function
def main(input_csv, output_checkpoint_base, output_csv, batch_size=1000):
    try:
        # Read CSV into a DataFrame
        df = pd.read_csv(input_csv)
        
        # Extract text chunks
        refined_chunks = df['Document'].tolist()
        total_chunks = len(refined_chunks)
        
        print(f"Total chunks to process: {total_chunks}")

        # Determine the last completed checkpoint
        checkpoint_files = [f for f in os.listdir(os.path.dirname(output_checkpoint_base)) if f.startswith(os.path.basename(output_checkpoint_base))]
        completed_batches = sorted([int(f.split('_')[-1].split('.')[0]) for f in checkpoint_files])
        start_batch = (completed_batches[-1] + 1) if completed_batches else 1

        for batch_number in tqdm(range(start_batch, (total_chunks // batch_size) + 2), desc="Overall Progress"):
            start_index = (batch_number - 1) * batch_size
            end_index = min(start_index + batch_size, total_chunks)
            batch = refined_chunks[start_index:end_index]
            process_batch(batch, batch_number, output_checkpoint_base)

        total_batches = (total_chunks // batch_size) + 1
        merge_batches(output_checkpoint_base, output_csv, total_batches)
    except Exception as e:
        print(f"Error in main processing function: {e}")

In [31]:
# File paths
input_csv = "d:\\Internship\\CEEW\\cleaned_chunks_final.csv"
output_checkpoint_base = "d:\\Internship\\CEEW\\cleaned_chunks_final_segmented_checkpoint_"
output_csv = "d:\\Internship\\CEEW\\cleaned_chunks_final_segmented.csv"

# Run the main function
main(input_csv, output_checkpoint_base, output_csv)

Total chunks to process: 92952


Processing batch 59:  78%|███████▊  | 778/1000 [1:11:42<20:27,  5.53s/it]


Error processing text: maximum recursion depth exceeded while calling a Python object
Error correcting spelling: The `text` argument passed to `__init__(text)` must be a string, not <class 'NoneType'>




Error processing text: maximum recursion depth exceeded while calling a Python object
Error correcting spelling: The `text` argument passed to `__init__(text)` must be a string, not <class 'NoneType'>


Processing batch 59: 100%|██████████| 1000/1000 [02:12<00:00,  7.55it/s]
Overall Progress:   3%|▎         | 1/35 [02:12<1:15:01, 132.39s/it]

Batch 59 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_59.csv




Error processing text: maximum recursion depth exceeded while calling a Python object
Error correcting spelling: The `text` argument passed to `__init__(text)` must be a string, not <class 'NoneType'>




Error processing text: maximum recursion depth exceeded while calling a Python object
Error correcting spelling: The `text` argument passed to `__init__(text)` must be a string, not <class 'NoneType'>


Processing batch 60: 100%|██████████| 1000/1000 [01:58<00:00,  8.41it/s]
Overall Progress:   6%|▌         | 2/35 [04:11<1:08:28, 124.49s/it]

Batch 60 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_60.csv


Processing batch 61: 100%|██████████| 1000/1000 [01:58<00:00,  8.44it/s]
Overall Progress:   9%|▊         | 3/35 [06:09<1:04:56, 121.78s/it]

Batch 61 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_61.csv


Processing batch 62: 100%|██████████| 1000/1000 [00:58<00:00, 17.02it/s]
Overall Progress:  11%|█▏        | 4/35 [07:08<50:03, 96.90s/it]   

Batch 62 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_62.csv


Processing batch 63: 100%|██████████| 1000/1000 [01:08<00:00, 14.55it/s]
Overall Progress:  14%|█▍        | 5/35 [08:17<43:22, 86.75s/it]

Batch 63 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_63.csv


Processing batch 64: 100%|██████████| 1000/1000 [01:38<00:00, 10.14it/s]
Overall Progress:  17%|█▋        | 6/35 [09:55<43:52, 90.77s/it]

Batch 64 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_64.csv


Processing batch 65: 100%|██████████| 1000/1000 [01:57<00:00,  8.52it/s]
Overall Progress:  20%|██        | 7/35 [11:53<46:25, 99.49s/it]

Batch 65 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_65.csv


Processing batch 66: 100%|██████████| 1000/1000 [01:41<00:00,  9.83it/s]
Overall Progress:  23%|██▎       | 8/35 [13:35<45:05, 100.22s/it]

Batch 66 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_66.csv


Processing batch 67: 100%|██████████| 1000/1000 [01:55<00:00,  8.62it/s]
Overall Progress:  26%|██▌       | 9/35 [15:31<45:33, 105.15s/it]

Batch 67 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_67.csv


Processing batch 68: 100%|██████████| 1000/1000 [00:59<00:00, 16.89it/s]
Overall Progress:  29%|██▊       | 10/35 [16:30<37:54, 90.97s/it]

Batch 68 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_68.csv


Processing batch 69: 100%|██████████| 1000/1000 [00:51<00:00, 19.59it/s]
Overall Progress:  31%|███▏      | 11/35 [17:21<31:30, 78.75s/it]

Batch 69 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_69.csv


Processing batch 70: 100%|██████████| 1000/1000 [00:46<00:00, 21.35it/s]
Overall Progress:  34%|███▍      | 12/35 [18:08<26:28, 69.05s/it]

Batch 70 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_70.csv


Processing batch 71: 100%|██████████| 1000/1000 [00:43<00:00, 22.82it/s]
Overall Progress:  37%|███▋      | 13/35 [18:52<22:30, 61.40s/it]

Batch 71 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_71.csv


Processing batch 72: 100%|██████████| 1000/1000 [00:45<00:00, 21.76it/s]
Overall Progress:  40%|████      | 14/35 [19:38<19:51, 56.74s/it]

Batch 72 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_72.csv


Processing batch 73: 100%|██████████| 1000/1000 [00:48<00:00, 20.45it/s]
Overall Progress:  43%|████▎     | 15/35 [20:26<18:07, 54.38s/it]

Batch 73 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_73.csv


Processing batch 74: 100%|██████████| 1000/1000 [00:35<00:00, 28.16it/s]
Overall Progress:  46%|████▌     | 16/35 [21:02<15:25, 48.70s/it]

Batch 74 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_74.csv


Processing batch 75: 100%|██████████| 1000/1000 [00:39<00:00, 25.41it/s]
Overall Progress:  49%|████▊     | 17/35 [21:41<13:46, 45.89s/it]

Batch 75 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_75.csv


Processing batch 76: 100%|██████████| 1000/1000 [01:03<00:00, 15.80it/s]
Overall Progress:  51%|█████▏    | 18/35 [22:45<14:29, 51.12s/it]

Batch 76 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_76.csv


Processing batch 77: 100%|██████████| 1000/1000 [01:20<00:00, 12.43it/s]
Overall Progress:  54%|█████▍    | 19/35 [24:05<15:58, 59.93s/it]

Batch 77 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_77.csv


Processing batch 78: 100%|██████████| 1000/1000 [01:28<00:00, 11.32it/s]
Overall Progress:  57%|█████▋    | 20/35 [25:33<17:06, 68.46s/it]

Batch 78 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_78.csv


Processing batch 79: 100%|██████████| 1000/1000 [01:09<00:00, 14.34it/s]
Overall Progress:  60%|██████    | 21/35 [26:43<16:03, 68.84s/it]

Batch 79 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_79.csv


Processing batch 80: 100%|██████████| 1000/1000 [00:58<00:00, 17.21it/s]
Overall Progress:  63%|██████▎   | 22/35 [27:41<14:13, 65.62s/it]

Batch 80 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_80.csv


Processing batch 81: 100%|██████████| 1000/1000 [00:55<00:00, 18.15it/s]
Overall Progress:  66%|██████▌   | 23/35 [28:36<12:29, 62.46s/it]

Batch 81 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_81.csv


Processing batch 82: 100%|██████████| 1000/1000 [01:01<00:00, 16.27it/s]
Overall Progress:  69%|██████▊   | 24/35 [29:38<11:23, 62.16s/it]

Batch 82 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_82.csv


Processing batch 83: 100%|██████████| 1000/1000 [01:22<00:00, 12.18it/s]
Overall Progress:  71%|███████▏  | 25/35 [31:00<11:21, 68.14s/it]

Batch 83 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_83.csv


Processing batch 84: 100%|██████████| 1000/1000 [01:18<00:00, 12.80it/s]
Overall Progress:  74%|███████▍  | 26/35 [32:18<10:40, 71.15s/it]

Batch 84 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_84.csv


Processing batch 85: 100%|██████████| 1000/1000 [01:16<00:00, 13.11it/s]
Overall Progress:  77%|███████▋  | 27/35 [33:34<09:41, 72.68s/it]

Batch 85 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_85.csv


Processing batch 86: 100%|██████████| 1000/1000 [00:57<00:00, 17.35it/s]
Overall Progress:  80%|████████  | 28/35 [34:32<07:57, 68.17s/it]

Batch 86 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_86.csv


Processing batch 87: 100%|██████████| 1000/1000 [01:19<00:00, 12.53it/s]
Overall Progress:  83%|████████▎ | 29/35 [35:52<07:09, 71.66s/it]

Batch 87 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_87.csv


Processing batch 88: 100%|██████████| 1000/1000 [01:14<00:00, 13.42it/s]
Overall Progress:  86%|████████▌ | 30/35 [37:06<06:02, 72.52s/it]

Batch 88 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_88.csv


Processing batch 89: 100%|██████████| 1000/1000 [01:31<00:00, 10.98it/s]
Overall Progress:  89%|████████▊ | 31/35 [38:37<05:12, 78.10s/it]

Batch 89 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_89.csv


Processing batch 90: 100%|██████████| 1000/1000 [01:40<00:00,  9.98it/s]
Overall Progress:  91%|█████████▏| 32/35 [40:18<04:14, 84.73s/it]

Batch 90 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_90.csv


Processing batch 91: 100%|██████████| 1000/1000 [01:09<00:00, 14.29it/s]
Overall Progress:  94%|█████████▍| 33/35 [41:28<02:40, 80.30s/it]

Batch 91 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_91.csv


Processing batch 92: 100%|██████████| 1000/1000 [01:11<00:00, 13.97it/s]
Overall Progress:  97%|█████████▋| 34/35 [42:39<01:17, 77.69s/it]

Batch 92 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_92.csv


Processing batch 93: 100%|██████████| 952/952 [01:09<00:00, 13.78it/s]
Overall Progress: 100%|██████████| 35/35 [43:48<00:00, 75.11s/it]


Batch 93 saved to d:\Internship\CEEW\cleaned_chunks_final_segmented_checkpoint_93.csv


Merging batches: 100%|██████████| 93/93 [00:00<00:00, 303.92it/s]


All batches merged into d:\Internship\CEEW\cleaned_chunks_final_segmented.csv
