In [9]:
import csv
import string
from collections import Counter
from prettytable import PrettyTable

def count_punctuation_in_multiple_files(filepaths, text_column=2, delimiter='\t'):
    """Print a per-file comparison table of ASCII punctuation counts.

    Every file is parsed with ``csv.reader``. Rows that have no field at index
    ``text_column`` are skipped; for every other row, each character of that
    field that appears in ``string.punctuation`` is counted.

    Two things are written to standard output: one line per file giving the
    number of rows processed, then a PrettyTable with one row per punctuation
    mark seen in any file and one ``File N`` column per input file (in input
    order). A mark that a file never contains is reported as 0 for that file.

    Args:
        filepaths: Iterable of paths to UTF-8 encoded delimited text files.
        text_column: Zero-based index of the field to scan. Defaults to 2
            (the third field).
        delimiter: Field separator handed to ``csv.reader``. Defaults to tab.

    Returns:
        None. The report is printed rather than returned.

    Raises:
        ValueError: If ``text_column`` is negative.
        OSError: If a file cannot be opened.
    """
    if text_column < 0:
        # A negative index would pass the length check below and then index
        # from the end of the row (or fail on an empty row).
        raise ValueError("text_column must be a non-negative field index")

    results = []
    all_punctuations = set()

    # Process each file
    for filepath in filepaths:
        row_count, punct_counter = _count_punctuation_in_file(
            filepath, text_column, delimiter
        )
        results.append((filepath, row_count, punct_counter))
        all_punctuations.update(punct_counter)

    # Create a PrettyTable; columns are sized from the collected results so
    # that ``filepaths`` may be any iterable, not only a sequence.
    table = PrettyTable()
    table.field_names = ['Punctuation'] + [
        f'File {i+1}' for i in range(len(results))
    ]

    # Add punctuation counts row by row, sorted for a consistent table
    for punct in sorted(all_punctuations):
        table.add_row(
            [punct] + [punct_counter.get(punct, 0) for _, _, punct_counter in results]
        )

    # Print number of rows processed for each file
    for idx, (filepath, row_count, _) in enumerate(results):
        print(f"File {idx+1}: '{filepath}' -> Number of rows processed: {row_count}")
    print()

    # Print the table
    print(table)


def _count_punctuation_in_file(filepath, text_column, delimiter):
    """Return ``(row_count, Counter)`` for the punctuation found in one file."""
    punctuation_chars = frozenset(string.punctuation)
    punct_counter = Counter()
    row_count = 0

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for file objects passed to csv.reader.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f, delimiter=delimiter):
            # Rows too short to contain the text field are ignored entirely.
            if len(row) > text_column:
                row_count += 1
                punct_counter.update(
                    char for char in row[text_column] if char in punctuation_chars
                )

    return row_count, punct_counter




In [10]:
# Example usage: compare punctuation counts across the train/dev/test splits.
# Table columns follow list order: File 1 = train, File 2 = dev, File 3 = test.
filepaths = [
    '/content/drive/MyDrive/ssmt_project/punctuation_prediction_model/Data_for_training_punct/train.tsv',
    '/content/drive/MyDrive/ssmt_project/punctuation_prediction_model/Data_for_training_punct/dev.tsv',
    '/content/drive/MyDrive/ssmt_project/punctuation_prediction_model/Data_for_training_punct/test.tsv',
]
# Keep the individual split paths available under their original names.
train_path, dev_path, test_path = filepaths

count_punctuation_in_multiple_files(filepaths)

File 1: '/content/drive/MyDrive/ssmt_project/punctuation_prediction_model/Data_for_training_punct/train.tsv' -> Number of rows processed: 2518
File 2: '/content/drive/MyDrive/ssmt_project/punctuation_prediction_model/Data_for_training_punct/dev.tsv' -> Number of rows processed: 394
File 3: '/content/drive/MyDrive/ssmt_project/punctuation_prediction_model/Data_for_training_punct/test.tsv' -> Number of rows processed: 647

+-------------+--------+--------+--------+
| Punctuation | File 1 | File 2 | File 3 |
+-------------+--------+--------+--------+
|      !      |   13   |   2    |   2    |
|      "      |  330   |   53   |   78   |
|      $      |   13   |   0    |   4    |
|      %      |   12   |   0    |   0    |
|      &      |   4    |   0    |   0    |
|      '      |  450   |   89   |  102   |
|      (      |  209   |   26   |   78   |
|      )      |  208   |   26   |   78   |
|      +      |   5    |   0    |   0    |
|      ,      |  2812  |  442   |  772   |
|      -      | 