
**Your Name**

ID: 000000000

Import Important Libraries

In [None]:
import re
import operator
import os
import urllib.request
import tarfile
from tabulate import tabulate

Reads a file lazily and yields each line with its trailing whitespace removed.

In [None]:
def read_data(file_path):
    """Lazily yield the lines of a text file, one at a time.

    Args:
        file_path: Path of the file to read.

    Yields:
        Each line with trailing whitespace (including the newline) removed.
        The file is opened with ``errors='ignore'``, so bytes that cannot be
        decoded are dropped instead of raising.
    """
    with open(file_path, 'r', errors='ignore') as handle:
        # Delegate to a lazy map so lines are still produced one by one.
        yield from map(str.rstrip, handle)

Performs word count analysis on a file.

In [None]:
def perform_word_count(file_path):
    """Count how often each normalised word occurs in a file.

    Every whitespace-separated token is lower-cased and stripped of all
    non-word characters (regex ``\\W``); tokens that end up empty are skipped.

    Args:
        file_path: Path of the file to analyse.

    Returns:
        A dict mapping each normalised word to its number of occurrences,
        in order of first appearance.
    """
    tally = {}
    for text_line in read_data(file_path):
        for token in text_line.split():
            cleaned = re.sub(r'\W+', '', token.lower())
            if not cleaned:
                # Token consisted solely of punctuation/symbols.
                continue
            tally[cleaned] = tally.get(cleaned, 0) + 1
    return tally

Performs word count analysis on every regular file directly inside a folder (subfolders and `.tar.gz` archives are skipped).

In [None]:
def map_reduce(folder_path):
    """Run the word count on every eligible file directly inside a folder.

    Only regular files are processed; sub-directories and anything whose
    name ends in ``.tar.gz`` are skipped. The folder is not walked
    recursively.

    Args:
        folder_path: Directory whose files should be analysed.

    Returns:
        A dict mapping each processed file name to the word-count dict
        returned by ``perform_word_count`` for that file.
    """
    results = {}
    for entry in os.listdir(folder_path):
        if entry.endswith('.tar.gz'):
            continue
        full_path = os.path.join(folder_path, entry)
        if not os.path.isfile(full_path):
            continue
        results[entry] = perform_word_count(full_path)
    return results

 Displays the word count analysis in a tabular format.

In [None]:
def display_word_count(word_count, words_per_row=3):
    """Print the word counts as a grid table, most frequent words first.

    Args:
        word_count: Dict mapping words to their occurrence counts.
        words_per_row: Number of ``[word, count]`` cells placed on each
            table row. Defaults to 3, the previously hard-coded value.

    Raises:
        ValueError: If ``words_per_row`` is smaller than 1.
    """
    if words_per_row < 1:
        raise ValueError("words_per_row must be at least 1")

    ranked = sorted(word_count.items(), key=operator.itemgetter(1), reverse=True)
    # Each cell is a [word, count] list, matching the original table layout.
    cells = [[word, count] for word, count in ranked]
    # Slice into consecutive rows; the last row may hold fewer cells.
    table_data = [
        cells[start:start + words_per_row]
        for start in range(0, len(cells), words_per_row)
    ]

    print(tabulate(table_data, tablefmt="grid"))
    print("\n")


 Saves the word count analysis results to text files.

In [None]:

def save_word_count(file_word_counts, folder_path):
    """Write each file's word counts to a tab-separated text file.

    Results go into a ``word_results`` sub-folder of ``folder_path`` (created
    if missing), one ``<file_name>_results.txt`` per analysed file, with one
    ``word<TAB>count`` line per word ordered by descending count.

    Args:
        file_word_counts: Dict mapping file names to word-count dicts.
        folder_path: Directory under which ``word_results`` is created.
    """
    out_dir = os.path.join(folder_path, "word_results")
    os.makedirs(out_dir, exist_ok=True)
    for source_name, counts in file_word_counts.items():
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        target = os.path.join(out_dir, f"{source_name}_results.txt")
        with open(target, 'w') as out:
            out.writelines(f"{word}\t{count}\n" for word, count in ranked)


Downloads a gzip-compressed tar archive from the given URL and extracts it into the directory where the archive is saved.

In [None]:
def _checked_members(tar, root):
    """Return the members of *tar* after verifying none escapes *root*.

    Fallback for Python versions without tarfile extraction filters. Only
    member names are checked here; link targets are not inspected.
    """
    members = tar.getmembers()
    for member in members:
        target = os.path.realpath(os.path.join(root, member.name))
        if os.path.commonpath([root, target]) != root:
            raise tarfile.TarError(
                f"Refusing to extract {member.name!r}: path escapes {root!r}"
            )
    return members


def download_and_extract_tar(url, save_path):
    """Download a gzip-compressed tar archive and extract it next to itself.

    The archive is saved at ``save_path`` and extracted into the directory
    that contains it (the current directory when ``save_path`` is a bare
    file name).

    Args:
        url: Location of the ``.tar.gz`` archive to download.
        save_path: Local path where the downloaded archive is stored.

    Raises:
        tarfile.TarError: If the archive is invalid or a member would be
            written outside the extraction directory.
    """
    # os.path.dirname returns '' for a bare file name; make the target explicit.
    extract_dir = os.path.dirname(save_path) or "."

    print("Downloading tar file...")
    urllib.request.urlretrieve(url, save_path)

    print("Extracting tar file...")
    with tarfile.open(save_path, "r:gz") as tar:
        # The archive comes from the network, so never trust member paths.
        if hasattr(tarfile, "data_filter"):
            # Built-in safe extraction where the interpreter supports it.
            tar.extractall(extract_dir, filter="data")
        else:
            root = os.path.realpath(extract_dir)
            tar.extractall(extract_dir, members=_checked_members(tar, root))
    print("Extraction completed.")


The main function to perform word count analysis on files.

In [None]:
def main():
    """Interactively run the word count on one file or on a folder.

    Prompts for a mode (single file or folder) and a path, prints the
    resulting table(s), and optionally saves the results in a
    ``word_results`` folder next to the analysed data. Invalid choices or
    paths are reported and end the run.
    """
    print("Choose an option:")
    print("1. Perform word count analysis on a single file")
    print("2. Perform word count analysis on files within a folder")
    choice = input("Enter your choice (1 or 2): ")

    if choice not in ("1", "2"):
        print("Invalid choice.")
        return

    if choice == "1":
        file_path = input("Enter the path to the file: ")
        if not os.path.isfile(file_path):
            print("Invalid file path.")
            return
        single_count = perform_word_count(file_path)
        display_word_count(single_count)
        # Shape the single result like the folder result so saving is shared.
        file_word_counts = {os.path.basename(file_path): single_count}
        folder_path = os.path.dirname(file_path)
    else:
        folder_path = input("Enter the path to the folder: ")
        if not os.path.isdir(folder_path):
            print("Invalid folder path.")
            return
        file_word_counts = map_reduce(folder_path)
        for name, counts in file_word_counts.items():
            print("Word count analysis for file:", name)
            display_word_count(counts)

    answer = input("Do you want to save the word count results? (y/n): ")
    if answer.lower() == "y":
        save_word_count(file_word_counts, folder_path)
        print("Word count results saved successfully.")


Executing the program

In [None]:


# Script entry point: fetch the dataset archive, unpack it, then start the
# interactive word-count prompt.
if __name__ == "__main__":
    # Download and extract the tar file
    # The archive is saved under a bare file name, so it lands in (and is
    # extracted into) the current working directory.
    tar_url = "http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz"
    tar_save_path = "20news-bydate.tar.gz"
    download_and_extract_tar(tar_url, tar_save_path)

    # Set the folder path for analysis
    # NOTE(review): folder_path is assigned here but never passed to main(),
    # which takes no arguments and prompts the user for a path instead --
    # confirm whether this value was meant to be used.
    folder_path = "20news-bydate"
    main()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| ['twice', 1]                           | ['much', 1]                       | ['taxes', 1]                                     |
+----------------------------------------+-----------------------------------+--------------------------------------------------+
| ['he', 1]                              | ['formerly', 1]                   | ['got', 1]                                       |
+----------------------------------------+-----------------------------------+--------------------------------------------------+
| ['wages', 1]                           | ['h', 1]                          | ['l', 1]                                         |
+----------------------------------------+-----------------------------------+--------------------------------------------------+
| ['mencken', 1]                         |                                   |                                                  |
+------------------------

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Do you want to save the word count results? (y/n): y
Word count results saved successfully.
