# Cows and Bulls Words Extraction

## Imports

In [1]:
import os
import pandas as pd
from collections import Counter
import re
import zipfile
import shutil

# Temporary directory that unzip_file() extracts into and delete_folder() removes.
extraction_path = './temp'



## Functions 

In [2]:
def all_unique_characters(word):
    """Return True when no character occurs more than once in ``word``."""
    distinct_characters = set(word)
    return len(word) == len(distinct_characters)

def filter_4_letter_words_and_unique(item, field_name):
    """Return the rows of ``item`` whose ``field_name`` value is exactly four
    characters long and contains no repeated character.

    ``item`` is a DataFrame and ``field_name`` names the column to test.
    """
    def is_four_distinct_characters(value):
        return len(value) == 4 and all_unique_characters(value)

    keep_mask = item[field_name].apply(is_four_distinct_characters)
    return item[keep_mask]

def fitler_words_repeated_more_than_n_times(item, field_name, n):
    """Return the rows of ``item`` whose ``field_name`` value is strictly
    greater than ``n``.

    ``item`` is a DataFrame; the comparison is applied to the whole column and
    the matching rows are returned with their original index.
    """
    exceeds_threshold = item[field_name] > n
    return item[exceeds_threshold]

def subtract(left, right, left_field, right_field):
    """Return the rows of ``left`` whose key has no match in ``right``.

    This is a set subtraction (anti-join) on one key column per side.

    Parameters:
        left: DataFrame to subtract from.
        right: DataFrame whose keys are removed from ``left``.
        left_field: name of the key column in ``left``.
        right_field: name of the key column in ``right``.

    Returns:
        The left-merged DataFrame restricted to unmatched rows. It still
        carries the columns of ``right`` (empty for these rows) and the
        ``_merge`` indicator column, so callers select the columns they need.
    """
    # Join on the columns the caller asked for rather than fixed names, so the
    # function works for any pair of key columns.
    merged_df = left.merge(
        right,
        how='left',
        left_on=left_field,
        right_on=right_field,
        indicator=True,
    )
    # 'left_only' marks rows of ``left`` that found no partner in ``right``.
    return merged_df[merged_df['_merge'] == 'left_only']

def get_top_4_letter_words(file_path, top_n=1500):
    """Return the ``top_n`` most frequent 4-letter words in a text file.

    The file is read as latin-1 and lowercased. A word is a run of exactly
    four letters a-z delimited by regex word boundaries, so tokens containing
    digits or other characters are not counted.

    Returns a list of ``(word, count)`` tuples, most frequent first.
    """
    with open(file_path, 'r', encoding='latin-1') as handle:
        lowered_text = handle.read().lower()

    # Tally every standalone 4-letter token found in the text.
    tally = Counter(
        match.group() for match in re.finditer(r'\b[a-z]{4}\b', lowered_text)
    )
    return tally.most_common(top_n)

def process_folder(folder_path):
    """Collect the top 4-letter words of every ``.txt`` file in a folder.

    Parameters:
        folder_path: directory to scan (not recursive).

    Returns:
        A list with one entry per ``.txt`` file, each entry being the list of
        ``(word, count)`` tuples returned by ``get_top_4_letter_words``.
        Entries follow the sorted order of the file names.
    """
    # os.listdir returns names in arbitrary order; sorting makes the output
    # order reproducible across runs and platforms.
    return [
        get_top_4_letter_words(os.path.join(folder_path, filename))
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith(".txt")
    ]

def calculate_average_frequency(lists):
    """Average each word's count across several ``(word, count)`` lists.

    ``lists`` is a sequence of lists of ``(name, frequency)`` pairs. Counts
    are summed per name and divided by the number of lists, so a name missing
    from a list contributes zero for that list.

    Returns a list of ``(name, average)`` tuples in first-seen order.
    """
    totals = {}
    for word_counts in lists:
        for name, frequency in word_counts:
            totals[name] = totals.get(name, 0) + frequency

    return [(name, total / len(lists)) for name, total in totals.items()]

def unzip_file(zip_file_path, file_extension):
    """Extract every archive member ending in ``file_extension`` into the
    ``extraction_path`` folder.

    Members are written flat: only the base name of each member is used, so
    any directory structure inside the archive is dropped. Problems are
    reported with ``print``; a missing archive or an archive without matching
    members extracts nothing. Returns None.
    """
    # Guard: nothing to do when the archive itself is missing.
    if not os.path.isfile(zip_file_path):
        print(f"Error: The zip file {zip_file_path} does not exist.")
        return

    os.makedirs(extraction_path, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        matching_members = [
            member for member in archive.namelist()
            if member.endswith(file_extension)
        ]

        if not matching_members:
            print(f"No files with '{file_extension}' extension found in the zip archive.")
            return

        for member in matching_members:
            try:
                # Strip any directory part so the file lands directly in the
                # extraction folder.
                destination = os.path.join(extraction_path, os.path.basename(member))

                with archive.open(member) as source, open(destination, 'wb') as target:
                    target.write(source.read())

            except Exception as e:
                # Report the failure and carry on with the remaining members.
                print(f"An error occurred while processing {member}: {e}")

def delete_folder():
    """Delete the ``extraction_path`` folder and everything inside it.

    Calling this when the folder does not exist is a no-op, so the cleanup
    can safely run more than once. Other failures (for example permission
    errors) still propagate.
    """
    try:
        shutil.rmtree(extraction_path)
    except FileNotFoundError:
        # Already removed (or never created): nothing left to clean up.
        pass
        

## Extract 4-Letter Names Without Repeated Characters

In [3]:
# Extract the names CSV from its archive and load it into a DataFrame
unzip_file('Names/Names.csv.zip', '.csv')
names_df = pd.read_csv(extraction_path+'/Names.csv')

# Normalise the 'Name' column: force string type, then lowercase
names_df['Name'] = names_df['Name'].astype(str).str.lower()

# Keep names that are 4 characters long with no repeated characters,
# one row per distinct name
names_no_repeats = filter_4_letter_words_and_unique(
    names_df, 'Name'
).drop_duplicates(subset='Name')

# The extracted CSV has been read, so the temporary folder can go
delete_folder()


In [4]:
# Extract the book text files into the temporary folder
unzip_file('Books/Books.zip', '.txt')
folder_path = extraction_path

# Count 4-letter words per book, then average the counts across all books
per_book_word_counts = process_folder(folder_path)
each_books_top_n_words = calculate_average_frequency(per_book_word_counts)

In [5]:
# Turn the (word, average frequency) pairs into a two-column DataFrame
each_books_top_n_words = pd.DataFrame(each_books_top_n_words, columns=['Word', 'Frequency'])

# Keep 4-character words with no repeated characters, one row per word
average_4_letter_words = (
    filter_4_letter_words_and_unique(each_books_top_n_words, 'Word')
    .drop_duplicates(subset='Word')
)


## Let's remove all the words that are names of people

In [6]:
# Drop every word that also appears among the names, then keep only the
# word and its average frequency
words_without_human_names = subtract(
    average_4_letter_words, names_no_repeats, 'Word', 'Name'
)[['Word', 'Frequency']]

## Let's store the top 1000 most frequently used words. 

In [7]:
# Rank the remaining words from most to least frequent
words_without_human_names = words_without_human_names.sort_values('Frequency', ascending=False)

# Keep the first 1000 rows of the ranking and write them, index included, to disk
top_1000 = words_without_human_names.iloc[:1000]
top_1000.to_csv('result.csv', index=True)

In [8]:
# Clean up: remove the temporary folder that the book archive was extracted into.
delete_folder()