## 1. Download the data

In [1]:
import csv
import os
import pandas as pd
import random

def read_csv_file(file_path):
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    
    with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        data = [row for row in reader]
    
    return data

filepath = os.getcwd() + '/english_word_freq.csv'

data = read_csv_file(filepath)

data = pd.read_csv(filepath) # changing to pandas for easier data manipulation

# print(data, type(data)) # Uncomment this line to see the data read from the CSV file

## 2. clean the data

In [2]:
data = pd.read_csv('english_word_freq.csv', sep=';')

# Drop missing values
data_clean = data.dropna()

# Drop duplicate rows
data_clean = data_clean.drop_duplicates(keep=False)

# Column names
word_col = data_clean.columns[1]       # the column with words
freq_col = data_clean.columns[2]       # frequency column
cols_to_drop = [data_clean.columns[0],  # first column
                data_clean.columns[2],  # third column (original frequency, will sort anyway)
                data_clean.columns[3],  # fourth column
                data_clean.columns[4]]  # fifth column

# Keep only alphabetic 3-letter words
data_clean = data_clean[data_clean[word_col].str.isalpha()]
data_clean = data_clean[data_clean[word_col].str.len() == 3]

# Sort by frequency (descending)
data_clean = data_clean.sort_values(by=freq_col, ascending=False)

# Drop unnecessary columns
data_clean = data_clean.drop(columns=cols_to_drop)

# Reset index
data_clean = data_clean.reset_index(drop=True)

# Save cleaned data
cleaned_filepath = os.path.join(os.getcwd(), 'english_word_freq_cleaned.csv')
data_clean.to_csv(cleaned_filepath, index=False)

# print(f"Cleaned data:\n{data_clean.head()}\nData shape: {data_clean.shape}") # Uncomment this line to see the cleaned data


  data = pd.read_csv('english_word_freq.csv', sep=';')


## 3. Generate lists

In [None]:
# Keep the first 400 words
data_clean = data_clean.iloc[:400]

num_rows = 20
num_words_per_row = 20

# Convert DataFrame column to list
words_list = data_clean[word_col].tolist()

# Shuffle the full list of 400 words
random.shuffle(words_list)

# Partition into 20 lists of 20 words each
random_words = [words_list[i * num_words_per_row : (i + 1) * num_words_per_row] 
                for i in range(num_rows)]

# Convert to DataFrame
random_words_df = pd.DataFrame(random_words)

# Save to CSV
random_words_filepath = os.path.join(os.getcwd(), "random_words.csv")
random_words_df.to_csv(random_words_filepath, index=False)

print(f"random words:\n{random_words_df.head()}\ndata shape: {random_words_df.shape}")


random words:
      0
0  Word
1  None
2  None
3  None
4  None
data shape: (20, 1)
