## 1. Download the data

In [None]:
import csv
import os
import pandas as pd
import random

def read_csv_file(file_path):
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    
    with open(file_path, mode='r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        data = [row for row in reader]
    
    return data

filepath = os.getcwd() + '/english_word_freq.csv'

data = read_csv_file(filepath)

data = pd.read_csv(filepath) # changing to pandas for easier data manipulation

print(data, type(data)) # Uncomment this line to see the data read from the CSV file

  data = pd.read_csv('english_word_freq.csv', sep=';')


        Rank;Word;Frequency;Frequency per million;Zipf value
0                        1;you;257875407;38.758.605;75.884  
1                         2;i;241866372;363.524.513;75.605  
2                         3;the;201388773;30.268.679;7.481  
3                        4;to;154325666;231.951.065;73.654  
4                        5;'s;130877284;196.708.209;72.938  
...                                                    ...  
1048570                      23473;sanctii;7;0.0011;0.0414  
1048571               23473;sanctions-free;7;0.0011;0.0414  
1048572                  23473;sanctissime;7;0.0011;0.0414  
1048573                 23473;sanctuaryand;7;0.0011;0.0414  
1048574                  23473;sand-bikers;7;0.0011;0.0414  

[1048575 rows x 1 columns] <class 'pandas.core.frame.DataFrame'>


## 2. clean the data

In [28]:
data = pd.read_csv('english_word_freq.csv', sep=';')

# Drop missing values
data_clean = data.dropna()

# Drop duplicate rows
data_clean = data_clean.drop_duplicates(keep=False)

# Column names
word_col = data_clean.columns[1]       # the column with words
freq_col = data_clean.columns[2]       # frequency column
cols_to_drop = [data_clean.columns[0],  # first column
                data_clean.columns[2],  # third column (original frequency, will sort anyway)
                data_clean.columns[3],  # fourth column
                data_clean.columns[4]]  # fifth column

# Keep only alphabetic 3-letter words
data_clean = data_clean[data_clean[word_col].str.isalpha()]
data_clean = data_clean[data_clean[word_col].str.len() == 3]

# Sort by frequency (descending)
data_clean = data_clean.sort_values(by=freq_col, ascending=False)

# Drop unnecessary columns
data_clean = data_clean.drop(columns=cols_to_drop)

# Reset index
data_clean = data_clean.reset_index(drop=True)

# Save cleaned data
cleaned_filepath = os.path.join(os.getcwd(), 'english_word_freq_cleaned.csv')
data_clean.to_csv(cleaned_filepath, index=False)

print(f"Cleaned data:\n{data_clean.head()}\nData shape: {data_clean.shape}")


  data = pd.read_csv('english_word_freq.csv', sep=';')


Cleaned data:
  Word
0  you
1  the
2  and
3  for
4  was
Data shape: (10781, 1)


## 3. Generate lists

In [8]:
# Generate 20 rows × 20 words
num_rows = 20
num_words_per_row = 20
random_words = []

words_list = data_clean[first_col].tolist()

for _ in range(num_rows):
    row = random.sample(words_list, num_words_per_row)
    random_words.append(row)

random_words_df = pd.DataFrame(random_words)
random_words_filepath = os.path.join(os.getcwd(), "random_words.csv")
random_words_df.to_csv(random_words_filepath, index=False)

print(f"random words:\n{random_words_df.head()}\ndata shape: {random_words_df.shape}")

random words:
    0    1    2    3    4    5    6    7    8    9    10   11   12   13   14  \
0  kav  jug  ebr  nwk  uab  cpb  lvr  lwt  kwh  anv  cmz  qmu  tun  fpt  bet   
1  dst  vmm  rkf  npw  muh  ebg  qct  zit  psh  gtp  big  ild  cfv  ftf  ckx   
2  dvw  kap  vbb  bcz  igr  bmm  dou  baq  arm  fic  oog  bhm  lni  hsy  fyc   
3  awf  ewn  fwb  tmf  fjd  qao  bna  gad  spv  dga  cbi  rig  rld  vov  pao   
4  vnr  wau  sjt  gki  wsa  unf  uer  llt  snf  bmr  cka  jnz  vru  vug  oow   

    15   16   17   18   19  
0  dvv  uny  dov  ncw  hlv  
1  yon  kul  rfn  hsu  ggs  
2  opi  tze  sul  wgu  mnl  
3  rdc  fpo  unn  dhq  tsn  
4  uge  jbu  amo  qed  nxl  
data shape: (20, 20)
