## 1. Download the data

In [2]:
import csv
import os
import pandas as pd

def read_csv_file(file_path):
    """Read a CSV file and return its rows as a list of lists of strings.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        One list per CSV record, in file order; every field is a string
        (the header row, if any, is returned like any other row).

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file.
    """
    file_exists = os.path.isfile(file_path)
    if not file_exists:
        raise FileNotFoundError(f"The file {file_path} does not exist.")

    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(file_path, mode='r', newline='', encoding='utf-8') as handle:
        rows = list(csv.reader(handle))

    return rows

# Build the path portably instead of concatenating with a hard-coded '/'.
filepath = os.path.join(os.getcwd(), 'unigram_freq.csv')

# Fail early with a clear message if the input file is missing. The file is
# parsed only once, directly into pandas, rather than first into a list of
# rows that was immediately overwritten under the same name.
if not os.path.isfile(filepath):
    raise FileNotFoundError(f"The file {filepath} does not exist.")

# By default pandas parses tokens such as "nan", "null" and "NA" as missing
# values. In a word list those are real entries, and they would be dropped by a
# later dropna(). Treat only truly empty fields as missing.
data = pd.read_csv(filepath, keep_default_na=False, na_values=[''])

# Show a truncated preview of the loaded frame and confirm its type.
print(data, type(data))

           word        count
0           the  23135851162
1            of  13151942776
2           and  12997637966
3            to  12136980858
4             a   9081174698
...         ...          ...
333328    gooek        12711
333329   gooddg        12711
333330  gooblle        12711
333331   gollgo        12711
333332    golgw        12711

[333333 rows x 2 columns] <class 'pandas.core.frame.DataFrame'>


## 2. Clean the data

In [3]:
# Columns are addressed by position, not by name: the first column holds the
# word and the second holds its frequency count.
first_col = data.columns[0]
second_col = data.columns[1]

# Drop rows with missing values, then collapse exact duplicate rows down to a
# single copy. (keep=False would discard *every* copy of a duplicated row and
# lose the word entirely.)
data_clean = data.dropna().drop_duplicates()

# Keep only rows whose word is exactly three characters long.
data_clean = data_clean[data_clean[first_col].str.len() == 3]

# Order by frequency (most common first), then drop the count column and
# renumber the rows so the index reflects the frequency rank.
data_clean = (
    data_clean
    .sort_values(by=second_col, ascending=False)
    .drop(columns=[second_col])
    .reset_index(drop=True)
)

# Preview the cleaned data and its shape.
print(f"cleaned data:\n{data_clean.head()}\ndata shape: {data_clean.shape}")

# Save cleaned data to a new CSV file (without the index column).
cleaned_filepath = os.path.join(os.getcwd(), 'unigram_freq_cleaned.csv')
data_clean.to_csv(cleaned_filepath, index=False)


cleaned data:
  word
0  the
1  and
2  for
3  you
4  not
data shape: (12976, 1)


## 3. Generate lists

In [None]:
# Generate num_rows rows, each holding num_words_per_row random words drawn
# from the cleaned data, and save them to random_words.csv.
import random

num_rows = 20
num_words_per_row = 20
random_seed = None  # set to an int for reproducible lists; None keeps runs non-deterministic

# A dedicated generator so seeding here does not touch the global random state.
rng = random.Random(random_seed)

# Words live in the first column of the cleaned frame (selected by position).
words_list = data_clean.iloc[:, 0].tolist()

# Each row is sampled without replacement, so a word never repeats within a
# row (different rows may still share words). The sample size is
# num_words_per_row rather than a hard-coded 3, so the output really has
# num_words_per_row columns. random.sample raises ValueError if the pool holds
# fewer words than requested.
random_words = [
    rng.sample(words_list, num_words_per_row)
    for _ in range(num_rows)
]

random_words_df = pd.DataFrame(random_words)
random_words_filepath = os.path.join(os.getcwd(), 'random_words.csv')
random_words_df.to_csv(random_words_filepath, index=False)

# Preview the generated rows and their shape.
print(f"random words:\n{random_words_df.head()}\ndata shape: {random_words_df.shape}")

random words:
    0    1    2    3    4    5    6    7    8    9    10   11   12   13   14  \
0  bno  ivw  yrt  ijl  zas  eaz  gpb  ffy  ste  ael  awt  rnc  cbs  abs  sjn   
1  rgj  ecf  ooh  kvo  bms  pbw  qgd  jnt  tni  gbl  cha  kha  prg  ugc  zoe   
2  rtk  dek  dob  uwb  dol  mkw  zad  ark  qrw  ide  jsw  afr  ibn  lhe  vlx   
3  hgx  its  aqj  ehh  oic  epe  vwr  unq  lvq  vcv  zpi  cts  kbe  mub  imw   
4  pcm  bav  nul  ios  nia  wpe  hur  gbr  ndc  ezu  hxw  hte  wid  trz  hlf   

    15   16   17   18   19  
0  lqd  ddx  hav  oem  bfy  
1  nts  oyl  bid  umg  yzz  
2  fxb  woz  uge  whj  roq  
3  ofo  obl  khr  mvl  vwb  
4  map  nku  bxd  ovd  apu  
data shape: (20, 20)
