# ***Reformatting Dataset***

Import Packages

In [2]:
import pandas as pd

In [3]:
# Read the raw puzzle export (one row per word) into a DataFrame.
data = pd.read_csv("train_data.csv")
# Preview the first 16 rows (presumably one complete 4x4 puzzle; TODO confirm).
data.head(16)

Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column
0,1,2023-06-12,SNOW,WET WEATHER,0,1,1
1,1,2023-06-12,LEVEL,PALINDROMES,3,1,2
2,1,2023-06-12,SHIFT,KEYBOARD KEYS,2,1,3
3,1,2023-06-12,KAYAK,PALINDROMES,3,1,4
4,1,2023-06-12,HEAT,NBA TEAMS,1,2,1
5,1,2023-06-12,TAB,KEYBOARD KEYS,2,2,2
6,1,2023-06-12,BUCKS,NBA TEAMS,1,2,3
7,1,2023-06-12,RETURN,KEYBOARD KEYS,2,2,4
8,1,2023-06-12,JAZZ,NBA TEAMS,1,3,1
9,1,2023-06-12,HAIL,WET WEATHER,0,3,2


In [10]:
# Keep only the columns the reformatting needs. Selecting by label raises a
# KeyError when an expected column is absent, whereas
# pd.DataFrame(data, columns=...) would silently add it as an all-NaN column.
# The .copy() keeps later edits to `df` from touching `data`.
df = data[['Game ID', 'Word', 'Group Name', 'Group Level', 'Starting Row', 'Starting Column']].copy()

# Function to split words into chunks of 4
def split_into_chunks(words, chunk_size=4):
    """Cut ``words`` into consecutive slices of exactly ``chunk_size`` items.

    Args:
        words: Sliceable sequence (a list of words in this notebook).
        chunk_size: Number of items per chunk; defaults to 4.

    Returns:
        A list of slices taken left to right. A trailing remainder shorter
        than ``chunk_size`` is left out.
    """
    # A full chunk can only begin at or before this offset.
    last_full_start = len(words) - chunk_size
    chunks = []
    for start in range(0, last_full_start + 1, chunk_size):
        chunks.append(words[start:start + chunk_size])
    return chunks

# Collect the words of every (Group Name, Game ID) pair into one list per pair
grouped = (
    df.groupby(['Group Name', 'Game ID'])['Word']
    .apply(list)
    .reset_index()
)

# Break each word list into complete 4-word chunks
grouped['Words'] = grouped['Word'].apply(split_into_chunks)

# One output row per chunk: the game id followed by the chunk's four words
expanded_rows = [
    [game_id] + chunk
    for game_id, chunks in zip(grouped['Game ID'], grouped['Words'])
    for chunk in chunks
]

# Turn the flat rows into a table with one column per word
expanded_df = pd.DataFrame(expanded_rows, columns=['Game ID', 'word1', 'word2', 'word3', 'word4'])

# Function to check for spaces in words and filter out groups with spaces
def drop_groups_with_spaces(df):
    """Return the rows of ``df`` in which none of the four words contains a space.

    Args:
        df: DataFrame with columns ``word1`` .. ``word4``; any other columns
            are carried through unchanged.

    Returns:
        A new DataFrame with NaN values replaced by empty strings and every
        row removed where at least one word contains a space. The original
        index labels are kept and the input frame is not modified.
    """
    word_columns = ['word1', 'word2', 'word3', 'word4']

    # Missing words become empty strings, which never contain a space.
    filled = df.fillna('')

    # Build the mask as an explicit bool Series: this stays well-typed for an
    # empty frame (a row-wise apply would yield a float Series that cannot be
    # negated), and str() lets non-string cells such as numbers be checked.
    keep = pd.Series(
        [
            not any(' ' in str(word) for word in row)
            for row in filled[word_columns].itertuples(index=False, name=None)
        ],
        index=filled.index,
        dtype=bool,
    )
    return filled[keep]

# Remove every 4-word group in which at least one word contains a space
expanded_df_cleaned = drop_groups_with_spaces(expanded_df)

# Distinct Game IDs among the cleaned rows
groups = expanded_df_cleaned['Game ID'].unique()

# Position that separates the first 80% of the ids from the rest
split_index = int(len(groups) * 0.8)

# Partition the ids: first 80% for training, remaining 20% for testing
train_game_ids, test_game_ids = groups[:split_index], groups[split_index:]

# Select the rows that belong to each partition
in_train = expanded_df_cleaned['Game ID'].isin(train_game_ids)
in_test = expanded_df_cleaned['Game ID'].isin(test_game_ids)
train_data = expanded_df_cleaned[in_train]
test_data = expanded_df_cleaned[in_test]

# Trim trailing training rows so the row count is a multiple of 4
train_data = train_data.head(len(train_data) // 4 * 4)

# Write both partitions without the index column
train_data.to_csv("trainData2.csv", index=False)
test_data.to_csv("testData2.csv", index=False)

# Show the test rows
print(test_data)


      Game ID     word1  word2   word3     word4
704       111      WOOD   IRON   WEDGE    PUTTER
705       185     ROUGH  GREEN  BUNKER   FAIRWAY
711       311   GROUNDS  GREEN    LAWN     FIELD
715        62        NU     PI      XI        MU
718       406  ELECTRIC    GAS    WOOD  CHARCOAL
...       ...       ...    ...     ...       ...
2051       21        JO    MEG    BETH       AMY
2056      224      RATS  SHOOT    DARN    CURSES
2060      404     QUIET   EASY  ENOUGH     RELAX
2065      157        SI     DA      JA       HAI
2066      185     TOUGH  DOUGH   COUGH     BOUGH

[386 rows x 5 columns]


In [11]:
# Game IDs that still have at least one group after the space filter
kept_game_ids = expanded_df_cleaned['Game ID'].unique()

# Restrict the per-word table to those games
df = df.loc[df['Game ID'].isin(kept_game_ids)]

# Print both id lists so they can be compared by eye
print(df['Game ID'].unique())
print(kept_game_ids)

# Save the filtered per-word table (the index is written as the first column)
df.to_csv("Training2.csv")

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  24  25  26  27  28  29  30  31  32  33  34  35  36  37
  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  79  80  81  82  83  84  85  86  87  88  89  90  91  92
  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110
 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 189
 187 188 185 184 183 186 182 190 191 192 193 194 195 196 197 198 199 200
 201 202 203 204 205 206 208 209 210 213 212 214 215 216 217 218 222 220
 221 219 223 224 225 226 227 228 229 230 231 233 234 235 236 239 238 240
 241 242 243 256 249 248 253 244 255 251 252 250 25