In [5]:
import gc
import os
from math import exp
from collections import Counter
from typing import List, Optional, Union
import numpy as np
import pandas as pd

In [11]:
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'




class ParticipantVisibleError(Exception):
    pass

In [13]:
df = pd.read_csv('sample_submission.csv')

In [19]:
df.head()

Unnamed: 0,id,text
0,0,advent chimney elf family fireplace gingerbrea...
1,1,advent chimney elf family fireplace gingerbrea...
2,2,yuletide decorations gifts cheer holiday carol...
3,3,yuletide decorations gifts cheer holiday carol...
4,4,hohoho candle poinsettia snowglobe peppermint ...


In [21]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6 non-null      int64 
 1   text    6 non-null      object
dtypes: int64(1), object(1)
memory usage: 228.0+ bytes


In [23]:
df.head()

Unnamed: 0,id,text
0,0,advent chimney elf family fireplace gingerbrea...
1,1,advent chimney elf family fireplace gingerbrea...
2,2,yuletide decorations gifts cheer holiday carol...
3,3,yuletide decorations gifts cheer holiday carol...
4,4,hohoho candle poinsettia snowglobe peppermint ...


In [25]:
# Define a function for preprocessing
def preprocess_text(text):
    # Tokenize by splitting into words
    words = text.split()
    # Clean each word (strip whitespace, etc.)
    cleaned_words = [word.strip() for word in words]
    return cleaned_words

# Apply preprocessing to the 'text' column
df['tokenized_text'] = df['text'].apply(preprocess_text)

# Verify the result
print(df[['id', 'text', 'tokenized_text']].head())


   id                                               text  \
0   0  advent chimney elf family fireplace gingerbrea...   
1   1  advent chimney elf family fireplace gingerbrea...   
2   2  yuletide decorations gifts cheer holiday carol...   
3   3  yuletide decorations gifts cheer holiday carol...   
4   4  hohoho candle poinsettia snowglobe peppermint ...   

                                      tokenized_text  
0  [advent, chimney, elf, family, fireplace, ging...  
1  [advent, chimney, elf, family, fireplace, ging...  
2  [yuletide, decorations, gifts, cheer, holiday,...  
3  [yuletide, decorations, gifts, cheer, holiday,...  
4  [hohoho, candle, poinsettia, snowglobe, pepper...  


In [27]:
# Baseline model function
def unscramble_words(text):
    # Sort letters in each word alphabetically
    unscrambled = " ".join("".join(sorted(word)) for word in text.split())
    return unscrambled

# Apply the baseline model
df['unscrambled_text'] = df['text'].apply(unscramble_words)

# Preview the result
print(df[['id', 'unscrambled_text']].head())


   id                                   unscrambled_text
0   0  adentv cehimny efl afilmy aceefilpr abdeegginr...
1   1  adentv cehimny efl afilmy aceefilpr abdeegginr...
2   2  deeiltuy acdeinoorst fgist ceehr adhiloy aclor...
3   3  deeiltuy acdeinoorst fgist ceehr adhiloy aclor...
4   4  hhhooo acdeln aeiinopstt beglnoosw eeimnppprt ...


In [29]:
output_file = 'processed_submission.csv'
df.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")


Processed data saved to processed_submission.csv


In [31]:
import nltk
from nltk.corpus import words

# Download the word list (only needs to be done once)
nltk.download('words')

# Create a set of valid English words
word_list = set(words.words())

# Function to validate unscrambled words against the dictionary
def validate_words(text):
    words = text.split()
    valid_words = [word for word in words if word in word_list]
    return " ".join(valid_words)

# Apply the validation to the baseline unscrambled text
df['validated_text'] = df['unscrambled_text'].apply(validate_words)

# Preview the validated results
print(df[['id', 'unscrambled_text', 'validated_text']].head())


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


   id                                   unscrambled_text        validated_text
0   0  adentv cehimny efl afilmy aceefilpr abdeegginr...                      
1   1  adentv cehimny efl afilmy aceefilpr abdeegginr...                      
2   2  deeiltuy acdeinoorst fgist ceehr adhiloy aclor...                      
3   3  deeiltuy acdeinoorst fgist ceehr adhiloy aclor...                    is
4   4  hhhooo acdeln aeiinopstt beglnoosw eeimnppprt ...  joy bow in it not as


In [33]:
from collections import Counter

# Build a unigram (single character) frequency model from valid words
unigram_model = Counter("".join(word_list))

# Function to score unscrambled text based on character frequency
def score_words(text):
    score = 0
    for word in text.split():
        score += sum(unigram_model[char] for char in word)
    return score

# Apply the scoring model
df['unscrambled_score'] = df['unscrambled_text'].apply(score_words)

# Preview the scored results
print(df[['id', 'unscrambled_text', 'unscrambled_score']].head())


   id                                   unscrambled_text  unscrambled_score
0   0  adentv cehimny efl afilmy aceefilpr abdeegginr...           10907427
1   1  adentv cehimny efl afilmy aceefilpr abdeegginr...           15991039
2   2  deeiltuy acdeinoorst fgist ceehr adhiloy aclor...           17464289
3   3  deeiltuy acdeinoorst fgist ceehr adhiloy aclor...           22913751
4   4  hhhooo acdeln aeiinopstt beglnoosw eeimnppprt ...           33907347


In [45]:
submission_file = 'submission 2.csv'
submission_df.to_csv(submission_file, index=False)

print(f"Submission file saved as {submission_file}")

Submission file saved as submission 2.csv


In [35]:
import random

# Generate simulated scrambled-unscrambled pairs
def scramble_word(word):
    return ''.join(random.sample(word, len(word)))

unscrambled_words = ['holiday', 'cheer', 'snow', 'gift', 'tree']
scrambled_words = [scramble_word(word) for word in unscrambled_words]

# Create a DataFrame
data = pd.DataFrame({'scrambled': scrambled_words, 'unscrambled': unscrambled_words})
print(data)


  scrambled unscrambled
0   odahiyl     holiday
1     crehe       cheer
2      onws        snow
3      figt        gift
4      rtee        tree


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the scrambled words
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
X = vectorizer.fit_transform(data['scrambled'])
y = data['unscrambled']


In [39]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model
model = RandomForestClassifier()
model.fit(X, y)


In [41]:
# Predict on new scrambled words
new_scrambled = ['odliyah', 'heerch', 'onsw']
X_new = vectorizer.transform(new_scrambled)
predictions = model.predict(X_new)

# Display predictions
for s, p in zip(new_scrambled, predictions):
    print(f"Scrambled: {s}, Prediction: {p}")


Scrambled: odliyah, Prediction: gift
Scrambled: heerch, Prediction: tree
Scrambled: onsw, Prediction: snow


In [43]:
import pandas as pd

# Assuming you have predictions and their IDs in the following lists
submission_data = {
    'id': [1, 2, 3],  # Replace with actual IDs from your dataset
    'unscrambled_text': ['holiday', 'cheer', 'snow'],  # Replace with your model's predictions
}

# Create a DataFrame
submission_df = pd.DataFrame(submission_data)

# Save to a CSV file
submission_file = 'submission.csv'
submission_df.to_csv(submission_file, index=False)

print(f"Submission file saved as {submission_file}")


Submission file saved as submission.csv


In [47]:
import random
import pandas as pd

# Generate simulated scrambled-unscrambled pairs
def scramble_word(word):
    return ''.join(random.sample(word, len(word)))

unscrambled_words = ['holiday', 'cheer', 'snow', 'gift', 'tree']
scrambled_words = [scramble_word(word) for word in unscrambled_words]

# Create a DataFrame
data = pd.DataFrame({'scrambled': scrambled_words, 'unscrambled': unscrambled_words})
print(data)

  scrambled unscrambled
0   oyadilh     holiday
1     receh       cheer
2      osnw        snow
3      iftg        gift
4      erte        tree


In [49]:
output_file = 'processed_submission 2.csv'
df.to_csv(output_file, index=False)
print(f"Processed data saved to {output_file}")

Processed data saved to processed_submission 2.csv
