In [25]:
# Step 1: Import libraries
import pandas as pd
import textstat
import os

In [27]:
# Step 2: Create data folder if it doesn't exist
os.makedirs("data", exist_ok=True)

# Step 3: Load dataset
df = pd.read_csv(r"C:\Users\abc\Desktop\Dyslexia-assistant\data\train.csv")
print("✅ Dataset loaded successfully")
print(df.head())

✅ Dataset loaded successfully
          id url_legal license  \
0  c12129c31       NaN     NaN   
1  85aa80a4c       NaN     NaN   
2  b69ac6792       NaN     NaN   
3  dd1000b26       NaN     NaN   
4  37c1b32fb       NaN     NaN   

                                             excerpt    target  standard_error  
0  When the young people returned to the ballroom... -0.340259        0.464009  
1  All through dinner time, Mrs. Fayre was somewh... -0.315372        0.480805  
2  As Roger had predicted, the snow departed as q... -0.580118        0.476676  
3  And outside before the palace a great garden w... -1.054013        0.450007  
4  Once upon a time there were Three Bears who li...  0.247197        0.510845  


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              2834 non-null   object 
 1   url_legal       830 non-null    object 
 2   license         830 non-null    object 
 3   excerpt         2834 non-null   object 
 4   target          2834 non-null   float64
 5   standard_error  2834 non-null   float64
dtypes: float64(2), object(4)
memory usage: 133.0+ KB


In [31]:
# Step 3: Drop unnecessary columns if they exist
columns_to_drop = ['id', 'url_legal', 'license', 'target', 'standard_error']
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')
df.head()

Unnamed: 0,excerpt
0,When the young people returned to the ballroom...
1,"All through dinner time, Mrs. Fayre was somewh..."
2,"As Roger had predicted, the snow departed as q..."
3,And outside before the palace a great garden w...
4,Once upon a time there were Three Bears who li...


In [32]:
# Step 4: Define a function to extract readability features
def extract_features(text):
    return {
        'flesch_score': textstat.flesch_reading_ease(text),
        'sentence_count': textstat.sentence_count(text),
        'words_per_sentence': textstat.words_per_sentence(text),
        'syllable_count': textstat.syllable_count(text),
        'difficult_words': textstat.difficult_words(text)
    }

In [33]:
# Step 5: Apply the feature extraction
features = df['excerpt'].apply(extract_features).apply(pd.Series)
print("✅ Features extracted. Sample:")
print(features.head(), "\n")

✅ Features extracted. Sample:
   flesch_score  sentence_count  words_per_sentence  syllable_count  \
0     79.251143            11.0           16.272727           235.0   
1     78.945814            14.0           12.071429           231.0   
2     78.125492            12.0           13.833333           225.0   
3     70.372268             5.0           32.800000           200.0   
4     79.157265             5.0           29.400000           170.0   

   difficult_words  
0             27.0  
1             18.0  
2             24.0  
3             17.0  
4              3.0   



In [34]:
# Step 6: Concatenate features back to the original DataFrame
df = pd.concat([df, features], axis=1)
df.head()

Unnamed: 0,excerpt,flesch_score,sentence_count,words_per_sentence,syllable_count,difficult_words
0,When the young people returned to the ballroom...,79.251143,11.0,16.272727,235.0,27.0
1,"All through dinner time, Mrs. Fayre was somewh...",78.945814,14.0,12.071429,231.0,18.0
2,"As Roger had predicted, the snow departed as q...",78.125492,12.0,13.833333,225.0,24.0
3,And outside before the palace a great garden w...,70.372268,5.0,32.8,200.0,17.0
4,Once upon a time there were Three Bears who li...,79.157265,5.0,29.4,170.0,3.0


In [35]:
# Step 7: Create label column: 1 = Easy, 0 = Hard based on Flesch > 40
df['label'] = df['flesch_score'].apply(lambda x: 1 if x > 40 else 0)

# Step 8: Display final dataset info
print("✅ Final DataFrame sample:")
print(df.head(), "\n")

✅ Final DataFrame sample:
                                             excerpt  flesch_score  \
0  When the young people returned to the ballroom...     79.251143   
1  All through dinner time, Mrs. Fayre was somewh...     78.945814   
2  As Roger had predicted, the snow departed as q...     78.125492   
3  And outside before the palace a great garden w...     70.372268   
4  Once upon a time there were Three Bears who li...     79.157265   

   sentence_count  words_per_sentence  syllable_count  difficult_words  label  
0            11.0           16.272727           235.0             27.0      1  
1            14.0           12.071429           231.0             18.0      1  
2            12.0           13.833333           225.0             24.0      1  
3             5.0           32.800000           200.0             17.0      1  
4             5.0           29.400000           170.0              3.0      1   



In [36]:

print("📊 Label distribution:")
print(df['label'].value_counts(), "\n")

📊 Label distribution:
label
1    2537
0     297
Name: count, dtype: int64 



In [37]:
# Step 9: Save the processed dataset
df.to_csv(r"C:\Users\abc\Desktop\Dyslexia-assistant\data\processed.csv", index=False)
print("💾 Processed data saved as: data/processed.csv")

💾 Processed data saved as: data/processed.csv


In [38]:
df.head()

Unnamed: 0,excerpt,flesch_score,sentence_count,words_per_sentence,syllable_count,difficult_words,label
0,When the young people returned to the ballroom...,79.251143,11.0,16.272727,235.0,27.0,1
1,"All through dinner time, Mrs. Fayre was somewh...",78.945814,14.0,12.071429,231.0,18.0,1
2,"As Roger had predicted, the snow departed as q...",78.125492,12.0,13.833333,225.0,24.0,1
3,And outside before the palace a great garden w...,70.372268,5.0,32.8,200.0,17.0,1
4,Once upon a time there were Three Bears who li...,79.157265,5.0,29.4,170.0,3.0,1
