In [21]:
# Step 1: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os


In [13]:
# Step 2: Load processed features
df = pd.read_csv(r"C:\Users\abc\Desktop\Dyslexia-assistant\data\processed.csv")
df.head()


Unnamed: 0,excerpt,flesch_score,sentence_count,words_per_sentence,syllable_count,difficult_words,label
0,When the young people returned to the ballroom...,79.251143,11.0,16.272727,235.0,27.0,1
1,"All through dinner time, Mrs. Fayre was somewh...",78.945814,14.0,12.071429,231.0,18.0,1
2,"As Roger had predicted, the snow departed as q...",78.125492,12.0,13.833333,225.0,24.0,1
3,And outside before the palace a great garden w...,70.372268,5.0,32.8,200.0,17.0,1
4,Once upon a time there were Three Bears who li...,79.157265,5.0,29.4,170.0,3.0,1


In [14]:
# Step 3: Show label distribution
print("🔍 Original label distribution:")
print(df['label'].value_counts(), "\n")

🔍 Original label distribution:
label
1    2537
0     297
Name: count, dtype: int64 



In [16]:
# Step 4: Balance the dataset
min_count = df['label'].value_counts().min()
df_easy = df[df['label'] == 1].sample(n=min_count, random_state=42)
df_hard = df[df['label'] == 0].sample(n=min_count, random_state=42)

df_balanced = pd.concat([df_easy, df_hard], ignore_index=True).sample(frac=1, random_state=42)
print("⚖️ Dataset balanced.")
print(df_balanced['label'].value_counts(), "\n")

⚖️ Dataset balanced.
label
1    297
0    297
Name: count, dtype: int64 



In [17]:
# Step 5: Define features and target
feature_cols = ['flesch_score', 'sentence_count', 'words_per_sentence', 'syllable_count', 'difficult_words']
X = df_balanced[feature_cols]
y = df_balanced['label']

In [18]:
# Step 6: Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"📚 Train samples: {len(X_train)}, 🧪 Test samples: {len(X_test)}\n")

📚 Train samples: 475, 🧪 Test samples: 119



In [19]:
# Step 7: Train the model
model = LogisticRegression()
model.fit(X_train, y_train)
print("✅ Model training complete.\n")

✅ Model training complete.



In [22]:
# Step 8: Evaluate the model
y_pred = model.predict(X_test)

print("📈 Classification Report:")
print(classification_report(y_test, y_pred))

print("🧩 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

📈 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        58
           1       1.00      1.00      1.00        61

    accuracy                           1.00       119
   macro avg       1.00      1.00      1.00       119
weighted avg       1.00      1.00      1.00       119

🧩 Confusion Matrix:
[[58  0]
 [ 0 61]]


In [24]:
# Step 9: Save the model.
os.makedirs("models", exist_ok=True)
joblib.dump(model, r"C:\Users\abc\Desktop\Dyslexia-assistant\model\readability_model.pkl")
print("💾 Model saved to models/readability_model.pkl")

💾 Model saved to models/readability_model.pkl
