In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv(
    "readability_scores100kword.csv",
    on_bad_lines="skip"  # Abaikan baris bermasalah
)
# Menampilkan 5 baris pertama
df.head()


Unnamed: 0,s_id,sentence,word_count,avg_word_length,flesch_score
0,1,Bila perlu ia juga dapat mengambil organ dari ...,11,4.45,70.05
1,2,Dia juga bilang bahwa ramuan ini juga pernah d...,17,5.18,43.6
2,3,"Di siang hari ia tampak seperti manusia biasa,...",30,5.27,27.87
3,4,Jika kepala tersebut terpisah pada jangka wakt...,14,4.79,57.67
4,5,"Kalo sudah bgt, korban tak akan kembali lagi k...",12,3.75,88.91


In [14]:
df.isna().sum()

s_id               0
sentence           0
word_count         0
avg_word_length    0
flesch_score       0
dtype: int64

In [16]:
# Contoh data
df = pd.read_csv("readability_scores100kword.csv", on_bad_lines="skip")

# Fitur dan target
X = df[["word_count", "avg_word_length"]]
y = df["flesch_score"]

# Split train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model baseline
model = LinearRegression()
model.fit(X_train, y_train)

# Prediksi
y_pred = model.predict(X_test)

# Evaluasi
print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))
print("Koefisien:", model.coef_)
print("Intercept:", model.intercept_)


MSE: 0.005733868962991693
R²: 0.9999902275572128
Koefisien: [ -1.01529796 -28.20058629]
Intercept: 206.85036992207085


In [18]:
# 1Simpan model ke file
import joblib

# Asumsikan variabel model adalah LinearRegression yang sudah dilatih
joblib.dump(model, "readability_model.pkl")
print("Model disimpan sebagai 'readability_model.pkl'")

Model disimpan sebagai 'readability_model.pkl'


In [22]:
# Program cek skor readability dari input teks
import re

# Fungsi untuk menghitung jumlah kata, kalimat, dan suku kata
def count_syllables(word):
    word = word.lower()
    syllables = re.findall(r'[aeiouy]+', word)
    return max(1, len(syllables))

def text_stats(text):
    sentences = re.split(r'[.!?]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    words = re.findall(r'\w+', text)
    syllables = sum(count_syllables(w) for w in words)
    return {
        "words": len(words),
        "sentences": len(sentences),
        "syllables": syllables
    }

# Load model dan cek skor
model = joblib.load("readability_model.pkl")

text = input("Masukkan teks: ")
stats = text_stats(text)

# Hitung features: Words per Sentence & Syllables per Word
wps = stats["words"] / stats["sentences"]
spw = stats["syllables"] / stats["words"]

predicted_score = model.predict([[wps, spw]])[0]
print(f"Prediksi skor keterbacaan: {predicted_score:.2f}")

Prediksi skor keterbacaan: 132.29


