In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Load the word-level dataset and print the number of missing values per column.
# NOTE(review): the recorded output shows one missing `Word`. pandas' default NA
# parsing turns literal strings such as "null", "NA" or "nan" into NaN, so that
# row may be a real word rather than a blank cell -- confirm against the CSV.
data = pd.read_csv('WordDifficulty.csv')
print(data.isna().sum())
# Keep only fully populated rows: a missing value in any column removes the row.
data = data.dropna(axis=0, how='any')

Word                1
Length              0
Freq_HAL            0
Log_Freq_HAL        0
I_Mean_RT          13
I_Zscore           13
I_SD               31
Obs                13
I_Mean_Accuracy     0
dtype: int64


In [3]:
# Binary target: 1 when the mean accuracy is below 0.64, otherwise 0.
# (predict_word_difficulty reports label 1 as "hard" and label 0 as "easy".)
data['label'] = data['I_Mean_Accuracy'].lt(0.64).astype(int)

In [4]:
# Model input columns. The order matters: compute_features() returns its values
# in exactly this order when it builds a row for a word missing from `data`.
features = [
    'Length',
    'Log_Freq_HAL',
    'I_Mean_RT',
    'I_Zscore',
    'I_SD',
    'Obs',
]

In [5]:
# Feature matrix (the columns named in `features`) and the binary target column.
X, y = data[features], data['label']

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the recorded report shows
# imbalanced classes (6914 vs 1176 test rows) -- consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Gradient-boosted tree classifier; the seed fixes the model's own randomness.
gbm = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)

In [8]:
# Sanity check after dropna(): every column is expected to report zero missing values.
print(data.isna().sum())


Word               0
Length             0
Freq_HAL           0
Log_Freq_HAL       0
I_Mean_RT          0
I_Zscore           0
I_SD               0
Obs                0
I_Mean_Accuracy    0
label              0
dtype: int64


In [8]:
# Train the classifier on the training portion of the split.
gbm.fit(X=X_train, y=y_train)

In [9]:
# Predicted labels for the held-out rows.
y_pred = gbm.predict(X=X_test)

In [10]:
# Overall accuracy on the held-out rows, followed by per-class precision/recall/F1.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.9919653893695921
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      6914
           1       0.97      0.98      0.97      1176

    accuracy                           0.99      8090
   macro avg       0.98      0.99      0.98      8090
weighted avg       0.99      0.99      0.99      8090



In [11]:
# Disabled example: predicting from one hand-written feature row
# (six values, presumably in the order of `features` -- confirm before re-enabling).
# new_word_features = [[1,16.18, 798.92, -0.01, 333.85, 24]]  
# predicted_label = gbm.predict(new_word_features)
# print(f'The word is predicted to be {"hard" if predicted_label[0] == 1 else "easy"}')

# Word-frequency table; compute_features() reads its 'word' and 'count' columns.
frequency_data = pd.read_csv('WordFrequency.csv')

In [12]:
import pandas as pd
import numpy as np
import re  
def compute_features(new_word: str, frequency_df: pd.DataFrame, mean_rt: float,
                     mean_sd: float, scaling_factor: float) -> list:
    """Build a model input row for a word that has no measured statistics.

    The returned values follow the order of the notebook's ``features`` list:
    ``[Length, Log_Freq_HAL, I_Mean_RT, I_Zscore, I_SD, Obs]``.

    Parameters
    ----------
    new_word : str
        The word to describe; its character count becomes the length feature.
    frequency_df : pandas.DataFrame
        Frequency table with a ``'word'`` column and a ``'count'`` column.
    mean_rt : float
        Value placed in the ``I_Mean_RT`` position, passed through unchanged.
    mean_sd : float
        Value placed in the ``I_SD`` position, passed through unchanged.
    scaling_factor : float
        Positive multiplier applied to the raw count before taking the
        natural logarithm.

    Returns
    -------
    list
        ``[length, log_freq, mean_rt, 0, mean_sd, 1]``.

    Raises
    ------
    ValueError
        If ``scaling_factor`` is not a positive number; the logarithm would
        otherwise be ``-inf`` or ``NaN``.
    """
    if not scaling_factor > 0:
        raise ValueError(
            f"scaling_factor must be a positive number, got {scaling_factor!r}"
        )

    length = len(new_word)

    # A boolean mask avoids rebuilding an index over the whole table on every
    # call, and always yields a scalar after summing, even when a word appears
    # in several rows (their counts are added together).
    matching_counts = frequency_df.loc[frequency_df['word'] == new_word, 'count']
    freq_hal = matching_counts.sum()

    # Words that are absent, or whose count is zero/missing, are treated as
    # seen once so the logarithm stays finite.
    if not freq_hal > 0:
        freq_hal = 1

    log_freq_hal = np.log(freq_hal * scaling_factor)

    # No measurements exist for such a word: the z-score slot is fixed at 0
    # and the observation-count slot at 1.
    zscore = 0
    obs = 1

    return [length, log_freq_hal, mean_rt, zscore, mean_sd, obs]

def predict_word_difficulty(new_word, scaling_factor=None):
    """Classify a single word as ``"hard"`` or ``"easy"`` with the fitted model.

    Words present in ``data['Word']`` use the feature values of their first
    matching row. Other words get a row built by ``compute_features`` from the
    frequency table, with the dataset means of ``I_Mean_RT`` and ``I_SD``.

    Parameters
    ----------
    new_word : str
        The word to classify.
    scaling_factor : float, optional
        Multiplier applied to the raw frequency count of words missing from
        ``data``. When omitted, a notebook-level ``scaling_factor`` variable is
        used if one is defined; otherwise 1.0 (no rescaling) is used.
        NOTE(review): no cell of the notebook defines ``scaling_factor``, so
        the intended value is unknown -- confirm it and pass it explicitly.

    Returns
    -------
    str
        ``"hard"`` when the model predicts label 1, otherwise ``"easy"``.
    """
    if scaling_factor is None:
        # The original body read an undefined global and raised NameError on
        # a fresh kernel; honour that global when present, else do not rescale.
        scaling_factor = globals().get('scaling_factor', 1.0)

    # Single scan of the Word column serves both the membership test and the lookup.
    known_rows = data.loc[data['Word'] == new_word, features]

    if not known_rows.empty:
        model_input = known_rows.iloc[[0]]
    else:
        mean_rt = data['I_Mean_RT'].mean()
        mean_sd = data['I_SD'].mean()
        computed = compute_features(new_word, frequency_data, mean_rt, mean_sd, scaling_factor)
        model_input = pd.DataFrame([computed], columns=features)

    # The model was fitted on a DataFrame; predicting on a frame with the same
    # column names avoids scikit-learn's missing-feature-names warning.
    predicted_label = gbm.predict(model_input)

    return "hard" if predicted_label[0] == 1 else "easy"


def predict_paragraph_difficulty(paragraph):
    """Classify every distinct word of a paragraph.

    The paragraph is lower-cased and split into runs of word characters
    (letters, digits and underscore), so punctuation is discarded.

    Parameters
    ----------
    paragraph : str
        Text to analyse.

    Returns
    -------
    dict
        Maps each distinct lower-cased word, in order of first appearance, to
        the string returned by ``predict_word_difficulty`` for it.
    """
    words = re.findall(r'\b\w+\b', paragraph.lower())

    # dict.fromkeys() drops repeats while keeping first-occurrence order, so
    # each distinct word is looked up and classified once, not once per
    # occurrence. The resulting mapping is the same.
    return {word: predict_word_difficulty(word) for word in dict.fromkeys(words)}

# Demo: classify the distinct words of a sample sentence.
paragraph = "This is an example paragraph with onomatopoeia words that may or may not be difficult."
word_difficulties = predict_paragraph_difficulty(paragraph=paragraph)

# One output line per distinct word, in order of first appearance.
for word, difficulty in word_difficulties.items():
    print(f'Word: "{word}", Difficulty: {difficulty}')

Word: "this", Difficulty: easy
Word: "is", Difficulty: easy
Word: "an", Difficulty: easy
Word: "example", Difficulty: easy
Word: "paragraph", Difficulty: easy
Word: "with", Difficulty: easy
Word: "onomatopoeia", Difficulty: hard
Word: "words", Difficulty: easy
Word: "that", Difficulty: easy
Word: "may", Difficulty: easy
Word: "or", Difficulty: easy
Word: "not", Difficulty: easy
Word: "be", Difficulty: easy
Word: "difficult", Difficulty: easy




In [13]:
import joblib

# Persist the fitted classifier to disk. The call is left as a bare expression
# so the cell still displays the list of written file names that joblib returns.
# NOTE(review): the file is a pickle -- only load it from sources you trust.
joblib.dump(value=gbm, filename='trained_gbm_model.pkl')

['trained_gbm_model.pkl']