In [22]:
!pip install textstat



In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from textstat import flesch_reading_ease, flesch_kincaid_grade, automated_readability_index, gunning_fog, coleman_liau_index, linsear_write_formula, dale_chall_readability_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [24]:
# Load the question dataset from an absolute Drive path (adjust when running
# elsewhere); later cells use its 'question' and 'difficulty' columns.
data = pd.read_csv('/content/drive/MyDrive/bloom for questions generation/arc_dataset.csv')


In [25]:
def extractor(string):
    """Return the question stem, i.e. the text before the first "(A)" marker.

    Args:
        string: Question text that may be followed by answer options
            starting with "(A)".

    Returns:
        Everything before the first "(A)"; the whole string unchanged when
        no "(A)" marker is present. Trailing whitespace is kept as-is.
    """
    # partition() yields the full string as its head when the separator is absent.
    stem, _marker, _options = string.partition("(A)")
    return stem


In [26]:
# Keep only the columns used downstream. The explicit .copy() gives an
# independent frame, so adding columns to `data` in later cells does not
# trigger pandas' SettingWithCopyWarning.
data = data[['question', 'difficulty']].copy()
# Strip the answer options so only the question stem is scored.
data['question'] = data['question'].map(extractor)


In [27]:
# Output column -> textstat function that fills it. Dict insertion order
# determines the order in which the columns are appended to `data`.
readability_metrics = {
    'Flesch_Reading_Ease': flesch_reading_ease,
    'Flesch_Kincaid_Grade_Level': flesch_kincaid_grade,
    'ARI': automated_readability_index,
    'Gunning_FOG_Index': gunning_fog,
    'Coleman_Liau_Index': coleman_liau_index,
    'Linsear_Write_Formula': linsear_write_formula,
    'Dale_Chall_Readability_Score': dale_chall_readability_score,
}

# Compute every metric on the question stem, one new column per metric.
for column_name, metric_fn in readability_metrics.items():
    data[column_name] = data['question'].apply(metric_fn)

In [28]:
# Preview the first rows, now including the readability-metric columns.
print(data.head())

                                            question  difficulty  \
0  Which factor will most likely cause a person t...           8   
1  Lichens are symbiotic organisms made of green ...           8   
2  When a switch is used in an electrical circuit...           5   
3  Which of the following is an example of an ass...           8   
4  Rocks are classified as igneous, metamorphic, ...           8   

   Flesch_Reading_Ease  Flesch_Kincaid_Grade_Level   ARI  Gunning_FOG_Index  \
0                76.22                         5.6   5.4               8.13   
1                51.34                         9.0   8.1               9.82   
2                84.68                         4.4   4.6               8.13   
3                60.31                         7.6   5.5              11.67   
4                46.10                        13.0  15.8              18.00   

   Coleman_Liau_Index  Linsear_Write_Formula  Dale_Chall_Readability_Score  
0                6.95                  

In [29]:
# scaler = MinMaxScaler()
# scaled_columns = ['Flesch_Reading_Ease', 'Flesch_Kincaid_Grade_Level', 'ARI', 'Gunning_FOG_Index',
#                   'Coleman_Liau_Index', 'Linsear_Write_Formula', 'Dale_Chall_Readability_Score']
# data[scaled_columns] = scaler.fit_transform(data[scaled_columns])

In [30]:
# Preview again: the MinMaxScaler cell above is commented out, so the metric
# columns are still unscaled and identical to the previous preview.
print(data.head())

                                            question  difficulty  \
0  Which factor will most likely cause a person t...           8   
1  Lichens are symbiotic organisms made of green ...           8   
2  When a switch is used in an electrical circuit...           5   
3  Which of the following is an example of an ass...           8   
4  Rocks are classified as igneous, metamorphic, ...           8   

   Flesch_Reading_Ease  Flesch_Kincaid_Grade_Level   ARI  Gunning_FOG_Index  \
0                76.22                         5.6   5.4               8.13   
1                51.34                         9.0   8.1               9.82   
2                84.68                         4.4   4.6               8.13   
3                60.31                         7.6   5.5              11.67   
4                46.10                        13.0  15.8              18.00   

   Coleman_Liau_Index  Linsear_Write_Formula  Dale_Chall_Readability_Score  
0                6.95                  

In [31]:
# Contribution of each metric column to the combined Readability_Score.
# Keys must match the metric column names in `data`; the weights sum to 1.0.
weights = dict(
    Flesch_Reading_Ease=0.2,
    Flesch_Kincaid_Grade_Level=0.15,
    ARI=0.15,
    Gunning_FOG_Index=0.1,
    Coleman_Liau_Index=0.1,
    Linsear_Write_Formula=0.15,
    Dale_Chall_Readability_Score=0.15,
)

In [32]:
# Weighted sum of the raw (unscaled) metric columns, accumulated in the order
# the weights are declared.
# NOTE(review): in the previewed rows Flesch_Reading_Ease moves opposite to the
# grade-level indices, yet every weight is positive — confirm this is intended.
weighted_total = 0
for metric, weight in weights.items():
    weighted_total = weighted_total + data[metric] * weight
data['Readability_Score'] = weighted_total

In [33]:
# Report the observed range of the combined score.
score_column = data['Readability_Score']
print(f"Minimum Readability_Score: {score_column.min()}")
print(f"Maximum Readability_Score: {score_column.max()}")


Minimum Readability_Score: 4.992500000000001
Maximum Readability_Score: 26.61200000000001


In [34]:
# Feature matrix: the seven readability-metric columns, in a fixed order.
# NOTE(review): the target is itself a fixed weighted sum of these same
# columns, so the regressor only approximates a known formula — confirm that
# this is the intended setup.
feature_columns = [
    'Flesch_Reading_Ease',
    'Flesch_Kincaid_Grade_Level',
    'ARI',
    'Gunning_FOG_Index',
    'Coleman_Liau_Index',
    'Linsear_Write_Formula',
    'Dale_Chall_Readability_Score',
]
X = data[feature_columns]
y = data['Readability_Score']


In [35]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [36]:
# Seed the forest so tree construction is deterministic: without random_state
# every run yields a slightly different model, metrics and saved pickle.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [37]:
# Predict the combined readability score for the held-out rows.
y_pred = model.predict(X_test)


In [38]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import spearmanr

In [39]:
# Evaluate the hold-out predictions.
mse = mean_squared_error(y_test, y_pred)
# Derive RMSE from MSE directly: the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and later removed, so passing it fails on current releases.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# spearmanr returns (correlation, p-value); only the correlation is reported.
spearman_corr, _ = spearmanr(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared Score (R2): {r2}")
print(f"Spearman's Correlation: {spearman_corr}")

Mean Squared Error (MSE): 0.0654831462896135
Root Mean Squared Error (RMSE): 0.2558967492751979
Mean Absolute Error (MAE): 0.1423164091631518
R-squared Score (R2): 0.9879894086366525
Spearman's Correlation: 0.9953068340669615


In [40]:
import joblib

# Persist the fitted regressor; readability() reloads it from this same path.
joblib.dump(model, '/content/drive/MyDrive/QDET/readability_regressor_model.pkl')


['/content/drive/MyDrive/QDET/readability_regressor_model.pkl']

In [41]:
import joblib

# Sample input: a question stem followed by its answer options; readability()
# passes it through extractor(), which cuts the text at "(A)".
question = "Oil, natural gas and coal are examples of … (A)Biofuels (B)Geothermal resources (C)Renewable resources (D)Fossil fuels"

def readability(question, model=None):
    """Predict the combined readability score for a multiple-choice question.

    The answer options (everything from the first "(A)" onward) are removed
    with ``extractor`` before the seven textstat metrics are computed.

    Args:
        question: Question text, optionally followed by "(A) ... (B) ..." options.
        model: Optional fitted regressor exposing ``predict``. When omitted, the
            model saved at
            '/content/drive/MyDrive/QDET/readability_regressor_model.pkl' is
            loaded on every call; pass a preloaded model to avoid repeated
            disk reads when scoring many questions.

    Returns:
        The first element of ``model.predict`` for the single-row feature frame.
    """
    if model is None:
        # joblib.load unpickles arbitrary objects: only load model files you trust.
        model = joblib.load('/content/drive/MyDrive/QDET/readability_regressor_model.pkl')

    question = extractor(question)
    print(question)  # echo the stem that is actually scored (options removed)

    # One-row DataFrame with the same column names and order as the training
    # features, so scikit-learn can match features by name instead of warning
    # about missing feature names. Naming it `features` also avoids shadowing
    # the built-in input().
    features = pd.DataFrame([{
        'Flesch_Reading_Ease': flesch_reading_ease(question),
        'Flesch_Kincaid_Grade_Level': flesch_kincaid_grade(question),
        'ARI': automated_readability_index(question),
        'Gunning_FOG_Index': gunning_fog(question),
        'Coleman_Liau_Index': coleman_liau_index(question),
        'Linsear_Write_Formula': linsear_write_formula(question),
        'Dale_Chall_Readability_Score': dale_chall_readability_score(question),
    }])
    return model.predict(features)[0]

In [42]:
# Score the sample question; readability() also prints the stem it scored.
print(readability(question))

Oil, natural gas and coal are examples of … 
19.260874999999995


