In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
data=pd.read_excel("dataset_modelling.xlsx")

#map lexile score to grade level(Ordinal Variables)
#https://www.scholastic.com/parents/books-and-reading/reading-resources/book-selection-tips/lexile-levels-made-easy.html
map_dict={'BR190L - 0L':'Grade < 2',
'10L - 200L':'Grade < 2',
'210L - 400L':'Grade < 2',
'410L - 600L':'Grade 2 - Grade 4',
'610L - 800L':'Grade 2 - Grade 4',
'810L - 1000L':'Grade 2 - Grade 4',
'1010L - 1200L':'Grade > 4',
'1210L - 1400L':'Grade > 4'}

data["Grade Level"]=data["Lexile Score Range"].map(map_dict)
data[["Lexile Score Range","Grade Level"]].groupby(["Grade Level"]).count()



Unnamed: 0_level_0,Lexile Score Range
Grade Level,Unnamed: 1_level_1
Grade 2 - Grade 4,456
Grade < 2,62
Grade > 4,30


In [3]:
#converting grade level to ordinal variable for using as the target variable in modelling
map_dict_ordinal={'Grade < 2':1,
'Grade 2 - Grade 4':2,
'Grade > 4':3}

data["Lexile Score Range - Label"]=data["Grade Level"].map(map_dict_ordinal).astype(int)


In [4]:
#Baseline Model

X=data[['Word_Count', 'Sentence_Count', 'Avg_Word_Length', 'Avg_No_Word_Per_Sentence', 'Avg_Syllable_Count_Per_Word', 'No_Complex_Words', 'No_Common_Words', 'Avg_No_Complex_Words_Per_Sentence', 'Avg_No_Simple_Words_Per_Sentence', 'Ratio_Complex_Words_Per_Common_Words', 'No_Easy_Words', 'No_Difficulty_Words', 'Avg_No_Easy_Words_Per_Sentence', 'Avg_No_Difficulty_Words_Per_Sentence', 'Ratio_Difficulty_Words_Per_Easy_Words', 'Automated_Readability_Index', 'Flesch_Reading_Ease', 'FleschKincaid_Grade_Level', 'Coleman_Liau_Index', 'Gunning_Fog_Index', 'SMOG_Index', 'Linsear_Write', 'Dale_Chall_Readability', 'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SPACE', 'SYM', 'VERB', 'X']]
y=data["Lexile Score Range - Label"]

randomforestmodel=RandomForestClassifier(n_estimators=1000,random_state=42)

#Baseline Model
print("Random Forest: Baseline Model")
scores = cross_val_score(randomforestmodel, X, y, cv=4, scoring='accuracy')
print("Accuracy: ",round((scores.mean())*100,2),"%")

Random Forest: Baseline Model
Accuracy:  88.5 %


In [5]:
#Feature Selection

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.30, random_state=42,shuffle=True)

print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', X_test.shape)

print('\n')
sel = SelectFromModel(RandomForestClassifier(n_estimators = 1000, random_state=42))
sel.fit(X_train, y_train)

print("Base Model: Random Forest" )
selected_features= X_train.columns[(sel.get_support())]
print("No of features Selected: ",len(selected_features))
print("Selected Features: ",selected_features)

Training Features Shape: (383, 42)
Training Labels Shape: (165, 42)


Base Model: Random Forest
No of features Selected:  16
Selected Features:  Index(['Word_Count', 'Avg_Word_Length', 'No_Common_Words',
       'Avg_No_Complex_Words_Per_Sentence', 'No_Easy_Words',
       'Avg_No_Difficulty_Words_Per_Sentence', 'Automated_Readability_Index',
       'Flesch_Reading_Ease', 'FleschKincaid_Grade_Level',
       'Coleman_Liau_Index', 'Gunning_Fog_Index', 'SMOG_Index', 'ADP', 'CCONJ',
       'DET', 'NOUN'],
      dtype='object')


In [7]:
#Baseline model with selected features
X=data[selected_features]
randomforestmodel=RandomForestClassifier(n_estimators=1000,random_state=42)
#Baseline Model
print("Model 1 - Random Forest: Baseline Model with Selected Features")
scores = cross_val_score(randomforestmodel, X, y, cv=4, scoring='accuracy')
print("Accuracy: ",round((scores.mean())*100,2),"%")

Model 1 - Random Forest: Baseline Model with Selected Features
Accuracy:  88.5 %


In [10]:
#Random Forest - RandomizedSearch with Parameter Tuning

param_grid = {
    'bootstrap': [True, False],
    'max_depth': [4, 8, 12, 16],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 400, 600, 800,1000]
}

randomforestmodel=RandomForestClassifier(n_estimators=1000,random_state=42)

random_search = RandomizedSearchCV(estimator = randomforestmodel, param_distributions=param_grid, cv = 4, n_jobs = -1, verbose = 2)
random_search.fit(X, y)

print(random_search.best_params_)


Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   24.6s finished


{'n_estimators': 600, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 4, 'bootstrap': False}


In [11]:
#Random Forest - RandomizedSearch with Parameter Tuned Model

randomforestmodel=RandomForestClassifier(n_estimators=600,
                       random_state=42,
                       min_samples_split=12,
                       min_samples_leaf=4,
                       max_features='sqrt',
                       max_depth=4,
                       bootstrap=False)

print("Model 2- Random Forest Tuned Model with Selected Features")
scores = cross_val_score(randomforestmodel, X, y, cv=4, scoring='accuracy')
print("Accuracy: ",round((scores.mean())*100,2),"%")



Model 2- Random Forest Tuned Model with Selected Features
Accuracy:  89.42 %


In [12]:
#SVC Classifier
X=data[['Word_Count', 'Sentence_Count', 'Avg_Word_Length', 'Avg_No_Word_Per_Sentence', 'Avg_Syllable_Count_Per_Word', 'No_Complex_Words', 'No_Common_Words', 'Avg_No_Complex_Words_Per_Sentence', 'Avg_No_Simple_Words_Per_Sentence', 'Ratio_Complex_Words_Per_Common_Words', 'No_Easy_Words', 'No_Difficulty_Words', 'Avg_No_Easy_Words_Per_Sentence', 'Avg_No_Difficulty_Words_Per_Sentence', 'Ratio_Difficulty_Words_Per_Easy_Words', 'Automated_Readability_Index', 'Flesch_Reading_Ease', 'FleschKincaid_Grade_Level', 'Coleman_Liau_Index', 'Gunning_Fog_Index', 'SMOG_Index', 'Linsear_Write', 'Dale_Chall_Readability', 'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SPACE', 'SYM', 'VERB', 'X']]
y=data["Lexile Score Range - Label"]


svcmodel=SVC(C=1.0,random_state=42)
#SVC baseline model
print("Model 3- Support Vector Machine")
scores = cross_val_score(svcmodel, X, y, cv=4, scoring='accuracy')
print("Accuracy: ",round((scores.mean())*100,2),"%")


Model 3- Support Vector Machine
Accuracy:  83.21 %


In [13]:
#Xgboost Classifer

X=data[['Word_Count', 'Sentence_Count', 'Avg_Word_Length', 'Avg_No_Word_Per_Sentence', 'Avg_Syllable_Count_Per_Word', 'No_Complex_Words', 'No_Common_Words', 'Avg_No_Complex_Words_Per_Sentence', 'Avg_No_Simple_Words_Per_Sentence', 'Ratio_Complex_Words_Per_Common_Words', 'No_Easy_Words', 'No_Difficulty_Words', 'Avg_No_Easy_Words_Per_Sentence', 'Avg_No_Difficulty_Words_Per_Sentence', 'Ratio_Difficulty_Words_Per_Easy_Words', 'Automated_Readability_Index', 'Flesch_Reading_Ease', 'FleschKincaid_Grade_Level', 'Coleman_Liau_Index', 'Gunning_Fog_Index', 'SMOG_Index', 'Linsear_Write', 'Dale_Chall_Readability', 'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SPACE', 'SYM', 'VERB', 'X']]
y=data["Lexile Score Range - Label"]

xgbmodel=xgb.XGBClassifier(objective='multi:softprob',random_state=42)
#XGB baseline model
print("Model 4 - XGBoost BaseLine Model")
scores = cross_val_score(xgbmodel, X, y, cv=4, scoring='accuracy')
print("Accuracy: ",round((scores.mean())*100,2),"%")



Model 4 - XGBoost BaseLine Model
Accuracy:  87.96 %


In [14]:
#Feature Selection

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.30, random_state=42,shuffle=True)

print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', X_test.shape)

sel = SelectFromModel(xgb.XGBClassifier(objective='multi:softprob',random_state=42))
sel.fit(X_train, y_train)

selected_features= X_train.columns[(sel.get_support())]
print("No of features Selected: ",len(selected_features))
print("Selected Features: ",selected_features)


Training Features Shape: (383, 42)
Training Labels Shape: (165, 42)
No of features Selected:  11
Selected Features:  Index(['Word_Count', 'Avg_Word_Length', 'Avg_No_Complex_Words_Per_Sentence',
       'No_Difficulty_Words', 'Automated_Readability_Index',
       'FleschKincaid_Grade_Level', 'Coleman_Liau_Index', 'NOUN', 'NUM',
       'PRON', 'SCONJ'],
      dtype='object')


In [15]:
#XGBoost Baseline model with selected features
X=data[selected_features]

xgbmodel=xgb.XGBClassifier(objective='multi:softprob',random_state=42)

#XGB baseline model with selected features
print("Model 5 - XGBoost BaseLine Model with Selected Features")
scores = cross_val_score(xgbmodel, X, y, cv=4, scoring='accuracy')
print("Accuracy: ",round((scores.mean())*100,2),"%")


Model 5 - XGBoost BaseLine Model with Selected Features
Accuracy:  88.69 %


In [16]:
#XGBOOST - RandomizedSearch with Parameter Tuning

param_grid = {
    'max_depth': [4, 8, 12, 16],
    'min_child_weight':[2,4,6,8],
    'subsample': [0.4,0.6,0.8],
    'colsample_bytree': [0.4,0.6,0.8],
    'n_estimators': [200, 400, 600, 800,1000],
    'learning_rate':[0.01,0.03,0.1,0.3]   
}
xgb_search = RandomizedSearchCV(estimator = xgbmodel, param_distributions=param_grid, cv = 4, n_jobs = -1, verbose = 2)
xgb_search.fit(X, y)
print(xgb_search.best_params_)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.3s


{'subsample': 0.8, 'n_estimators': 200, 'min_child_weight': 6, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.6}


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    7.3s finished


In [17]:
xgbmodel=xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=12,
              min_child_weight=8, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)
print("Model 6- XgBoost Tuned Model with Selected Features")
scores = cross_val_score(xgbmodel, X, y, cv=4, scoring='accuracy')
print("Accuracy: ",round((scores.mean())*100,2),"%")

Model 6- XgBoost Tuned Model with Selected Features
Accuracy:  89.78 %
