In [47]:
!pip install textstat



In [48]:
import joblib
import numpy as np
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import defaultdict
from collections import Counter
from sklearn.preprocessing import StandardScaler
from textstat import flesch_reading_ease, flesch_kincaid_grade, automated_readability_index, gunning_fog, coleman_liau_index, linsear_write_formula, dale_chall_readability_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer

# Fetch the NLTK data packages needed at runtime. The notebook later calls
# word_tokenize and pos_tag (see count_pos_tags).
# NOTE(review): presumably 'punkt' backs word_tokenize and
# 'averaged_perceptron_tagger' backs pos_tag — confirm the resource names
# against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [49]:
# Load the three serialized models from the mounted Drive folder.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# files that come from a trusted source.
model1 = joblib.load('/content/drive/MyDrive/QDET/linguistic_regressor.pkl')  # used by linguistic_score
model2 = joblib.load('/content/drive/MyDrive/QDET/readability_regressor_model.pkl')  # used by readability_score
model3 = joblib.load('/content/drive/MyDrive/QDET/TFIDF_model.joblib')  # used by tfidf_score

In [50]:
def scale_value(value, old_min, old_max, new_min, new_max):
    """Linearly map ``value`` from one interval onto another.

    Args:
        value: Number to rescale.
        old_min: Lower bound of the source interval.
        old_max: Upper bound of the source interval.
        new_min: Lower bound of the target interval.
        new_max: Upper bound of the target interval.

    Returns:
        ``value`` expressed on the ``[new_min, new_max]`` scale. Values outside
        the source interval are extrapolated, not clipped.
    """
    distance_from_old_min = value - old_min
    new_span = new_max - new_min
    old_span = old_max - old_min
    # Multiply before dividing, then shift onto the target interval.
    return distance_from_old_min * new_span / old_span + new_min

In [51]:
def count_pos_tags(dataframe, question_column):
    """Add one count column per part-of-speech tag found in a text column.

    Every value in ``dataframe[question_column]`` is tokenized with
    ``word_tokenize`` and tagged with ``pos_tag``. For each distinct tag seen
    anywhere in the column, a column named after the tag is added; its value
    is the number of tokens in that row carrying the tag.

    Args:
        dataframe: pandas DataFrame holding the text. It is modified in place.
        question_column: Name of the column containing the text to tag.

    Returns:
        The same ``dataframe`` object, with the tag-count columns added.
    """
    # Tokenize and tag each row exactly once. Tagging is the expensive step,
    # so the per-row counts are kept and reused for every tag column instead
    # of re-tagging the text once per distinct tag.
    per_row_counts = [
        Counter(tag for _, tag in pos_tag(word_tokenize(text)))
        for text in dataframe[question_column]
    ]

    unique_pos_tags = set().union(*per_row_counts)

    # Sorted so the resulting column order does not depend on set iteration.
    for tag_name in sorted(unique_pos_tags):
        # A Counter yields 0 for tags that do not occur in a given row.
        dataframe[tag_name] = [counts[tag_name] for counts in per_row_counts]

    return dataframe

In [52]:
def linguistic_score(text):
    """Score a question's difficulty from surface linguistic features.

    Features are the per-tag part-of-speech counts from ``count_pos_tags``
    plus the word count, a sentence count (number of pieces after splitting
    on "."), and the average word length. They are arranged in the column
    order stored in ``tags_order.csv``, passed to ``model1``, and the raw
    prediction is rescaled from the range [1, 170] to [1, 10].

    Args:
        text: The question. Non-string values are converted with ``str``.

    Returns:
        The rescaled prediction for the single input row.
    """
    # Normalize once so every feature is computed from the same string.
    text = str(text)
    words = text.split()
    number_of_words = len(words)
    number_of_sentences = len(text.split("."))
    # Empty or whitespace-only text has no words; avoid dividing by zero.
    avg_word_length = (
        sum(len(word) for word in words) / number_of_words if words else 0.0
    )

    question_frame = pd.DataFrame({"question": [text]})
    tags = count_pos_tags(question_frame, "question")
    tags["number_of_words"] = number_of_words
    tags["number_of_sentences"] = number_of_sentences
    tags["avg_word_length"] = avg_word_length

    # Only the header of the CSV is needed: it fixes the feature column order.
    template = pd.read_csv("/content/drive/MyDrive/QDET/tags_order.csv", nrows=0)

    # Lay the computed features out in the template's column order; template
    # columns with no computed value become 0, and computed columns that are
    # not in the template are discarded.
    features = tags.reindex(columns=template.columns, fill_value=0)
    # The first template column is not a model feature.
    # NOTE(review): presumably a saved index/id column — confirm.
    features = features.iloc[:, 1:]

    y_pred = model1.predict(features)

    # NOTE(review): 1..170 is assumed to be the raw output range of model1.
    return scale_value(y_pred[0], 1, 170, 1, 10)


In [53]:
def readability_score(question):
    """Score a question's difficulty from textstat readability metrics.

    Seven readability metrics are computed for ``question`` in a fixed order,
    passed to ``model2`` as a single-row feature matrix, and the raw
    prediction is rescaled from the range [5, 30] to [1, 10].

    Args:
        question: The question text.

    Returns:
        The rescaled prediction for the single input row.
    """
    # The order of this tuple is the feature order given to the model.
    metrics = (
        flesch_reading_ease,
        flesch_kincaid_grade,
        automated_readability_index,
        gunning_fog,
        coleman_liau_index,
        linsear_write_formula,
        dale_chall_readability_score,
    )
    feature_row = [metric(question) for metric in metrics]

    # Wrap the row in a list: predict receives a 2-D, one-sample input.
    raw_prediction = model2.predict([feature_row])[0]
    return scale_value(raw_prediction, 5, 30, 1, 10)

In [54]:
def tfidf_score(question):
    """Predict a question's difficulty from its TF-IDF representation.

    The saved vectorizer transforms ``question`` into a feature matrix that
    is passed to ``model3``. Unlike the other scorers in this notebook, the
    prediction is returned as-is, without rescaling.

    Args:
        question: The question text.

    Returns:
        The prediction of ``model3`` for the single input row.
    """
    # Deserialize the vectorizer only on first use and keep it on the
    # function object; reloading it from disk for every question is wasted
    # I/O. Re-running this cell redefines the function and clears the cache.
    # NOTE(review): joblib.load unpickles arbitrary objects — load trusted
    # files only.
    vectorizer = getattr(tfidf_score, "_vectorizer", None)
    if vectorizer is None:
        vectorizer = joblib.load('/content/drive/MyDrive/QDET/TFIDF_QA_vectorizer.joblib')
        tfidf_score._vectorizer = vectorizer

    X = vectorizer.transform([question])
    difficulty_prediction = model3.predict(X)[0]

    return difficulty_prediction

In [63]:
def final_prediction(question):
    """Combine the three difficulty scorers into a single score.

    Args:
        question: The question text.

    Returns:
        The unweighted mean of ``linguistic_score``, ``readability_score``
        and ``tfidf_score`` for ``question``.
    """
    # Scorers run in this order; each contributes equally to the mean.
    scorers = (linguistic_score, readability_score, tfidf_score)
    component_scores = [scorer(question) for scorer in scorers]
    return np.mean(component_scores, axis=0)

In [66]:

# Example: score a single question and report it on the 10-point scale.
question = "what is happening to the environment that is causing gloabal warming"

difficulty = np.round(final_prediction(question), decimals=3)
print(f"Difficulty: {difficulty}/10")

Difficulty: 5.375/10


