In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("/content/drive/MyDrive/bloom for questions generation/arc_dataset.csv")

selected_columns = ['originalQuestionID', 'question', 'AnswerKey', 'isMultipleChoiceQuestion', 'includesDiagram', 'year', 'difficulty']
df = df[selected_columns]
print(df)

     originalQuestionID                                           question  \
0               7220990  Which factor will most likely cause a person t...   
1                  5189  Lichens are symbiotic organisms made of green ...   
2                401169  When a switch is used in an electrical circuit...   
3                    27  Which of the following is an example of an ass...   
4                    10  Rocks are classified as igneous, metamorphic, ...   
...                 ...                                                ...   
7782             415480  Which change would most likely increase the nu...   
7783            7172795  The skin is the largest organ in the human bod...   
7784                 59  Which food provides the most energy for the bo...   
7785            7219643  Screech owls have two color variations-red and...   
7786             412487  A scientist is measuring the amount of movemen...   

     AnswerKey  isMultipleChoiceQuestion  includesDiagram  year

In [3]:
df['QO'] = df['question'].str.split(' \([A-Z]\)', n=0).str[0]
df['QC'] = df['question'].str.split(' \([A-Z]\)', n=0).str[0] + ' ' + df['AnswerKey']
df['QA'] = df['question']

In [4]:
# Initialize an empty dictionary to store models and vectorizers
models_dict = {}

In [5]:
encoding_approaches = ['QO', 'QC', 'QA']

# Train models for each encoding approach and save them
for approach in encoding_approaches:
    X = df[approach]
    y = df['difficulty']

    vectorizer = TfidfVectorizer(norm='l2')
    X = vectorizer.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf_model = RandomForestRegressor(n_estimators=100, max_depth=10)
    scores = cross_val_score(rf_model, X_train, y_train, cv=5)
    print(f"Cross-Validation MSE ({approach}): {abs(scores.mean())}")

    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error ({approach}): {mse}")

    # Save the model and vectorizer
    models_dict[approach + '_model'] = rf_model
    models_dict[approach + '_vectorizer'] = vectorizer


Cross-Validation MSE (QO): 0.11221742187387836
Mean Squared Error (QO): 2.710690657885869
Cross-Validation MSE (QC): 0.11171434279732091
Mean Squared Error (QC): 2.711815395152403
Cross-Validation MSE (QA): 0.12702277290192204
Mean Squared Error (QA): 2.6199145108624466


In [6]:
print(models_dict)

{'QO_model': RandomForestRegressor(max_depth=10), 'QO_vectorizer': TfidfVectorizer(), 'QC_model': RandomForestRegressor(max_depth=10), 'QC_vectorizer': TfidfVectorizer(), 'QA_model': RandomForestRegressor(max_depth=10), 'QA_vectorizer': TfidfVectorizer()}


In [7]:
print(models_dict['QA_model'])
print(models_dict['QA_vectorizer'])

RandomForestRegressor(max_depth=10)
TfidfVectorizer()


In [8]:
import joblib

joblib.dump(models_dict['QA_model'], '/content/drive/MyDrive/QDET/TFIDF_model.joblib')
joblib.dump(models_dict['QA_vectorizer'],'/content/drive/MyDrive/QDET/TFIDF_QA_vectorizer.joblib' )

['/content/drive/MyDrive/QDET/TFIDF_QA_vectorizer.joblib']