In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score, r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Download NLTK resources
nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load spaCy's medium-sized English language model
nlp = spacy.load("en_core_web_md")


In [4]:
# Calculate cosine similarity between student answer, examiner answer, and comprehension
def calculate_combined_similarity(student_answer, examiner_answer, comprehension, weights):
    # Check if any of the input text strings are empty
    if not student_answer or not examiner_answer or not comprehension:
        return 0.0  # Return zero similarity if any input text string is empty
    
    # Calculate similarity between student answer and examiner answer
    similarity_examiner = nlp(student_answer).similarity(nlp(examiner_answer))
    
    # Calculate similarity between student answer and comprehension
    similarity_comprehension = nlp(student_answer).similarity(nlp(comprehension))
    
    # Combine similarity scores using weights
    combined_similarity = (weights['examiner'] * similarity_examiner) + (weights['comprehension'] * similarity_comprehension)
    
    return combined_similarity

In [5]:
# Preprocess text data
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


In [6]:
# Load data
data = pd.read_csv('./Essay Grading Dataset JET.csv', encoding='Latin-1')
data.head


<bound method NDFrame.head of      Question_ID                                      Comprehension  \
0            1.0  In Natural Language Processing, Feature Extrac...   
1            1.0  In Natural Language Processing, Feature Extrac...   
2            1.0  So we know that machines can only understand n...   
3            1.0  So we know that machines can only understand n...   
4            1.0  If we ask any NLP practitioner or data scienti...   
..           ...                                                ...   
691          4.0  n\tSNOBOL ("StriNg Oriented and symBOlic Langu...   
692          4.0  n\tSNOBOL ("StriNg Oriented and symBOlic Langu...   
693          4.0  Portability: \nSo that the program can be move...   
694          4.0  Portability: \nSo that the program can be move...   
695          4.0  Portability: \nSo that the program can be move...   

                                              Question  \
0             What is Feature Extraction from the text   
1

In [7]:
# Preprocess data
data['Preprocessed_Examiner_Answer'] = data['Examiner_Answer'].apply(preprocess_text)

print(data['Preprocessed_Examiner_Answer'])


0      textual data data feed machine learning algori...
1      textual data data feed machine learning algori...
2      know machine understand number make machine ab...
3      know machine understand number make machine ab...
4      featue extraction difficult requires conversio...
                             ...                        
691    n snobol string oriented symbolic language ser...
692    n snobol string oriented symbolic language ser...
693    portability program moved new computer easily ...
694    portability program moved new computer easily ...
695    portability program moved new computer easily ...
Name: Preprocessed_Examiner_Answer, Length: 696, dtype: object


In [8]:
data['Preprocessed_Student_Answer'] = data['Student_Answer'].apply(preprocess_text)

print(data['Preprocessed_Student_Answer'])

0      process converting text data number called fea...
1                                        text processing
2      need feature extraction text convert text numb...
3                                           numeric form
4                                              take time
                             ...                        
691    snobol stand string oriented symbolic language...
692                                               snobol
693    portability program moved new computer easily ...
694    interoperability concern achieving functionali...
695                                          portability
Name: Preprocessed_Student_Answer, Length: 696, dtype: object


In [9]:
data['Preprocessed_Comprehension'] = data['Comprehension'].apply(preprocess_text)

print(data['Preprocessed_Comprehension'])

0      natural language processing feature extraction...
1      natural language processing feature extraction...
2      know machine understand number make machine ab...
3      know machine understand number make machine ab...
4      ask nlp practitioner data scientist answer yes...
                             ...                        
691    n snobol string oriented symbolic language ser...
692    n snobol string oriented symbolic language ser...
693    portability program moved new computer easily ...
694    portability program moved new computer easily ...
695    portability program moved new computer easily ...
Name: Preprocessed_Comprehension, Length: 696, dtype: object


In [28]:
# Specify weights for examiner answer and comprehension
weights = {'examiner': 0.8, 'comprehension': 0.2}

# Calculate semantic similarity
data['Semantic_Similarity'] = data.apply(lambda row: calculate_combined_similarity(row['Preprocessed_Student_Answer'], row['Preprocessed_Examiner_Answer'], row['Comprehension'], weights), axis=1)

# Display semantic similarity
print(data['Semantic_Similarity'])

In [None]:
# Define features and target
X = data[['Semantic_Similarity', 'Question_Score']] 
y = data['Student_Score']


In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Random Forest Regressor 

In [None]:
# Build and train the model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred_rf = rf_model.predict(X_test)
print(y_pred_rf)

[1.28       4.         0.         2.         4.94       2.08
 3.86       8.51       5.91       1.18       0.95       1.
 4.55       3.03       4.11       4.06       4.         4.94
 0.         3.99       4.94       4.94       2.         4.84
 4.09       4.         0.         2.91       4.82       4.03
 4.55       1.         0.04       4.09       0.         6.11
 2.02       2.         3.2        0.         0.2        3.86
 0.16       3.06       1.7        0.         0.         0.
 4.06       1.81       2.14       5.         3.73       6.1
 0.99       6.         3.82       2.         6.         3.86
 3.98       0.95       4.82       0.         3.35       2.
 4.81       2.         1.81       1.28       0.         8.51
 7.58       7.58       4.         2.06       1.78430012 7.88
 2.         0.04       0.         4.06       3.2        0.
 1.7        3.02       4.11       2.         4.94       3.73
 4.9        6.965      1.25       5.         0.2        7.9
 1.         0.12       4.98       

In [None]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_rf})
print(results_df)

     Actual  Predicted
674       1       1.28
314       4       4.00
611       0       0.00
431       2       2.00
552       5       4.94
..      ...        ...
24        4       4.00
158       0       0.00
388       2       2.11
482       5       4.82
110       4       3.80

[140 rows x 2 columns]


In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)
rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
mae = mean_absolute_error(y_test, y_pred_rf)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 0.1558780821268392
R-squared (R²): 0.9689949115610464
Root Mean Squared Error (RMSE): 0.39481398420881597
Mean Absolute Error (MAE): 0.1724439804243375




## Gradient Boosting Regressor

In [None]:
import xgboost as xgb

In [None]:
# Build and train the model
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)


In [None]:
# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
print(y_pred_xgb)

[ 1.0259552e+00  3.9923532e+00  1.4376017e-03  1.9967356e+00
  4.9851270e+00  2.0255263e+00  3.9821124e+00  8.9992142e+00
  5.9975581e+00  1.0106583e+00  4.5596607e-02  9.9935061e-01
  4.9718428e+00  3.0156763e+00  4.0046554e+00  4.0049410e+00
  3.9978535e+00  5.0119629e+00  1.0335466e-03  4.0022964e+00
  4.9798608e+00  5.0119629e+00  1.9988220e+00  4.9876523e+00
  4.0132165e+00  4.0025611e+00  3.6772643e-04  3.0051444e+00
  4.9830575e+00  3.9972632e+00  4.9718428e+00  1.0037493e+00
 -7.8090099e-03  4.0132165e+00  5.3530396e-03  5.9987383e+00
  2.0196815e+00  2.0070076e+00  3.0173223e+00  1.4376017e-03
  1.5459110e-02  3.9698272e+00  3.2906721e-03  3.0127516e+00
  1.0326777e+00  7.7219708e-03 -9.0989843e-04  2.7210449e-03
  3.9918013e+00  2.0068336e+00  2.0006154e+00  4.9994388e+00
  3.9910915e+00  5.9964280e+00  1.0004798e+00  6.0017805e+00
  3.9806106e+00  1.9976829e+00  6.0017805e+00  3.9698272e+00
  3.9907641e+00  4.5596607e-02  4.9825292e+00  7.7219708e-03
  3.0757792e+00  2.00298

In [None]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_xgb})
print(results_df)

     Actual  Predicted
674       1   1.025955
314       4   3.992353
611       0   0.001438
431       2   1.996736
552       5   4.985127
..      ...        ...
24        4   3.997263
158       0  -0.000910
388       2   2.024192
482       5   4.982529
110       4   3.984526

[140 rows x 2 columns]


In [None]:
# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2 = r2_score(y_test, y_pred_xgb)
rmse = mean_squared_error(y_test, y_pred_xgb)
mae = mean_absolute_error(y_test, y_pred_xgb)
print(f'XGBoost Mean Squared Error: {mse_xgb}')
print(f'R-squared (R²): {r2}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')


XGBoost Mean Squared Error: 0.021389910792486446
R-squared (R²): 0.9957454180422702
Root Mean Squared Error (RMSE): 0.021389910792486446
Mean Absolute Error (MAE): 0.023099234953406267


## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Build and train the Decision Tree Regressor model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)


In [None]:
# Make predictions
y_pred_dt = dt_model.predict(X_test)
print(y_pred_dt)

[1.         4.         0.         2.         5.         2.
 4.         9.         6.         1.         0.         1.
 5.         3.         4.         4.         4.         5.
 0.         4.         5.         5.         2.         5.
 4.         4.         0.         3.         5.         4.
 5.         1.         0.         4.         0.         6.
 2.         2.         3.         0.         0.         4.
 0.         3.         1.         0.         0.         0.
 4.         2.         2.         5.         4.         6.
 1.         6.         4.         2.         6.         4.
 4.         0.         5.         0.         3.         2.
 5.         2.         2.         1.         0.         9.
 8.         8.         4.         2.         1.71428571 8.
 2.         0.         0.         4.         3.         0.
 1.         3.         4.         2.         5.         4.
 5.         7.         1.         5.         0.         8.
 1.         0.         5.         2.         3.         

In [None]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_dt})
print(results_df)

     Actual  Predicted
674       1        1.0
314       4        4.0
611       0        0.0
431       2        2.0
552       5        5.0
..      ...        ...
24        4        4.0
158       0        0.0
388       2        2.0
482       5        5.0
110       4        4.0

[140 rows x 2 columns]


In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_dt)
rmse = mean_squared_error(y_test, y_pred_dt)
r2 = r2_score(y_test, y_pred_dt)
mae = mean_absolute_error(y_test, y_pred_dt)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 0.020991253644314867
R-squared (R²): 0.9958247133477245
Root Mean Squared Error (RMSE): 0.020991253644314867
Mean Absolute Error (MAE): 0.012244897959183673


### Dump Decision Tree Regressor

In [None]:
import joblib
joblib.dump(dt_model, 'dt_model_new.joblib')


['dt_model_new.joblib']