In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score, r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from nltk.metrics.distance import jaccard_distance
from nltk.tokenize import word_tokenize
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download NLTK resources
nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load spaCy's medium-sized English language model
nlp = spacy.load("en_core_web_md")

# Load the Universal Sentence Encoder model from TensorFlow Hub
# model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [4]:
# Calculate cosine similarity between student answer, examiner answer, and comprehension
def calculate_combined_similarity(student_answer, examiner_answer, comprehension, weights):
    # Check if any of the input text strings are empty
    if not student_answer or not examiner_answer or not comprehension:
        return 0.0  # Return zero similarity if any input text string is empty
    
    # Calculate similarity between student answer and examiner answer
    similarity_examiner = nlp(student_answer).similarity(nlp(examiner_answer))
    
    # Calculate similarity between student answer and comprehension
    similarity_comprehension = nlp(student_answer).similarity(nlp(comprehension))
    
    # Combine similarity scores using weights
    combined_similarity = (weights['examiner'] * similarity_examiner) + (weights['comprehension'] * similarity_comprehension)
    
    return combined_similarity

In [5]:
# # Function to calculate combined similarity using sentence embeddings
# def calculate_combined_similarity(student_answer, examiner_answer, comprehension, weights):
#     # Check if any of the input text strings are empty
#     if not student_answer or not examiner_answer or not comprehension:
#         return 0.0  # Return zero similarity if any input text string is empty
    
#     # Encode sentences to get their embeddings
#     student_embedding = model.encode(student_answer, convert_to_tensor=True)
#     examiner_embedding = model.encode(examiner_answer, convert_to_tensor=True)
#     comprehension_embedding = model.encode(comprehension, convert_to_tensor=True)
    
#     # Calculate cosine similarity between student answer and examiner answer
#     similarity_examiner = util.pytorch_cos_sim(student_embedding, examiner_embedding).item()
    
#     # Calculate cosine similarity between student answer and comprehension
#     similarity_comprehension = util.pytorch_cos_sim(student_embedding, comprehension_embedding).item()
    
#     # Combine similarity scores using weights
#     combined_similarity = (weights['examiner'] * similarity_examiner) + (weights['comprehension'] * similarity_comprehension)
    
#     return combined_similarity


In [6]:
# Preprocess text data
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


In [7]:
# Load data
data = pd.read_csv('./Essay Grading Dataset JET.csv', encoding='Latin-1')
data.head


<bound method NDFrame.head of      Question_ID                                      Comprehension  \
0            1.0  In Natural Language Processing, Feature Extrac...   
1            1.0  In Natural Language Processing, Feature Extrac...   
2            1.0  So we know that machines can only understand n...   
3            1.0  So we know that machines can only understand n...   
4            1.0  If we ask any NLP practitioner or data scienti...   
..           ...                                                ...   
691          4.0  n\tSNOBOL ("StriNg Oriented and symBOlic Langu...   
692          4.0  n\tSNOBOL ("StriNg Oriented and symBOlic Langu...   
693          4.0  Portability: \nSo that the program can be move...   
694          4.0  Portability: \nSo that the program can be move...   
695          4.0  Portability: \nSo that the program can be move...   

                                              Question  \
0             What is Feature Extraction from the text   
1

In [8]:
# Preprocess data
data['Preprocessed_Examiner_Answer'] = data['Examiner_Answer'].apply(preprocess_text)

print(data['Preprocessed_Examiner_Answer'])


0      textual data data feed machine learning algori...
1      textual data data feed machine learning algori...
2      know machine understand number make machine ab...
3      know machine understand number make machine ab...
4      featue extraction difficult requires conversio...
                             ...                        
691    n snobol string oriented symbolic language ser...
692    n snobol string oriented symbolic language ser...
693    portability program moved new computer easily ...
694    portability program moved new computer easily ...
695    portability program moved new computer easily ...
Name: Preprocessed_Examiner_Answer, Length: 696, dtype: object


In [9]:
data['Preprocessed_Student_Answer'] = data['Student_Answer'].apply(preprocess_text)

print(data['Preprocessed_Student_Answer'])

0      process converting text data number called fea...
1                                        text processing
2      need feature extraction text convert text numb...
3                                           numeric form
4                                              take time
                             ...                        
691    snobol stand string oriented symbolic language...
692                                               snobol
693    portability program moved new computer easily ...
694    interoperability concern achieving functionali...
695                                          portability
Name: Preprocessed_Student_Answer, Length: 696, dtype: object


In [10]:
data['Preprocessed_Comprehension'] = data['Comprehension'].apply(preprocess_text)

print(data['Preprocessed_Comprehension'])

0      natural language processing feature extraction...
1      natural language processing feature extraction...
2      know machine understand number make machine ab...
3      know machine understand number make machine ab...
4      ask nlp practitioner data scientist answer yes...
                             ...                        
691    n snobol string oriented symbolic language ser...
692    n snobol string oriented symbolic language ser...
693    portability program moved new computer easily ...
694    portability program moved new computer easily ...
695    portability program moved new computer easily ...
Name: Preprocessed_Comprehension, Length: 696, dtype: object


In [11]:
# Specify weights for examiner answer and comprehension
weights = {'examiner': 0.1, 'comprehension': 0.9}

# Calculate semantic similarity
data['Semantic_Similarity'] = data.apply(lambda row: calculate_combined_similarity(row['Preprocessed_Student_Answer'], row['Preprocessed_Examiner_Answer'], row['Comprehension'], weights), axis=1)

# Display semantic similarity
print(data['Semantic_Similarity'])

  similarity_examiner = nlp(student_answer).similarity(nlp(examiner_answer))
  similarity_comprehension = nlp(student_answer).similarity(nlp(comprehension))


0      0.840521
1      0.717330
2      0.727067
3      0.556399
4      0.561920
         ...   
691    0.846368
692    0.000000
693    0.903119
694    0.916065
695    0.723000
Name: Semantic_Similarity, Length: 696, dtype: float64


In [12]:
# Define features and target
X = data[['Semantic_Similarity', 'Question_Score']] 
y = data['Student_Score']

print(data[['Semantic_Similarity', 'Question_Score', 'Student_Score']])
data[['Semantic_Similarity', 'Question_Score', 'Student_Score']].to_csv('semantic_similarity2.csv', index=False)



     Semantic_Similarity  Question_Score  Student_Score
0               0.840521              10              7
1               0.717330              10              0
2               0.727067              10              9
3               0.556399              10              0
4               0.561920              10              0
..                   ...             ...            ...
691             0.846368               4              2
692             0.000000               4              0
693             0.903119               3              2
694             0.916065               3              2
695             0.723000               3              0

[696 rows x 3 columns]


In [13]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Random Forest Regressor 

In [14]:
# Build and train the model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [15]:
# Make predictions
y_pred_rf = rf_model.predict(X_test)
print(y_pred_rf)

[1.32       3.88       0.05       2.         4.87       2.14
 3.8        8.23       5.89       1.18       0.4        1.
 4.55       3.03       3.9        4.06       3.94       5.
 0.         4.04       4.94       5.         2.         4.85
 4.         3.92       0.         3.         4.68779004 3.97
 4.55       1.         0.06       4.         0.18       6.12
 2.03       2.         3.28       0.05       0.         3.9
 0.         2.99       1.64       0.04       0.09       0.
 4.         2.         2.18       5.         3.93       6.
 1.         6.         3.92       2.         6.         3.9
 3.98       0.4        4.52       0.04       3.         2.08
 5.         2.08       2.         1.32       0.         8.23
 7.76       7.76       4.07       2.02       1.78430012 7.89
 2.         0.06       0.         4.         3.28       0.05
 1.64       3.04       3.9        2.         4.87       3.93
 5.         6.98       1.31       4.92       0.         7.94
 1.02       0.         5.         

In [16]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_rf})
print(results_df)

     Actual  Predicted
674       1       1.32
314       4       3.88
611       0       0.05
431       2       2.00
552       5       4.87
..      ...        ...
24        4       4.00
158       0       0.09
388       2       2.04
482       5       4.52
110       4       4.07

[140 rows x 2 columns]


In [17]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)
rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
mae = mean_absolute_error(y_test, y_pred_rf)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 0.08344787116421139
R-squared (R²): 0.9834017163273573
Root Mean Squared Error (RMSE): 0.2888734518162086
Mean Absolute Error (MAE): 0.13676078623757196




## Gradient Boosting Regressor

In [18]:
import xgboost as xgb

In [19]:
# Build and train the model
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)


In [20]:
# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
print(y_pred_xgb)

[ 1.0435537e+00  3.9828410e+00  1.9829546e-03  2.0004976e+00
  4.9657092e+00  2.0315177e+00  3.9675152e+00  8.9809799e+00
  5.9938726e+00  1.0347883e+00  5.3055394e-02  9.9753022e-01
  4.9805174e+00  3.0290396e+00  3.9633672e+00  4.0114532e+00
  3.9983912e+00  4.9952445e+00  2.7387480e-03  4.0120339e+00
  4.9510427e+00  4.9952445e+00  2.0041852e+00  4.9932861e+00
  3.9987631e+00  3.9631753e+00  1.0152231e-03  2.9996216e+00
  4.9850597e+00  3.9920940e+00  4.9805174e+00  9.9875969e-01
  2.6393747e-03  3.9987631e+00  7.8934450e-03  6.0099883e+00
  2.0456290e+00  2.0008240e+00  3.0166011e+00  1.9829546e-03
 -1.9855539e-03  3.9768000e+00  7.0290541e-04  2.9861755e+00
  1.0297772e+00  6.3922764e-03  5.6996322e-03  8.0108605e-03
  4.0056977e+00  1.9871664e+00  2.0148392e+00  4.9924207e+00
  3.9967933e+00  5.9996839e+00  9.9515259e-01  5.9996839e+00
  3.9741905e+00  2.0030115e+00  5.9996839e+00  3.9768000e+00
  3.9805562e+00  5.3055394e-02  4.9744239e+00  6.3922764e-03
  2.9996216e+00  2.00874

In [21]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_xgb})
print(results_df)

     Actual  Predicted
674       1   1.043554
314       4   3.982841
611       0   0.001983
431       2   2.000498
552       5   4.965709
..      ...        ...
24        4   3.998763
158       0   0.005700
388       2   1.996992
482       5   4.974424
110       4   4.002508

[140 rows x 2 columns]


In [22]:
# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2 = r2_score(y_test, y_pred_xgb)
rmse = mean_squared_error(y_test, y_pred_xgb)
mae = mean_absolute_error(y_test, y_pred_xgb)
print(f'XGBoost Mean Squared Error: {mse_xgb}')
print(f'R-squared (R²): {r2}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')


XGBoost Mean Squared Error: 0.02155556099978716
R-squared (R²): 0.9957124692193362
Root Mean Squared Error (RMSE): 0.02155556099978716
Mean Absolute Error (MAE): 0.02472577824061903


## Decision Tree Regressor

In [23]:
from sklearn.tree import DecisionTreeRegressor

In [24]:
# Build and train the Decision Tree Regressor model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)


In [25]:
# Make predictions
y_pred_dt = dt_model.predict(X_test)
print(y_pred_dt)

[1.         4.         0.         2.         5.         2.
 4.         9.         6.         1.         0.         1.
 5.         3.         4.         4.         4.         5.
 0.         4.         5.         5.         2.         5.
 4.         4.         0.         3.         5.         4.
 5.         1.         0.         4.         0.         6.
 2.         2.         3.         0.         0.         4.
 0.         3.         1.         0.         0.         0.
 4.         2.         2.         5.         4.         6.
 1.         6.         4.         2.         6.         4.
 4.         0.         5.         0.         3.         2.
 5.         2.         2.         1.         0.         9.
 8.         8.         4.         2.         1.71428571 8.
 2.         0.         0.         4.         3.         0.
 1.         3.         4.         2.         5.         4.
 5.         7.         1.         5.         0.         8.
 1.         0.         5.         2.         3.         

In [26]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_dt})
print(results_df)

     Actual  Predicted
674       1        1.0
314       4        4.0
611       0        0.0
431       2        2.0
552       5        5.0
..      ...        ...
24        4        4.0
158       0        0.0
388       2        2.0
482       5        5.0
110       4        4.0

[140 rows x 2 columns]


In [27]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_dt)
rmse = mean_squared_error(y_test, y_pred_dt)
r2 = r2_score(y_test, y_pred_dt)
mae = mean_absolute_error(y_test, y_pred_dt)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 0.020991253644314867
R-squared (R²): 0.9958247133477245
Root Mean Squared Error (RMSE): 0.020991253644314867
Mean Absolute Error (MAE): 0.012244897959183673


### Dump Decision Tree Regressor

In [28]:
import joblib
joblib.dump(dt_model, 'dt_model_new.joblib')


['dt_model_new.joblib']