In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score, r2_score, mean_squared_error, mean_absolute_error


In [2]:
# Download NLTK resources
nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load data from CSV
df = pd.read_csv('./Essay Grading Dataset/Essay Grading Dataset JET.csv', encoding='Latin-1')
df.head

<bound method NDFrame.head of      Question_ID                                      Comprehension  \
0            1.0  In Natural Language Processing, Feature Extrac...   
1            1.0  In Natural Language Processing, Feature Extrac...   
2            1.0  So we know that machines can only understand n...   
3            1.0  So we know that machines can only understand n...   
4            1.0  If we ask any NLP practitioner or data scienti...   
..           ...                                                ...   
691          4.0  n\tSNOBOL ("StriNg Oriented and symBOlic Langu...   
692          4.0  n\tSNOBOL ("StriNg Oriented and symBOlic Langu...   
693          4.0  Portability: \nSo that the program can be move...   
694          4.0  Portability: \nSo that the program can be move...   
695          4.0  Portability: \nSo that the program can be move...   

                                              Question  \
0             What is Feature Extraction from the text   
1

In [4]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove non-alphanumeric characters and keep spaces
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    print('text:', text[0])
    
    # Tokenize the text into words
#     tokens = nltk.word_tokenize(text)
    tokens = word_tokenize(text)
    print('token:',tokens[0])
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join the preprocessed tokens back into a single string
    return ' '.join(tokens)


In [5]:
def huber_loss(y_true, y_pred, delta=1.0):
    residual = np.abs(y_true - y_pred)
    quadratic_loss = 0.5 * (residual ** 2)
    linear_loss = delta * (residual - 0.5 * delta)
    
    loss = np.where(residual <= delta, quadratic_loss, linear_loss)
    
    return np.mean(loss)

In [6]:
# Combine text features
X_text = df['Comprehension'] + ' ' + df['Question'] + ' ' + df['Examiner_Answer'] + ' ' + df['Student_Answer']
print(X_text)

0      In Natural Language Processing, Feature Extrac...
1      In Natural Language Processing, Feature Extrac...
2      So we know that machines can only understand n...
3      So we know that machines can only understand n...
4      If we ask any NLP practitioner or data scienti...
                             ...                        
691    n\tSNOBOL ("StriNg Oriented and symBOlic Langu...
692    n\tSNOBOL ("StriNg Oriented and symBOlic Langu...
693    Portability: \nSo that the program can be move...
694    Portability: \nSo that the program can be move...
695    Portability: \nSo that the program can be move...
Length: 696, dtype: object


In [7]:
# Numerical features
X_numeric = df[['Question_Score']]
print(X_numeric)

     Question_Score
0                10
1                10
2                10
3                10
4                10
..              ...
691               4
692               4
693               3
694               3
695               3

[696 rows x 1 columns]


In [8]:
# Text preprocessing
X_text_preprocessed = X_text.apply(preprocess_text)
print(X_text_preprocessed)

text: i
token: in
text: i
token: in
text: s
token: so
text: s
token: so
text: i
token: if
text: i
token: if
text: a
token: artificial
text: a
token: artificial
text: a
token: artificial
text: a
token: artificial
text: a
token: artificial
text: a
token: all
text: a
token: all
text: a
token: all
text: a
token: as
text: a
token: as
text: a
token: ai
text: a
token: ai
text: a
token: ai
text: a
token: artificial
text: a
token: arend
text: a
token: arend
text: a
token: arend
text: a
token: arend
text: a
token: arend
text: t
token: the
text: t
token: the
text: t
token: the
text: t
token: the
text: t
token: types
text: s
token: software
text: i
token: importance
text: b
token: benefits
text: u
token: unit
text: i
token: integration
text: a
token: a
text: a
token: a
text: i
token: in
text: t
token: the
text: p
token: pairwise
text: a
token: advantages
text: s
token: state
text: s
token: state
text: a
token: advantages
text: a
token: advantages
text: f
token: functional
text: f
token: functional

token: a
text: a
token: a
text: a
token: a
text: t
token: the
text: o
token: objectives
text: o
token: objectives
text: o
token: objectives
text: o
token: objectives
text: o
token: objectives
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text:  
token: the
text:  
token: the
text:  
token: the
text: y
token: yes
text: y
token: yes
text: y
token: yes
text: t
token: top
text: s
token: some
text: s
token: some
text: s
token: systems
text: s
token: systems
text: k
token: knowledge
text: k
token: knowledge
text: k
token: knowledge
text: k
token: knowledge
text: k
token: knowledge
text: 1
token: 1
text: 1
token: 1
text: 1
token: 1
text: 1
token: 1
text: c
token: classification
text: c
token: classification
text: 	
token: expertise
text: 	
token: expertise
text: 1
token: 1
text: t
token: types
text: t
token: types
text: m
token: model
text: 

In [9]:
# Text vectorization
tfidf_vectorizer = TfidfVectorizer()
X_text_vectorized = tfidf_vectorizer.fit_transform(X_text_preprocessed)
print(X_text_vectorized)

  (0, 3225)	0.0717416935401702
  (0, 1971)	0.0717416935401702
  (0, 2936)	0.0717416935401702
  (0, 3224)	0.04698774889286113
  (0, 304)	0.0717416935401702
  (0, 1730)	0.028814670351411796
  (0, 2906)	0.031199196265978696
  (0, 3050)	0.030086785756790427
  (0, 864)	0.02765283918174745
  (0, 1997)	0.06239839253195739
  (0, 682)	0.11608114118104285
  (0, 2280)	0.05006540452597038
  (0, 3075)	0.054317317115280636
  (0, 3072)	0.07124967586129688
  (0, 917)	0.10031580563539064
  (0, 152)	0.1364056523218034
  (0, 1707)	0.12034714302716171
  (0, 1766)	0.10800470372149029
  (0, 1195)	0.1434833870803404
  (0, 2935)	0.13362044718514524
  (0, 3147)	0.21522508062051057
  (0, 2478)	0.049015871615744826
  (0, 174)	0.09106907373788345
  (0, 421)	0.22686164077563056
  (0, 1304)	0.05278298698845358
  :	:
  (695, 1038)	0.14468656295218477
  (695, 2109)	0.042993243689246015
  (695, 198)	0.16493018210305455
  (695, 427)	0.050439345883150515
  (695, 1540)	0.041570028445644364
  (695, 1096)	0.069249252243389

In [10]:
# Combine numerical and text features
X_combined_vectorized = pd.concat([X_numeric.reset_index(drop=True), pd.DataFrame(X_text_vectorized.toarray())], axis=1)
print(X_combined_vectorized)

     Question_Score    0    1    2    3    4    5    6    7    8  ...  3243  \
0                10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
1                10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
2                10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
3                10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
4                10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
..              ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   
691               4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
692               4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
693               3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
694               3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
695               3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   

     3244      3245  3246  3247  3248  3249  3250  

In [11]:
# Define target variable
y = df['Student_Score']
print(y)

0      7
1      0
2      9
3      0
4      0
      ..
691    2
692    0
693    2
694    2
695    0
Name: Student_Score, Length: 696, dtype: int64


In [12]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_combined_vectorized, y, test_size=0.3, random_state=42)

## Linear Regression

In [13]:
# Build and train the model
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [14]:
# Make predictions
y_pred = model.predict(X_test)
print(y_pred)

[ 1.00011802e+00  4.00030112e+00  1.91450119e-04  2.00018954e+00
  5.00021434e+00  2.00000119e+00  4.00006366e+00  9.00128126e+00
  6.00018573e+00  1.00002789e+00  1.34468079e-04  9.99688387e-01
  5.00006795e+00  2.99988198e+00  4.00002098e+00  4.00018644e+00
  3.99981427e+00  5.00019956e+00 -2.24351883e-04  4.00004315e+00
  5.00075936e+00  5.00019956e+00  1.99963474e+00  5.00022101e+00
  4.00000286e+00  4.00005651e+00 -3.16858292e-04  3.00000477e+00
  5.00001144e+00  3.99994040e+00  5.00006795e+00  9.99477148e-01
 -1.70230865e-04  4.00000286e+00 -1.18970871e-04  6.00012898e+00
  2.00003171e+00  1.99990010e+00  3.00016761e+00  7.03334808e-05
 -3.74317169e-05  4.00010061e+00 -2.81572342e-04  3.00013375e+00
  1.00017452e+00 -4.50611115e-05 -3.61442566e-04 -1.28746033e-05
  3.99984479e+00  1.99948239e+00  2.00020862e+00  5.00016952e+00
  3.99997544e+00  6.00003219e+00  9.99572992e-01  1.36711710e+08
  3.99952435e+00  1.99981093e+00  1.36711710e+08  4.00010061e+00
  4.00005102e+00  1.34468

In [15]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 357705073011686.44


In [16]:
# R-squared (R²)
r2 = r2_score(y_test, y_pred)
print(f'R-squared (R²): {r2}')

R-squared (R²): -75637611795287.3


In [17]:
# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


Root Mean Squared Error (RMSE): 18913092.634777803


In [18]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

Mean Absolute Error (MAE): 2616491.954778314


## Logistic Regression 

In [19]:
# Build and train the model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [20]:
# Make predictions
y_pred_log = logistic_model.predict(X_test)
print(y_pred_log)

[3 0 5 2 5 2 4 0 5 4 3 1 5 3 4 4 0 0 0 4 0 0 2 2 4 4 0 3 0 4 5 1 1 4 0 6 2
 0 5 5 4 4 2 3 0 0 1 4 4 2 2 5 0 6 1 6 4 2 6 4 4 3 5 0 2 2 5 2 2 3 4 0 0 0
 4 2 0 8 0 1 0 4 5 5 0 3 4 2 5 0 5 0 5 0 4 6 1 1 5 2 2 0 0 2 1 4 4 4 5 3 6
 4 6 6 1 2 2 4 2 2 2 0 3 0 5 2 5 2 6 4 4 2 2 5 1 4 1 2 5 4 2 4 5 5 1 0 4 4
 4 0 2 0 1 4 1 4 4 6 0 6 3 2 4 4 0 1 2 2 5 4 2 4 0 0 0 6 2 2 0 4 4 4 2 6 4
 0 5 6 5 0 0 2 1 0 5 0 4 5 4 0 1 5 2 1 5 0 4 0 4]


In [21]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_log)
r2 = r2_score(y_test, y_pred_log)
mae = mean_absolute_error(y_test, y_pred_log)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 5.722488038277512
R-squared (R²): -0.21003407946712094
Mean Absolute Error (MAE): 1.263157894736842


## Gradient Boosting Regressor

In [22]:
import xgboost as xgb

In [23]:
# Build and train the model
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)


XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

In [24]:
# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
print(y_pred_xgb)

[1.0105939e+00 3.9656594e+00 1.5616957e-02 1.9807748e+00 4.9689212e+00
 2.0028334e+00 4.0053926e+00 8.9630594e+00 5.9924674e+00 1.0187052e+00
 3.4104817e-02 9.8527300e-01 4.9935355e+00 3.0028584e+00 3.9899268e+00
 3.9901028e+00 3.9889264e+00 4.9765377e+00 7.1053002e-03 4.0125031e+00
 4.9744563e+00 4.9765377e+00 2.0059114e+00 4.9816365e+00 3.9896321e+00
 4.0136600e+00 1.7988704e-02 2.9899280e+00 4.9512882e+00 4.0019083e+00
 4.9935355e+00 1.0152935e+00 2.0469958e-02 3.9896321e+00 7.2296653e-03
 6.0154991e+00 2.0132093e+00 2.0030980e+00 2.9807765e+00 2.6620738e-02
 1.1197791e-02 3.9761496e+00 2.2744862e-02 3.0070765e+00 1.0608628e+00
 1.4806349e-02 5.2555211e-02 1.8233750e-02 3.9954715e+00 2.0046437e+00
 2.0046511e+00 4.9900079e+00 3.9601550e+00 6.0101409e+00 1.0102123e+00
 7.1544919e+00 3.9952676e+00 1.9966886e+00 7.1544919e+00 3.9761496e+00
 3.9930756e+00 3.4104817e-02 4.9743333e+00 1.4806349e-02 2.9469180e+00
 2.0105073e+00 4.9914470e+00 2.0210204e+00 2.0046437e+00 1.0105939e+00
 1.823

In [25]:
# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f'XGBoost Mean Squared Error: {mse_xgb}')

XGBoost Mean Squared Error: 0.04013332512904764


In [26]:
# R-squared (R²)
r2 = r2_score(y_test, y_pred_xgb)
print(f'R-squared (R²): {r2}')

R-squared (R²): 0.9915137103295546


In [27]:
# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred_xgb, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


Root Mean Squared Error (RMSE): 0.20033303554094029


In [28]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_xgb)
print(f'Mean Absolute Error (MAE): {mae}')

Mean Absolute Error (MAE): 0.04763099537386991


## Support Vector Regressor (SVR)

In [29]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [30]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
# Build and train the model
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)


SVR()

In [32]:
# Make predictions
y_pred_svr = svr_model.predict(X_test_scaled)
print(y_pred_svr)

[1.74394652 0.97895469 4.8989934  1.90017251 4.5415874  2.09990662
 4.10021173 1.09073047 4.78694642 1.94510467 1.67006477 0.90899925
 3.12179633 3.09993054 3.543363   3.8999887  3.90000636 1.00946932
 0.85594763 4.0198203  2.03537725 1.00946932 2.1001564  2.15782002
 3.89989561 3.89988723 2.84228907 2.89996205 4.90016699 3.90005592
 3.12179633 1.51088778 1.15391435 3.89989561 1.04405206 6.00015653
 2.10017419 2.09987524 2.8997487  2.85632975 3.66058169 3.89966461
 1.85235477 3.10015042 0.89974356 1.46901957 1.08893461 3.51257073
 3.90017939 2.79788494 2.10007968 4.90013641 0.62354345 5.90002523
 0.89998968 5.7944846  4.09989228 2.10009555 5.7944846  3.89966461
 3.90026074 1.67006477 3.16066684 1.46901957 2.89992147 2.09962187
 4.90024006 2.10007386 2.79788494 1.74394652 3.51257073 1.09073047
 3.27453919 3.27453919 4.09965394 2.10005594 2.80891237 7.18013117
 2.09987524 1.15391435 1.08458079 3.90017939 2.8997487  2.85632975
 0.89974356 3.10019723 3.543363   2.09993418 4.5415874  0.6235

In [33]:
# Evaluate the model
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f'SVR Mean Squared Error: {mse_svr}')

SVR Mean Squared Error: 3.6818673892210776


In [34]:
# R-squared (R²)
r2 = r2_score(y_test, y_pred_svr)
print(f'R-squared (R²): {r2}')

R-squared (R²): 0.2214601433440192


In [35]:
# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred_svr, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


Root Mean Squared Error (RMSE): 1.9188192695564317


In [36]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_svr)
print(f'Mean Absolute Error (MAE): {mae}')

Mean Absolute Error (MAE): 1.0830026204569074


## Random Forest Regressor

In [37]:
from sklearn.ensemble import RandomForestRegressor

In [38]:
# Build and train the model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [39]:
# Make predictions
y_pred_rf = rf_model.predict(X_test)
print(y_pred_rf)

[1.04       3.64       0.5        1.92       4.48       2.03
 4.05       7.87       5.76       1.12       0.13       0.99
 4.9        3.02       3.85       4.01       3.82       4.6
 0.         3.85714286 4.74       4.6        1.97       4.76
 4.01       4.         0.14       2.93       4.07125    3.885
 4.9        1.01       0.34       4.01       0.06       5.89
 2.04       1.49       2.64       0.86       0.48       3.99
 0.24       3.06       1.65       0.6        0.34       0.04
 3.79       2.         2.09       4.95       3.91       6.04
 1.         5.91       3.91       1.94       5.91       3.99
 3.97       0.13       4.92       0.6        2.7        2.05
 4.91       2.03       2.         1.04       0.04       7.87
 7.24       7.24       3.94       2.03       1.6978176  7.72
 1.49       0.34       0.09       3.79       2.64       0.86
 1.65       2.93       3.85       1.98       4.48       3.91
 4.57       6.51       1.48       4.5        0.48       7.94
 0.98       0.08       5

In [40]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_rf})
print(results_df)

     Actual  Predicted
674       1       1.04
314       4       3.64
611       0       0.50
431       2       1.92
552       5       4.48
..      ...        ...
11        0       0.70
650       3       2.82
629       0       0.04
178       0       0.94
551       5       4.60

[209 rows x 2 columns]


In [41]:
# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f'Random Forest Mean Squared Error: {mse_rf}')

Random Forest Mean Squared Error: 0.19219077805614485


In [42]:
# R-squared (R²)
r2 = r2_score(y_test, y_pred_rf)
print(f'R-squared (R²): {r2}')

R-squared (R²): 0.9593607903325146


In [43]:
# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


Root Mean Squared Error (RMSE): 0.4383956866304057


In [44]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_rf)
print(f'Mean Absolute Error (MAE): {mae}')

Mean Absolute Error (MAE): 0.24821905208029602


In [45]:
delta = 1.0
loss = huber_loss(y_test, y_pred_rf, delta)

print(f"Huber Loss: {loss}")

Huber Loss: 0.08363546223006174


## KNN Regressor

In [46]:
from sklearn.neighbors import KNeighborsRegressor

In [47]:
# Create and train KNN Regressor
knn_regressor = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors (k) as needed
knn_regressor.fit(X_train, y_train)


KNeighborsRegressor()

In [48]:
# Make predictions
y_pred_knn = knn_regressor.predict(X_test)
print(y_pred_knn)

[0.6 1.6 3.  1.2 2.6 2.  4.  3.6 3.6 2.2 0.6 0.8 3.  1.8 4.  4.4 2.4 5.
 0.  4.4 3.  5.  1.2 3.8 3.2 4.  2.  2.6 1.6 4.  3.  1.  1.8 3.2 0.8 6.
 2.8 0.4 1.2 3.  2.4 2.4 1.2 3.8 3.8 2.2 0.8 0.  2.2 2.6 2.  3.  4.  6.
 1.  6.  4.4 2.  6.  2.4 4.  0.6 3.  2.2 2.4 2.  3.8 2.2 2.6 0.6 0.  3.6
 3.2 3.2 2.4 2.4 1.2 4.8 0.4 1.8 1.2 2.2 1.2 3.  3.8 3.4 4.  2.  2.6 4.
 2.  2.8 3.4 2.  2.4 7.2 0.8 0.4 3.  1.2 2.6 1.2 1.8 1.2 1.  1.6 4.4 2.4
 3.  2.4 3.6 4.6 6.  3.6 2.  1.2 2.  4.6 0.8 1.2 2.4 1.6 2.4 1.8 3.  1.2
 5.  1.2 3.6 3.6 1.6 1.2 2.4 2.  0.8 2.4 0.8 2.4 3.  3.2 1.2 4.4 3.4 5.
 1.8 2.  4.2 4.  4.6 1.6 1.2 3.2 1.  4.  0.8 4.  4.4 6.  3.8 6.  0.6 2.
 4.  4.  1.8 2.  2.  2.  3.  3.6 2.  4.  2.8 0.6 0.8 6.  1.2 3.2 1.8 3.2
 2.8 2.4 2.  3.6 4.2 0.4 3.  6.  3.  1.6 1.6 1.2 0.8 1.2 2.6 1.6 4.  3.4
 3.6 1.6 1.  5.  1.2 0.  3.  1.8 0.  1.6 4.4]


In [49]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_knn})
print(results_df)

     Actual  Predicted
674       1        0.6
314       4        1.6
611       0        3.0
431       2        1.2
552       5        2.6
..      ...        ...
11        0        3.0
650       3        1.8
629       0        0.0
178       0        1.6
551       5        4.4

[209 rows x 2 columns]


In [50]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_knn)
r2 = r2_score(y_test, y_pred_knn)
mae = mean_absolute_error(y_test, y_pred_knn)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 2.8258373205741627
R-squared (R²): 0.4024697931995972
Mean Absolute Error (MAE): 1.1913875598086126


## Gaussain Naives Bayes

In [51]:
from sklearn.naive_bayes import GaussianNB

In [52]:
# Build and train the Gaussian Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

GaussianNB()

In [53]:
# Make predictions
y_pred_nb = nb_model.predict(X_test)
print(y_pred_nb)

[1 4 5 2 5 2 4 9 6 1 1 1 2 3 1 4 4 2 1 4 2 2 2 5 4 4 0 3 5 4 2 1 1 4 2 6 2
 2 3 0 4 4 2 3 1 4 1 5 4 2 2 5 4 6 1 6 4 2 6 4 4 1 2 4 3 2 5 2 2 1 5 9 8 8
 4 2 3 8 2 1 0 4 3 0 1 3 1 2 5 4 6 7 1 5 4 8 1 1 5 2 3 3 3 2 1 4 4 4 5 3 6
 4 6 6 1 2 2 5 2 2 3 2 3 3 5 2 5 2 6 3 4 2 3 6 1 4 1 2 2 4 2 4 1 5 1 5 4 1
 4 4 2 8 1 1 1 4 5 6 1 6 1 2 4 4 3 1 2 2 5 3 2 4 7 1 2 6 2 2 3 4 4 5 2 6 4
 2 5 6 5 2 5 2 1 2 5 4 4 5 4 5 1 5 2 0 5 1 5 2 5]


In [54]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_nb)
r2 = r2_score(y_test, y_pred_nb)
mae = mean_absolute_error(y_test, y_pred_nb)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 3.0239234449760763
R-squared (R²): 0.36058399814111997
Mean Absolute Error (MAE): 0.8038277511961722


## Decision Tree Regressor

In [55]:
from sklearn.tree import DecisionTreeRegressor

In [56]:
# Build and train the Decision Tree Regressor model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)


DecisionTreeRegressor(random_state=42)

In [57]:
# Make predictions
y_pred_dt = dt_model.predict(X_test)
print(y_pred_dt)

[1.         4.         0.         2.         5.         2.
 4.         9.         6.         1.         0.         1.
 5.         3.         4.         4.         4.         5.
 0.         4.         5.         5.         2.         5.
 4.         4.         0.         3.         5.         4.
 5.         1.         0.         4.         0.         6.
 2.         2.         3.         0.         0.         4.
 0.         3.         1.         0.         0.         0.
 4.         2.         2.         5.         4.         6.
 1.         6.         4.         2.         6.         4.
 4.         0.         5.         0.         3.         2.
 5.         2.         2.         1.         0.         9.
 8.         8.         4.         2.         1.71428571 8.
 2.         0.         0.         4.         3.         0.
 1.         3.         4.         2.         5.         4.
 5.         7.         1.         5.         0.         8.
 1.         0.         5.         2.         3.         

In [58]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_dt})
print(results_df)

     Actual  Predicted
674       1        1.0
314       4        4.0
611       0        0.0
431       2        2.0
552       5        5.0
..      ...        ...
11        0        0.0
650       3        3.0
629       0        0.0
178       0        0.0
551       5        5.0

[209 rows x 2 columns]


In [59]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_dt)
r2 = r2_score(y_test, y_pred_dt)
mae = mean_absolute_error(y_test, y_pred_dt)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 0.01406112684308173
R-squared (R²): 0.9970267403685198
Mean Absolute Error (MAE): 0.008202323991797676


In [60]:
delta = 1.0
loss = huber_loss(y_test, y_pred_dt, delta)

print(f"Huber Loss: {loss}")

Huber Loss: 0.00580997949419002


## Ridge Regression

In [61]:
from sklearn.linear_model import Ridge

In [62]:
 # Build and train the Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)


Ridge()

In [63]:
# Make predictions
y_pred_ridge = ridge_model.predict(X_test_scaled)
print(y_pred_ridge)

[1.00310323e+00 3.88372929e+00 1.63224162e-01 1.99238545e+00
 4.99927977e+00 1.99996717e+00 4.00047166e+00 8.96212095e+00
 5.71862521e+00 1.00091923e+00 1.97248769e-02 9.75736635e-01
 4.99125506e+00 3.00018222e+00 3.99709404e+00 3.99924618e+00
 3.99959191e+00 4.88040917e+00 2.31367091e-02 3.99981002e+00
 4.97737302e+00 4.88040917e+00 2.00000622e+00 4.97994393e+00
 3.99702149e+00 4.00251032e+00 6.21111744e-03 2.99915779e+00
 4.99973116e+00 3.99987095e+00 4.99125506e+00 1.00050023e+00
 6.88682155e-02 3.99702149e+00 1.71522776e-03 5.99946238e+00
 2.00048168e+00 2.00151460e+00 2.99730953e+00 8.63313423e-03
 1.33425722e-02 3.99325702e+00 1.13472807e-02 3.00030148e+00
 9.59723837e-01 5.60812404e-04 9.82367995e-02 5.02012044e-02
 3.99958579e+00 2.00715109e+00 2.01081512e+00 4.96775105e+00
 3.97371734e+00 5.99941229e+00 1.00058737e+00 5.99187068e+00
 3.99993965e+00 1.99991925e+00 5.99187068e+00 3.99325702e+00
 3.99957621e+00 1.97248769e-02 4.99325762e+00 5.60812404e-04
 2.99914024e+00 2.002258

In [64]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_knn})
print(results_df)

     Actual  Predicted
674       1        0.6
314       4        1.6
611       0        3.0
431       2        1.2
552       5        2.6
..      ...        ...
11        0        3.0
650       3        1.8
629       0        0.0
178       0        1.6
551       5        4.4

[209 rows x 2 columns]


In [65]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_ridge)
r2 = r2_score(y_test, y_pred_ridge)
mae = mean_absolute_error(y_test, y_pred_ridge)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 0.015579833875988719
R-squared (R²): 0.9967056060552191
Mean Absolute Error (MAE): 0.0261909598050387


In [66]:
delta = 1.0
loss = huber_loss(y_test, y_pred_ridge, delta)

print(f"Huber Loss: {loss}")

Huber Loss: 0.006569004386879058


## Partial Least Squares Regression (PLSR)

In [67]:
from sklearn.cross_decomposition import PLSRegression

In [68]:
# Build and train the PLSR model
plsr_model = PLSRegression(n_components=2)
plsr_model.fit(X_train_scaled, y_train)

PLSRegression()

In [69]:
# Make predictions
y_pred_plsr = plsr_model.predict(X_test_scaled)
print(y_pred_plsr)

[[1.33337023]
 [1.52321794]
 [4.67433512]
 [1.21310314]
 [3.19728812]
 [2.14411336]
 [5.15950756]
 [2.34671577]
 [3.38088017]
 [2.4384553 ]
 [1.34939545]
 [1.65269713]
 [2.77712908]
 [2.85536607]
 [2.93326907]
 [3.04019629]
 [3.05648219]
 [1.92269958]
 [1.47055893]
 [3.99089062]
 [1.90800665]
 [1.92269958]
 [1.97610588]
 [2.331381  ]
 [2.8932446 ]
 [3.01679014]
 [2.73753714]
 [2.47383682]
 [3.23670368]
 [3.0566948 ]
 [2.77712908]
 [2.40414565]
 [1.34457928]
 [2.8932446 ]
 [1.02107661]
 [7.14787951]
 [2.33276187]
 [1.87817334]
 [2.4770451 ]
 [2.68845529]
 [3.21119695]
 [3.30753021]
 [1.1959354 ]
 [3.0316931 ]
 [1.50629567]
 [2.08374654]
 [1.78190131]
 [2.80522704]
 [3.10152432]
 [2.68259926]
 [2.30325346]
 [4.71162384]
 [0.4805396 ]
 [7.01732921]
 [0.35955202]
 [7.03998021]
 [4.70636588]
 [1.52960327]
 [7.03998021]
 [3.30753021]
 [3.44097984]
 [1.34939545]
 [2.77217021]
 [2.08374654]
 [2.4524353 ]
 [2.38927598]
 [3.58414849]
 [2.39040636]
 [2.68259926]
 [1.33337023]
 [2.80522704]
 [2.34

In [70]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_plsr)
r2 = r2_score(y_test, y_pred_plsr)
mae = mean_absolute_error(y_test, y_pred_plsr)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

Mean Squared Error: 3.4096934686447
R-squared (R²): 0.27901198394842996
Mean Absolute Error (MAE): 1.3768668760422014


## Ordinary Least Squares Regression (OLSR)

In [71]:
import statsmodels.api as sm

In [72]:
# Add a constant term to the features matrix
X = sm.add_constant(X_combined_vectorized)
print(X)

     const  Question_Score    0    1    2    3    4    5    6    7  ...  3243  \
0      1.0              10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
1      1.0              10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
2      1.0              10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
3      1.0              10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
4      1.0              10  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
..     ...             ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   
691    1.0               4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
692    1.0               4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
693    1.0               3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
694    1.0               3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
695    1.0               3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   

     3244      3245  3246  

In [73]:
# Create and fit the OLS model
ols_model = sm.OLS(y, X).fit()

In [74]:
# Get the predicted values
y_pred_ols = ols_model.predict(X)
print(y_pred_ols)

0      7.000000e+00
1      6.039613e-14
2      9.000000e+00
3      4.618528e-14
4      5.417888e-14
           ...     
691    2.000000e+00
692    3.019807e-14
693    2.000000e+00
694    2.000000e+00
695    3.552714e-14
Length: 696, dtype: float64


In [75]:
# Compare actual vs. predicted values
result_df = pd.DataFrame({'Actual': y, 'Predicted': y_pred_ols})
print(result_df)

     Actual     Predicted
0         7  7.000000e+00
1         0  6.039613e-14
2         9  9.000000e+00
3         0  4.618528e-14
4         0  5.417888e-14
..      ...           ...
691       2  2.000000e+00
692       0  3.019807e-14
693       2  2.000000e+00
694       2  2.000000e+00
695       0  3.552714e-14

[696 rows x 2 columns]


In [76]:
# Print the summary of the model
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:          Student_Score   R-squared:                       0.994
Model:                            OLS   Adj. R-squared:                  0.992
Method:                 Least Squares   F-statistic:                     523.8
Date:                Thu, 25 Jan 2024   Prob (F-statistic):               0.00
Time:                        23:52:18   Log-Likelihood:                 284.35
No. Observations:                 696   AIC:                            -230.7
Df Residuals:                     527   BIC:                             537.5
Df Model:                         168                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              2.1323      0.110     19.

In [77]:
delta = 1.0
loss = huber_loss(y, y_pred_ols, delta)

print(f"Huber Loss: {loss}")

Huber Loss: 0.011494252873563218
