In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, f1_score, roc_auc_score, r2_score, mean_squared_error, mean_absolute_error


In [2]:
# Download NLTK resources
nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load data from CSV
df = pd.read_csv('./Essay Grading Dataset/Essay Grading Dataset JET.csv', encoding='Latin-1')
df.head

<bound method NDFrame.head of      Question_ID                                      Comprehension  \
0            1.0  In Natural Language Processing, Feature Extrac...   
1            1.0  In Natural Language Processing, Feature Extrac...   
2            1.0  So we know that machines can only understand n...   
3            1.0  So we know that machines can only understand n...   
4            1.0  If we ask any NLP practitioner or data scienti...   
..           ...                                                ...   
691          4.0  n\tSNOBOL ("StriNg Oriented and symBOlic Langu...   
692          4.0  n\tSNOBOL ("StriNg Oriented and symBOlic Langu...   
693          4.0  Portability: \nSo that the program can be move...   
694          4.0  Portability: \nSo that the program can be move...   
695          4.0  Portability: \nSo that the program can be move...   

                                              Question  \
0             What is Feature Extraction from the text   
1

In [4]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove non-alphanumeric characters and keep spaces
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    print('text:', text[0])
    
    # Tokenize the text into words
#     tokens = nltk.word_tokenize(text)
    tokens = word_tokenize(text)
    print('token:',tokens[0])
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join the preprocessed tokens back into a single string
    return ' '.join(tokens)


In [5]:
def huber_loss(y_true, y_pred, delta=1.0):
    residual = np.abs(y_true - y_pred)
    quadratic_loss = 0.5 * (residual ** 2)
    linear_loss = delta * (residual - 0.5 * delta)
    
    loss = np.where(residual <= delta, quadratic_loss, linear_loss)
    
    return np.mean(loss)

In [6]:
# Combine text features
X_text = df['Comprehension'] + ' ' + df['Question'] + ' ' + df['Examiner_Answer'] + ' ' + df['Student_Answer']
print(X_text)

0      In Natural Language Processing, Feature Extrac...
1      In Natural Language Processing, Feature Extrac...
2      So we know that machines can only understand n...
3      So we know that machines can only understand n...
4      If we ask any NLP practitioner or data scienti...
                             ...                        
691    n\tSNOBOL ("StriNg Oriented and symBOlic Langu...
692    n\tSNOBOL ("StriNg Oriented and symBOlic Langu...
693    Portability: \nSo that the program can be move...
694    Portability: \nSo that the program can be move...
695    Portability: \nSo that the program can be move...
Length: 696, dtype: object


In [7]:
# Numerical features
X_numeric = df[['Question_Score']]
print(X_numeric)

     Question_Score
0                10
1                10
2                10
3                10
4                10
..              ...
691               4
692               4
693               3
694               3
695               3

[696 rows x 1 columns]


In [None]:
# Text preprocessing
X_text_preprocessed = X_text.apply(preprocess_text)
print(X_text_preprocessed)

text: i
token: in
text: i
token: in
text: s
token: so
text: s
token: so
text: i
token: if
text: i
token: if
text: a
token: artificial
text: a
token: artificial
text: a
token: artificial
text: a
token: artificial
text: a
token: artificial
text: a
token: all
text: a
token: all
text: a
token: all
text: a
token: as
text: a
token: as
text: a
token: ai
text: a
token: ai
text: a
token: ai
text: a
token: artificial
text: a
token: arend
text: a
token: arend
text: a
token: arend
text: a
token: arend
text: a
token: arend
text: t
token: the
text: t
token: the
text: t
token: the
text: t
token: the
text: t
token: types
text: s
token: software
text: i
token: importance
text: b
token: benefits
text: u
token: unit
text: i
token: integration
text: a
token: a
text: a
token: a
text: i
token: in
text: t
token: the
text: p
token: pairwise
text: a
token: advantages
text: s
token: state
text: s
token: state
text: a
token: advantages
text: a
token: advantages
text: f
token: functional
text: f
token: functional

text: p
token: performance
text: t
token: types
text: f
token: features
text: a
token: a
text: a
token: a
text: a
token: a
text: a
token: a
text: t
token: the
text: o
token: objectives
text: o
token: objectives
text: o
token: objectives
text: o
token: objectives
text: o
token: objectives
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text: d
token: data
text:  
token: the
text:  
token: the
text:  
token: the
text: y
token: yes
text: y
token: yes
text: y
token: yes
text: t
token: top
text: s
token: some
text: s
token: some
text: s
token: systems
text: s
token: systems
text: k
token: knowledge
text: k
token: knowledge
text: k
token: knowledge
text: k
token: knowledge
text: k
token: knowledge
text: 1
token: 1
text: 1
token: 1
text: 1
token: 1
text: 1
token: 1
text: c
token: classification
text: c
token: classification
text: 	
token: expertise
text: 	
token:

In [None]:
# Text vectorization
tfidf_vectorizer = TfidfVectorizer()
X_text_vectorized = tfidf_vectorizer.fit_transform(X_text_preprocessed)
print(X_text_vectorized)

In [None]:
# Combine numerical and text features
X_combined_vectorized = pd.concat([X_numeric.reset_index(drop=True), pd.DataFrame(X_text_vectorized.toarray())], axis=1)
print(X_combined_vectorized)

In [None]:
# Define target variable
y = df['Student_Score']
print(y)

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_combined_vectorized, y, test_size=0.3, random_state=42)

## Linear Regression

In [None]:
# Build and train the model
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test)
print(y_pred)

In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

In [None]:
# R-squared (R²)
r2 = r2_score(y_test, y_pred)
print(f'R-squared (R²): {r2}')

In [None]:
# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


In [None]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error (MAE): {mae}')

## Logistic Regression 

In [None]:
# Build and train the model
logistic_model = LogisticRegression(random_state=42)
logistic_model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred_log = logistic_model.predict(X_test)
print(y_pred_log)

In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_log)
r2 = r2_score(y_test, y_pred_log)
mae = mean_absolute_error(y_test, y_pred_log)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

## Gradient Boosting Regressor

In [None]:
import xgboost as xgb

In [None]:
# Build and train the model
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)


In [None]:
# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
print(y_pred_xgb)

In [None]:
# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f'XGBoost Mean Squared Error: {mse_xgb}')

In [None]:
# R-squared (R²)
r2 = r2_score(y_test, y_pred_xgb)
print(f'R-squared (R²): {r2}')

In [None]:
# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred_xgb, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


In [None]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_xgb)
print(f'Mean Absolute Error (MAE): {mae}')

## Support Vector Regressor (SVR)

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Build and train the model
svr_model = SVR()
svr_model.fit(X_train_scaled, y_train)


In [None]:
# Make predictions
y_pred_svr = svr_model.predict(X_test_scaled)
print(y_pred_svr)

In [None]:
# Evaluate the model
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f'SVR Mean Squared Error: {mse_svr}')

In [None]:
# R-squared (R²)
r2 = r2_score(y_test, y_pred_svr)
print(f'R-squared (R²): {r2}')

In [None]:
# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred_svr, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


In [None]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_svr)
print(f'Mean Absolute Error (MAE): {mae}')

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Build and train the model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred_rf = rf_model.predict(X_test)
print(y_pred_rf)

In [None]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_rf})
print(results_df)

In [None]:
# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f'Random Forest Mean Squared Error: {mse_rf}')

In [None]:
# R-squared (R²)
r2 = r2_score(y_test, y_pred_rf)
print(f'R-squared (R²): {r2}')

In [None]:
# Root Mean Squared Error (RMSE)
rmse = mean_squared_error(y_test, y_pred_rf, squared=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')


In [None]:
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_rf)
print(f'Mean Absolute Error (MAE): {mae}')

In [None]:
delta = 1.0
loss = huber_loss(y_test, y_pred_rf, delta)

print(f"Huber Loss: {loss}")

## KNN Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Create and train KNN Regressor
knn_regressor = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors (k) as needed
knn_regressor.fit(X_train, y_train)


In [None]:
# Make predictions
y_pred_knn = knn_regressor.predict(X_test)
print(y_pred_knn)

In [None]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_knn})
print(results_df)

In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_knn)
r2 = r2_score(y_test, y_pred_knn)
mae = mean_absolute_error(y_test, y_pred_knn)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

## Gaussain Naives Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Build and train the Gaussian Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred_nb = nb_model.predict(X_test)
print(y_pred_nb)

In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_nb)
r2 = r2_score(y_test, y_pred_nb)
mae = mean_absolute_error(y_test, y_pred_nb)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Build and train the Decision Tree Regressor model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)


In [None]:
# Make predictions
y_pred_dt = dt_model.predict(X_test)
print(y_pred_dt)

In [None]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_dt})
print(results_df)

In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_dt)
r2 = r2_score(y_test, y_pred_dt)
mae = mean_absolute_error(y_test, y_pred_dt)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

In [None]:
delta = 1.0
loss = huber_loss(y_test, y_pred_dt, delta)

print(f"Huber Loss: {loss}")

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
 # Build and train the Ridge Regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)


In [None]:
# Make predictions
y_pred_ridge = ridge_model.predict(X_test_scaled)
print(y_pred_ridge)

In [None]:
# Create a DataFrame to display actual and predicted values
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_knn})
print(results_df)

In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_ridge)
r2 = r2_score(y_test, y_pred_ridge)
mae = mean_absolute_error(y_test, y_pred_ridge)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

In [None]:
delta = 1.0
loss = huber_loss(y_test, y_pred_ridge, delta)

print(f"Huber Loss: {loss}")

## Partial Least Squares Regression (PLSR)

In [None]:
from sklearn.cross_decomposition import PLSRegression

In [None]:
# Build and train the PLSR model
plsr_model = PLSRegression(n_components=2)
plsr_model.fit(X_train_scaled, y_train)

In [None]:
# Make predictions
y_pred_plsr = plsr_model.predict(X_test_scaled)
print(y_pred_plsr)

In [None]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred_plsr)
r2 = r2_score(y_test, y_pred_plsr)
mae = mean_absolute_error(y_test, y_pred_plsr)

print(f'Mean Squared Error: {mse}')
print(f'R-squared (R²): {r2}')
print(f'Mean Absolute Error (MAE): {mae}')

## Ordinary Least Squares Regression (OLSR)

In [None]:
import statsmodels.api as sm

In [None]:
# Add a constant term to the features matrix
X = sm.add_constant(X_combined_vectorized)
print(X)

In [None]:
# Create and fit the OLS model
ols_model = sm.OLS(y, X).fit()

In [None]:
# Get the predicted values
y_pred_ols = ols_model.predict(X)
print(y_pred_ols)

In [None]:
# Compare actual vs. predicted values
result_df = pd.DataFrame({'Actual': y, 'Predicted': y_pred_ols})
print(result_df)

In [None]:
# Print the summary of the model
print(ols_model.summary())

In [None]:
delta = 1.0
loss = huber_loss(y, y_pred_ols, delta)

print(f"Huber Loss: {loss}")

## Dumped DecisionTree Regressor

In [None]:
import joblib

In [None]:
# dt_model
# Save the model to a file using joblib
joblib.dump(dt_model, 'dt_model.joblib')

In [None]:
import pickle

with open('dt_model.pkl', 'wb') as f:
    pickle.dump(dt_model, f)

In [None]:
# dt_model
# Save the model to a file using joblib
joblib.dump(dt_model, 'dt_model.new')