In [2]:
import pandas as pd

# Read the pre-processed essay table (CSV) into a DataFrame
data_f = pd.read_csv('Processed_data.csv')

# Preview the first rows to confirm the columns loaded as expected
first_rows = data_f.head()
print(first_rows)


   Unnamed: 0  essay_id  essay_set  \
0           0         1          1   
1           1         2          1   
2           2         3          1   
3           3         4          1   
4           4         5          1   

                                               essay  final_score  \
0  Dear local newspaper, I think effects computer...            6   
1  Dear I believe that using computers will benef...            7   
2  Dear, More and more people use computers, but ...            5   
3  Dear Local Newspaper, I have found that many e...            8   
4  Dear I know having computers has a positive ef...            6   

                                         clean_essay  char_count  word_count  \
0  Dear local newspaper  I think effects computer...        1441         344   
1  Dear I believe using computers benefit us many...        1765         413   
2  Dear  More people use computers  everyone agre...        1185         276   
3  Dear Local Newspaper  I found man

In [5]:
import nltk
from nltk.corpus import stopwords
import re

# Fetch the NLTK stopword corpus, then keep the English list as a set so the
# membership tests in preprocess_text are constant-time
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def preprocess_text(text):
    """Normalise raw essay text before feature extraction.

    Removes every character that is not an ASCII letter or whitespace,
    lower-cases what is left, and drops any token found in the module-level
    ``stop_words`` set.

    Args:
        text: Raw essay string.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Strip digits, punctuation and any other non-letter symbol
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    kept_tokens = []
    for token in letters_only.lower().split():
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# List the loaded columns so the raw-text column name can be confirmed
print(data_f.columns)

# Clean every raw essay and store the result in a new 'cleaned_essay' column.
# NOTE(review): the file already carries a similarly named 'clean_essay' column;
# confirm which of the two downstream cells are meant to use.
raw_essays = data_f['essay']
data_f['cleaned_essay'] = raw_essays.apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index(['Unnamed: 0', 'essay_id', 'essay_set', 'essay', 'final_score',
       'clean_essay', 'char_count', 'word_count', 'sent_count', 'avg_word_len',
       'spell_err_count', 'noun_count', 'adj_count', 'verb_count',
       'adv_count'],
      dtype='object')


In [7]:
import spacy

# Load the spaCy pipeline once at module level; grammar_check reuses it on every call
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL_NAME)

# Function to count grammar errors (as a feature)
def grammar_check(text):
    """Count the tokens in ``text`` that spaCy marks as out-of-vocabulary.

    The count is only a rough proxy for writing errors; no actual grammar
    analysis is performed.

    NOTE(review): spaCy documents ``is_oov`` as "has no word vector". If the
    loaded pipeline ships without vectors this may flag every token, making
    the value a plain token count. Verify against the installed model.

    Args:
        text: Essay text to analyse.

    Returns:
        int: Number of tokens whose ``is_oov`` flag is set.
    """
    oov_total = 0
    for token in nlp(text):
        if token.is_oov:
            oov_total += 1
    return oov_total

# Store the out-of-vocabulary token count of each cleaned essay as a feature column
cleaned_essays = data_f['cleaned_essay']
data_f['grammar_errors'] = cleaned_essays.apply(grammar_check)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser capped at 1000 features (raise or lower max_features as needed)
tfidf = TfidfVectorizer(max_features=1000)

# Learn the vocabulary and weights from the cleaned essays and encode them in one pass
X_content = tfidf.fit_transform(data_f['cleaned_essay'])

# Dense DataFrame copy of the TF-IDF matrix for easier handling
dense_tfidf = X_content.toarray()
content_features = pd.DataFrame(dense_tfidf)


In [12]:
import nltk

# Make sure the Punkt sentence-tokeniser data used by nltk.sent_tokenize is available.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead; verify
# against the installed version.
PUNKT_RESOURCE = 'punkt'
nltk.download(PUNKT_RESOURCE)

# Function to calculate average sentence length
def structure_features(text):
    """Return the average number of whitespace-separated words per sentence.

    Sentences are found with ``nltk.sent_tokenize``. Text that contains no
    sentences (``None``, a non-string, or an empty or whitespace-only string)
    yields ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        text: Essay text to analyse.

    Returns:
        float: Mean words per sentence, or ``0.0`` when there is nothing to
        measure.
    """
    # Nothing to tokenise: bail out before dividing by a zero sentence count
    if not isinstance(text, str) or not text.strip():
        return 0.0

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return 0.0

    total_words = sum(len(sentence.split()) for sentence in sentences)
    return total_words / len(sentences)

# Store the average sentence length of each cleaned essay as a feature column.
# NOTE(review): preprocess_text strips punctuation before this step, so the
# sentence tokeniser will likely see each essay as a single sentence. Confirm
# whether the raw 'essay' text was intended here.
cleaned_essays = data_f['cleaned_essay']
data_f['structure_score'] = cleaned_essays.apply(structure_features)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
# Imported in this cell because no other cell brings train_test_split into scope
from sklearn.model_selection import train_test_split

# Assemble the feature matrix: the two hand-crafted features
# (grammar_errors, structure_score) followed by the TF-IDF content columns
tfidf_frame = pd.DataFrame(X_content.toarray())
X = pd.concat([data_f[['grammar_errors', 'structure_score']], tfidf_frame], axis=1)

# The TF-IDF columns carry integer labels; cast every label to str so the
# frame has uniformly typed feature names for scikit-learn
X.columns = X.columns.astype(str)

# Target variable: the 'final_score' column of the dataset
y = data_f['final_score']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares regressor used as the essay scorer
model = LinearRegression()

# Fit on the training split (kept as the last expression so the cell still
# displays the fitted estimator)
model.fit(X=X_train, y=y_train)

In [25]:
# Score the held-out rows; these predictions feed the evaluation metrics
y_pred = model.predict(X=X_test)

In [20]:
from sklearn.metrics import mean_squared_error, r2_score

# Compare the held-out predictions with the true scores
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")


Mean Squared Error: 2.6320564160963307
R-squared Score: 0.5669570039527558


In [21]:
import pickle

# Persist the fitted regressor first, then the fitted TF-IDF vectoriser;
# inference needs both artefacts
artifacts = (
    (model, 'essay_grading_model.pkl'),
    (tfidf, 'tfidf_vectorizer.pkl'),
)
for artifact, filename in artifacts:
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)


In [30]:
# Restore the regressor and the TF-IDF vectoriser from their pickle files.
# pickle.load executes code embedded in the file, so only load files you created yourself.
with open('essay_grading_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf = pickle.load(vectorizer_file)


In [28]:
# Function to preprocess, extract features, and predict the score
def predict_essay_score(essay):
    """Predict a score for one raw essay with the fitted model.

    The essay goes through the same steps used to build the training matrix:
    ``preprocess_text``, then ``grammar_check`` and ``structure_features`` on
    the cleaned text, then the fitted ``tfidf`` vectoriser. The single-row
    feature frame uses the same column names as the training matrix
    ('grammar_errors', 'structure_score', then '0', '1', ... for the TF-IDF
    columns) so scikit-learn can match features by name instead of warning
    about missing feature names.

    Args:
        essay: Raw essay text.

    Returns:
        The model's prediction rounded to two decimal places.
    """
    # Same preprocessing as the training data
    cleaned_essay = preprocess_text(essay)

    # Hand-crafted features
    grammar_errors = grammar_check(cleaned_essay)
    structure_score = structure_features(cleaned_essay)

    # Content features: one dense TF-IDF row for this essay
    tfidf_row = tfidf.transform([cleaned_essay]).toarray()[0]

    # Rebuild the training column layout: two named features, then the
    # stringified TF-IDF positions
    column_names = ['grammar_errors', 'structure_score']
    column_names += [str(position) for position in range(len(tfidf_row))]
    features = pd.DataFrame(
        [[grammar_errors, structure_score, *tfidf_row]],
        columns=column_names,
    )

    predicted_score = model.predict(features)[0]

    # Callers present this as a score out of 10
    return round(predicted_score, 2)


In [32]:
# Review/Feedback based on the feature values

def generate_feedback(grammar_errors, structure_score, cleaned_essay, *,
                      grammar_threshold=5, sentence_length_threshold=20,
                      min_word_count=150):
    """Build three feedback sentences from an essay's feature values.

    One sentence is produced for each of grammar, sentence structure and
    content length, always in that order.

    Args:
        grammar_errors: Error count for the essay; values strictly above
            ``grammar_threshold`` trigger the grammar warning.
        structure_score: Average sentence length; values strictly above
            ``sentence_length_threshold`` trigger the long-sentence warning.
        cleaned_essay: Preprocessed essay text; its whitespace-separated word
            count is compared with ``min_word_count``.
        grammar_threshold: Highest error count still treated as good
            (default 5).
        sentence_length_threshold: Highest average sentence length still
            treated as concise (default 20).
        min_word_count: Word count below which the essay is considered short
            (default 150).

    Returns:
        list[str]: Exactly three feedback sentences.
    """
    feedback = []

    # Grammar feedback
    if grammar_errors > grammar_threshold:
        feedback.append(f"Your essay contains {grammar_errors} grammar errors. Consider reviewing sentence structure and usage of correct tense.")
    else:
        feedback.append("Your grammar is generally good with only a few minor errors.")

    # Structure feedback: very long sentences are treated as a structural issue
    if structure_score > sentence_length_threshold:
        feedback.append("Your sentences tend to be long. Try breaking them up into shorter sentences for clarity.")
    else:
        feedback.append("Your sentence structure is concise and easy to follow.")

    # Content feedback: word count is the only content signal used here
    if len(cleaned_essay.split()) < min_word_count:
        feedback.append("Your essay is a bit short. Try adding more details to support your arguments.")
    else:
        feedback.append("Your content is well-developed and detailed enough to cover the topic.")

    return feedback

In [35]:
 #Define a function that combines the prediction and feedback functionalities
def review_essay(essay):
    """Score an essay and produce written feedback for it.

    Args:
        essay: Raw essay text.

    Returns:
        tuple: ``(predicted_score, review_feedback)``, where the first item
        comes from ``predict_essay_score`` and the second is the list
        returned by ``generate_feedback``.
    """
    # Clean once and derive the feedback features from the cleaned text
    text = preprocess_text(essay)
    error_count = grammar_check(text)
    avg_sentence_length = structure_features(text)

    # The predictor takes the raw essay and does its own preprocessing
    score = predict_essay_score(essay)

    return score, generate_feedback(error_count, avg_sentence_length, text)

In [36]:
# Prompt for an essay on standard input
user_essay = input("Please enter your essay: ")

# Score it and collect the feedback sentences
predicted_score, review_feedback = review_essay(user_essay)

# Report the score, then one bullet per feedback sentence
print(f"Predicted Score: {predicted_score} out of 10\n")
print("Review of Your Essay:")
for comment in review_feedback:
    print(f"- {comment}")


Please enter your essay: Global warming is a phenomenon where the earth’s average temperature rises due to increased amounts of greenhouse gases. Greenhouse gases such as carbon dioxide, methane and ozone trap the incoming radiation from the sun. This effect creates a natural “blanket”, which prevents the heat from escaping back into the atmosphere. This effect is called the greenhouse effect.  Contrary to popular belief, greenhouse gases are not inherently bad. In fact, the greenhouse effect is quite important for life on earth. Without this effect, the sun’s radiation would be reflected back into the atmosphere, freezing the surface and making life impossible. However, when greenhouse gases in excess amounts get trapped, serious repercussions begin to appear. The polar ice caps begin to melt, leading to a rise in sea levels. Furthermore, the greenhouse effect is accelerated when polar ice caps and sea ice melts. This is due to the fact the ice reflects 50% to 70% of the sun’s rays ba

