In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy import stats
import textstat
from transformers import BertTokenizer, BertModel
import torch
import warnings
warnings.filterwarnings('ignore')

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...


True

In [2]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.5-py3-none-any.whl (105 kB)
Downloading cmudict-1.0.32-py3-none-any.whl (939 kB)
   ---------------------------------------- 0.0/939.4 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/939.4 kB ? eta -:--:--
   ----------- ---------------------------- 262.1/939.4 kB ? eta -:--:--
   --------------------- ---------------- 524.3/939.4 kB 621.2 kB/s eta 0:00:01
   ---------------------------------------- 939.4/939.4 kB 1.1 MB/s eta 0:00:00
Downloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------- ----------------------------- 0.5/2.1 MB 3.4 MB/s eta 0:00:01
   ---------- ---------------------------

In [3]:
import pandas as pd
import random

# Sample essay data with realistic scores and rubric breakdowns
data = {
    "essay_id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "essay_text": [
        "Education is the most powerful weapon which you can use to change the world, as Nelson Mandela once said. I completely agree because education develops critical thinking and knowledge. Educated societies are more prosperous and peaceful. While experience is valuable, education provides the foundation for personal and societal growth.",
        "Climate change is evident through rising temperatures, melting ice caps, and extreme weather. Human activities like burning fossil fuels are the primary cause. We must transition to renewable energy, reforest areas, and reduce waste to mitigate these effects before it's too late for future generations.",
        "Artificial intelligence revolutionizes healthcare through improved diagnostics, enables personalized education, and enhances business operations. However, it raises concerns about job displacement and privacy. We need ethical guidelines to ensure AI benefits society while minimizing risks of misuse.",
        "Social media has transformed communication, enabling instant global connections but reducing face-to-face interactions. While it facilitates idea sharing, issues like cyberbullying and addiction persist. Balanced usage is key to harnessing its benefits without negative consequences.",
        "The Industrial Revolution began in 18th century Britain, introducing mechanized production that spurred urbanization. While it boosted economic output, it also created poor working conditions and environmental pollution. Its technological innovations still influence modern manufacturing processes.",
        "Regular exercise strengthens cardiovascular health, builds muscle, and reduces stress. I incorporate 30 minutes of activity daily through walking or sports. Finding enjoyable physical activities is crucial for maintaining long-term fitness and mental wellbeing.",
        "Reading books enhances imagination and vocabulary more than watching movies. Books allow personal interpretation of characters and settings, while films often omit key details. Reading develops critical thinking and creativity that visual media cannot replicate.",
        "Space exploration drives scientific discovery and technological innovation with Earth applications. Collaborative projects like the ISS demonstrate peaceful international cooperation. While costly, space research often provides solutions to terrestrial challenges, making it a worthwhile investment.",
        "School uniforms promote equality by reducing socioeconomic disparities visible through clothing. They minimize distractions, enhance safety, and foster school identity. Students can express individuality through accessories and achievements rather than fashion choices.",
        "Part-time jobs teach teenagers responsibility, time management, and financial literacy. Earning their own money provides practical budgeting experience. With proper scheduling, work complements rather than conflicts with academic responsibilities."
    ],
    "score": [8.5, 8.8, 9.2, 7.1, 8.7, 7.5, 8.9, 9.0, 7.8, 8.2],
    "grade_level": ["high_school"]*5 + ["middle_school"]*2 + ["college"]*3,
    "topic": ["education", "environment", "technology", "society", "history", 
             "health", "literature", "science", "education", "economics"],
    "rubric_scores": [
        "{'content':9, 'organization':8, 'grammar':9, 'vocabulary':8}",
        "{'content':9, 'organization':9, 'grammar':8, 'vocabulary':9}",
        "{'content':10, 'organization':9, 'grammar':9, 'vocabulary':9}",
        "{'content':7, 'organization':7, 'grammar':8, 'vocabulary':7}",
        "{'content':9, 'organization':9, 'grammar':8, 'vocabulary':9}",
        "{'content':7, 'organization':8, 'grammar':8, 'vocabulary':7}",
        "{'content':9, 'organization':9, 'grammar':9, 'vocabulary':9}",
        "{'content':9, 'organization':9, 'grammar':9, 'vocabulary':9}",
        "{'content':8, 'organization':8, 'grammar':8, 'vocabulary':7}",
        "{'content':8, 'organization':8, 'grammar':9, 'vocabulary':8}"
    ]
}

# Create DataFrame
df = pd.DataFrame(data)

# Add prompt_id (mapping topics to IDs)
topic_to_prompt = {topic: idx+100 for idx, topic in enumerate(df['topic'].unique())}
df['prompt_id'] = df['topic'].map(topic_to_prompt)

# Reorder columns
df = df[['essay_id', 'essay_text', 'score', 'prompt_id', 'topic', 'grade_level', 'rubric_scores']]

# Save to CSV
df.to_csv('essay_scoring_dataset.csv', index=False)

print("Dataset saved as 'essay_scoring_dataset.csv'")
print(df.head())

Dataset saved as 'essay_scoring_dataset.csv'
   essay_id                                         essay_text  score  \
0         1  Education is the most powerful weapon which yo...    8.5   
1         2  Climate change is evident through rising tempe...    8.8   
2         3  Artificial intelligence revolutionizes healthc...    9.2   
3         4  Social media has transformed communication, en...    7.1   
4         5  The Industrial Revolution began in 18th centur...    8.7   

   prompt_id        topic  grade_level  \
0        100    education  high_school   
1        101  environment  high_school   
2        102   technology  high_school   
3        103      society  high_school   
4        104      history  high_school   

                                       rubric_scores  
0  {'content':9, 'organization':8, 'grammar':9, '...  
1  {'content':9, 'organization':9, 'grammar':8, '...  
2  {'content':10, 'organization':9, 'grammar':9, ...  
3  {'content':7, 'organization':7, 'grammar

In [5]:
# Load your essay dataset (replace with your data)
# Expected columns: 'essay_id', 'essay_text', 'score'
df = pd.read_csv('essay_scoring_dataset.csv')

# Text cleaning function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Preprocessing pipeline
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_text'] = df['essay_text'].apply(clean_text)
df['processed_text'] = df['cleaned_text'].apply(preprocess_text)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\acer/nltk_data'
    - 'C:\\Users\\acer\\anaconda3\\nltk_data'
    - 'C:\\Users\\acer\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\acer\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\acer\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
def extract_features(text):
    features = {}
    
    # Basic statistics
    features['char_count'] = len(text)
    features['word_count'] = len(text.split())
    features['sentence_count'] = textstat.sentence_count(text)
    features['avg_word_length'] = features['char_count'] / features['word_count'] if features['word_count'] > 0 else 0
    features['avg_sentence_length'] = features['word_count'] / features['sentence_count'] if features['sentence_count'] > 0 else 0
    
    # Readability metrics
    features['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    features['smog_index'] = textstat.smog_index(text)
    features['coleman_liau_index'] = textstat.coleman_liau_index(text)
    
    # Vocabulary diversity
    words = text.split()
    unique_words = set(words)
    features['lexical_diversity'] = len(unique_words) / len(words) if len(words) > 0 else 0
    
    # Grammar and style (simplified)
    features['pronoun_count'] = len([word for word in words if word in ['i', 'you', 'he', 'she', 'it', 'we', 'they']])
    features['preposition_count'] = len([word for word in words if word in ['in', 'on', 'at', 'by', 'for', 'with']])
    
    return features

# Extract features for all essays
feature_list = []
for text in df['cleaned_text']:
    feature_list.append(extract_features(text))
    
features_df = pd.DataFrame(feature_list)

In [None]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(text, max_length=512):
    # Tokenize text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=max_length, padding='max_length')
    
    # Get BERT embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Use mean of last hidden states as document embedding
    last_hidden_states = outputs.last_hidden_state
    doc_embedding = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
    
    return doc_embedding

# Get BERT embeddings (this may take a while)
bert_embeddings = np.array([get_bert_embeddings(text) for text in df['cleaned_text']])
bert_columns = [f'bert_{i}' for i in range(bert_embeddings.shape[1])]
bert_df = pd.DataFrame(bert_embeddings, columns=bert_columns)

In [None]:
# Combine all features
X = pd.concat([features_df, bert_df], axis=1)
y = df['score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Gradient Boosting Model
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

# Ensemble prediction
def ensemble_predict(X):
    rf_pred = rf_model.predict(X)
    gb_pred = gb_model.predict(X)
    return (rf_pred + gb_pred) / 2

In [None]:
# Predictions
y_pred = ensemble_predict(X_test)

# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
pearson_r = stats.pearsonr(y_test, y_pred)[0]

print(f"Mean Squared Error: {mse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Pearson Correlation: {pearson_r:.4f}")

In [None]:
from collections import defaultdict

def generate_feedback(essay_text):
    # Preprocess
    cleaned_text = clean_text(essay_text)
    processed_text = preprocess_text(cleaned_text)
    
    # Extract features
    features = extract_features(cleaned_text)
    
    # Get BERT embedding
    bert_embedding = get_bert_embeddings(cleaned_text)
    
    # Combine features
    features_df = pd.DataFrame([features])
    bert_df = pd.DataFrame([bert_embedding], columns=bert_columns)
    X = pd.concat([features_df, bert_df], axis=1)
    
    # Predict score
    score = ensemble_predict(X)[0]
    
    # Generate feedback
    feedback = defaultdict(list)
    
    # Word count feedback
    if features['word_count'] < 200:
        feedback['Structure'].append("Your essay is too short. Try to expand your ideas with more details and examples.")
    elif features['word_count'] > 1000:
        feedback['Structure'].append("Your essay is too long. Try to be more concise and focus on your main points.")
    
    # Readability feedback
    if features['flesch_reading_ease'] < 60:
        feedback['Style'].append("Your writing may be too complex. Consider using simpler sentence structures.")
    
    # Vocabulary feedback
    if features['lexical_diversity'] < 0.5:
        feedback['Vocabulary'].append("Try to use more varied vocabulary. Consider using synonyms and different expressions.")
    
    # Grammar feedback (simplified)
    if features['pronoun_count'] / features['word_count'] > 0.15:
        feedback['Grammar'].append("You may be overusing pronouns. Try to vary your sentence structure.")
    
    # Convert feedback to string
    feedback_str = f"Predicted Score: {score:.1f}/10\n\n"
    for category, comments in feedback.items():
        feedback_str += f"{category}:\n"
        for comment in comments:
            feedback_str += f"- {comment}\n"
        feedback_str += "\n"
    
    if len(feedback) == 0:
        feedback_str += "Good job! Your essay meets basic requirements. Keep up the good work!"
    
    return feedback_str

# Example usage
sample_essay = """
The importance of education cannot be overstated. Education is key to personal development. 
Education helps people get better jobs. Education makes society better. Everyone should go to school.
"""

print(generate_feedback(sample_essay))

In [None]:
import joblib
import pickle

# Save models
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(gb_model, 'gradient_boosting_model.pkl')

# Save BERT tokenizer and model
model.save_pretrained('./bert_model/')
tokenizer.save_pretrained('./bert_model/')

# Save feature columns for reference
with open('feature_columns.pkl', 'wb') as f:
    pickle.dump(list(X.columns), f)