In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory lazily and print the full path of every file found
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-4/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-4/test.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below.
# 'punkt_tab' is requested alongside 'punkt' because NLTK >= 3.9 loads the
# word_tokenize models from 'punkt_tab', while older releases read 'punkt';
# fetching both keeps the notebook runnable on either version.
# 'wordnet' is kept for the imported WordNetLemmatizer.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read the competition's train and test CSV files into DataFrames
train_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/learning-agency-lab-automated-essay-scoring-4/train.csv",
        "/kaggle/input/learning-agency-lab-automated-essay-scoring-4/test.csv",
    ),
)

In [8]:
# English stopwords, kept in a set for fast membership tests
stop_words = set(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text):
    """Normalise one essay into a space-separated string of content words.

    The text is tokenised with NLTK's ``word_tokenize``; tokens that are not
    purely alphabetic (punctuation, numbers) are dropped, the remaining
    tokens are lower-cased, and English stopwords are removed.

    Parameters
    ----------
    text : str
        Raw essay text. Any non-string value (for example the float NaN that
        pandas uses for an empty CSV cell) is treated as an empty essay.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or "" when nothing
        survives or the input was not a string.
    """
    # Missing cells arrive as float NaN when this is applied over a DataFrame
    # column; word_tokenize would raise on them, so short-circuit here.
    if not isinstance(text, str):
        return ""
    # Tokenization
    tokens = word_tokenize(text)
    # Remove punctuation and lowercase
    tokens = [word.lower() for word in tokens if word.isalpha()]
    # Remove stopwords (compared after lower-casing, so "The" is dropped too)
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [9]:
# Add a cleaned 'processed_text' column to both the train and test frames
for frame in (train_df, test_df):
    frame['processed_text'] = frame['full_text'].apply(preprocess_text)

In [10]:
# Hold out 20% of the essays for validation (fixed seed for a repeatable split)
essays = train_df['processed_text']
scores = train_df['score']
X_train, X_val, y_train, y_val = train_test_split(
    essays,
    scores,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features (max_features=2000) feeding a random forest classifier
tfidf_step = ('tfidf', TfidfVectorizer(max_features=2000))
forest_step = ('clf', RandomForestClassifier(random_state=42))
pipeline = Pipeline(steps=[tfidf_step, forest_step])

# Hyper-parameter grid; the 'clf__' prefix targets the forest step of the pipeline
param_grid = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_split=[2, 5],
)


In [11]:
# Quadratic weighted kappa is the model-selection metric
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# 3-fold grid search over param_grid, silent (verbose=0), with n_jobs=-1
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=kappa_scorer,
    cv=3,
    n_jobs=-1,
    verbose=0,
)
grid_search.fit(X_train, y_train)

In [12]:

# Get best model (GridSearchCV refits it on X_train with the winning parameters)
best_model = grid_search.best_estimator_

# Score the hold-out split produced by train_test_split. It was never seen
# during the grid search, so this is the out-of-sample check of the model
# before it is used for the submission.
val_predictions = best_model.predict(X_val)
val_kappa = cohen_kappa_score(y_val, val_predictions, weights='quadratic')
print(f"Validation quadratic weighted kappa: {val_kappa:.4f}")

# Make predictions
predictions = best_model.predict(test_df['processed_text'])

In [13]:
# Pair each test essay id with its predicted score and write the submission CSV
submission_columns = {
    'essay_id': test_df['essay_id'],
    'score': predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('submission.csv', index=False)