In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd

# Read the three competition CSV files into DataFrames
train_df = pd.read_csv(
    '/kaggle/input/learning-agency-lab-automated-essay-scoring-2-0/train.csv'
)
test_df = pd.read_csv(
    '/kaggle/input/learning-agency-lab-automated-essay-scoring-2-0/test.csv'
)
sample_submission_df = pd.read_csv(
    '/kaggle/input/learning-agency-lab-automated-essay-scoring-2-0/sample_submission.csv'
)

# Preview the first rows of each frame under its own heading
for heading, frame in (
    ("Training Data:", train_df),
    ("\nTest Data:", test_df),
    ("\nSample Submission:", sample_submission_df),
):
    print(heading)
    print(frame.head())


Training Data:
  essay_id                                          full_text  score
0  000d118  Many people have car where they live. The thin...      3
1  000fe60  I am a scientist at NASA that is discussing th...      3
2  001ab80  People always wish they had the same technolog...      4
3  001bdc0  We all heard about Venus, the planet without a...      4
4  002ba53  Dear, State Senator\n\nThis is a letter to arg...      3

Test Data:
  essay_id                                          full_text
0  000d118  Many people have car where they live. The thin...
1  000fe60  I am a scientist at NASA that is discussing th...
2  001ab80  People always wish they had the same technolog...

Sample Submission:
  essay_id  score
0  000d118      3
1  000fe60      3
2  001ab80      4


In [3]:
# Display the structure of the datasets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after each report.
print("\nTraining Data Info:")
train_df.info()

print("\nTest Data Info:")
test_df.info()

# Display basic statistics of the training data (numeric columns only,
# which here is just the score column)
print("\nTraining Data Statistics:")
print(train_df.describe())



Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   full_text  17307 non-null  object
 2   score      17307 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 405.8+ KB
None

Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   3 non-null      object
 1   full_text  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes
None

Training Data Statistics:
              score
count  17307.000000
mean       2.948402
std        1.044899
min        1.000000
25%        2.000000
50%        3.000000
75%        4.000000
max        6.000000


In [5]:
# Show which columns each dataset provides
for label, frame in (
    ("Training Data Columns:", train_df),
    ("Test Data Columns:", test_df),
):
    print(label, frame.columns)


Training Data Columns: Index(['essay_id', 'full_text', 'score'], dtype='object')
Test Data Columns: Index(['essay_id', 'full_text'], dtype='object')


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import cohen_kappa_score

# Read the competition CSV files again so this cell starts from the raw data
train_df = pd.read_csv(
    '/kaggle/input/learning-agency-lab-automated-essay-scoring-2-0/train.csv'
)
test_df = pd.read_csv(
    '/kaggle/input/learning-agency-lab-automated-essay-scoring-2-0/test.csv'
)
sample_submission_df = pd.read_csv(
    '/kaggle/input/learning-agency-lab-automated-essay-scoring-2-0/sample_submission.csv'
)

# Report which columns each dataset provides
for label, frame in (
    ("Training Data Columns:", train_df),
    ("Test Data Columns:", test_df),
):
    print(label, frame.columns)

# Count missing entries per column in the training data
print("\nMissing values in Training Data:")
print(train_df.isna().sum())

# Names of the essay-text column and the score column
text_column = 'full_text'
target_column = 'score'

# Hold out 20% of the labelled essays for validation (fixed seed for repeatability)
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)
y_train, y_val = train_data[target_column], val_data[target_column]

# Fit the TF-IDF vocabulary on the training split only, then reuse the fitted
# vectorizer to transform the validation and test essays
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train = tfidf_vectorizer.fit_transform(train_data[text_column])
X_val, X_test = (
    tfidf_vectorizer.transform(part[text_column]) for part in (val_data, test_df)
)

# Train a ridge regressor on the TF-IDF features and predict the validation split
model = Ridge(alpha=1.0).fit(X_train, y_train)
val_predictions = model.predict(X_val)
# Evaluate the model using quadratic weighted kappa
def quadratic_weighted_kappa(y_true, y_pred, min_score=1, max_score=6):
    """Quadratic weighted kappa between true scores and continuous predictions.

    The predictions are clamped to ``[min_score, max_score]``, rounded to the
    nearest integer and cast to ``int`` before being scored, so out-of-range
    regressor outputs cannot introduce labels outside the scoring scale.

    Parameters
    ----------
    y_true : array-like
        Ground-truth integer scores.
    y_pred : array-like
        Raw (possibly fractional) model outputs; lists and arrays are accepted.
    min_score, max_score : int, optional
        Inclusive bounds of the scoring scale. The defaults (1 and 6) match
        the minimum and maximum of the training scores.

    Returns
    -------
    float
        ``cohen_kappa_score`` computed with ``weights='quadratic'``.
    """
    # Local import so the function does not depend on another cell having
    # imported numpy.
    import numpy as np

    predicted_labels = (
        np.clip(np.asarray(y_pred, dtype=float), min_score, max_score)
        .round()
        .astype(int)
    )
    return cohen_kappa_score(y_true, predicted_labels, weights='quadratic')

# Score the validation predictions with quadratic weighted kappa
val_score = quadratic_weighted_kappa(y_val, val_predictions)
print(f'Validation Quadratic Weighted Kappa: {val_score:.4f}')

# Predict on the test set
test_predictions = model.predict(X_test)

# Ridge is an unbounded regressor, so raw outputs can fall outside the range
# of scores seen in training. Clamp to that range before rounding so every
# submitted score lies on the scale observed in the training data.
min_score = train_df[target_column].min()
max_score = train_df[target_column].max()
test_scores = test_predictions.clip(min_score, max_score).round().astype(int)

# Prepare the submission: one integer score per test essay
submission_df = pd.DataFrame({
    'essay_id': test_df['essay_id'],
    'score': test_scores,
})

# Save the submission file
submission_df.to_csv('/kaggle/working/submission.csv', index=False)

print("Submission file created successfully.")


Training Data Columns: Index(['essay_id', 'full_text', 'score'], dtype='object')
Test Data Columns: Index(['essay_id', 'full_text'], dtype='object')

Missing values in Training Data:
essay_id     0
full_text    0
score        0
dtype: int64
Validation Quadratic Weighted Kappa: 0.6951
Submission file created successfully.
