In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK corpora this notebook relies on: the stop-word lists read via
# `stopwords.words('english')` and the WordNet data backing `WordNetLemmatizer`.
# The second call is the cell's last expression, so its return value is what the
# cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [1]:
# Data Loading
# The data directory is kept in a single constant so the notebook can be pointed
# at another location by editing one line; the resolved paths are unchanged.
DATA_DIR = '/content'
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Data Cleaning
def clean_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    The input is coerced to ``str`` and lower-cased, every run of characters
    other than ASCII letters is collapsed to a single space, English stop
    words are dropped, and each remaining token is passed through
    ``WordNetLemmatizer.lemmatize``.

    Args:
        text: Any value; it is converted with ``str()`` first, so non-string
            inputs (e.g. NaN) are cleaned as their string representation.

    Returns:
        str: The cleaned tokens joined by single spaces (empty string when no
        token survives).
    """
    # Build the stop-word set and the lemmatizer once and reuse them on later
    # calls. `stopwords.words('english')` was previously re-evaluated for every
    # single token, and a new lemmatizer was constructed per document; a set
    # also makes each membership test O(1) instead of a list scan.
    resources = getattr(clean_text, '_resources', None)
    if resources is None:
        resources = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        clean_text._resources = resources
    stop_words, lemmatizer = resources

    # After lower-casing only a-z can remain as letters; collapsing whole runs
    # of other characters yields the same tokens as replace-then-squeeze.
    normalized = re.sub(r'[^a-z]+', ' ', str(text).lower()).strip()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in normalized.split()
        if token not in stop_words
    )

# Add a `cleaned_text` column to both frames, derived from `full_text`
# with the same cleaning function so train and test are processed alike.
for frame in (df_train, df_test):
    frame['cleaned_text'] = frame['full_text'].apply(clean_text)

NameError: name 'pd' is not defined

In [3]:
# Feature Engineering
# TF-IDF over 1- to 3-grams, capped at 10,000 features. The vectorizer is
# fitted on the full training frame and densified to float32.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
X = (
    vectorizer
    .fit_transform(df_train['cleaned_text'])
    .toarray()
    .astype(np.float32)
)
# Labels are shifted down by one, so the targets become `score - 1`.
y = df_train['score'].astype(int).sub(1)

In [4]:
# Train-Test Split
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Model Training
# Hyperparameters are collected in one mapping so the configuration can be
# read (and tweaked) separately from the estimator construction.
xgb_params = dict(
    n_estimators=500,
    max_depth=15,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model = XGBClassifier(**xgb_params)

# Progress markers around the fit call.
print("Training started...")
model.fit(X_train, y_train)
print("Training completed.")

Training started...


In [5]:
# Predictions
# `y_valid` holds zero-based class labels (the targets were built as
# `score - 1`), so the validation predictions are kept in that same label
# space. Shifting them by +1 here made the metrics and the confusion matrix
# compare labels that were off by one from each other.
y_pred_valid = model.predict(X_valid)



NameError: name 'model' is not defined

In [None]:
# Evaluation Metrics
# Accuracy and class-frequency-weighted F1 on the validation split. Both arrays
# must use the same label encoding for these numbers to be meaningful.
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred_valid)
f1 = f1_score(y_true=y_valid, y_pred=y_pred_valid, average='weighted')

# Report both metrics rounded to two decimals.
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")


In [None]:
# Visualization
# Confusion matrix of validation labels versus predictions.
cm = confusion_matrix(y_valid, y_pred_valid)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# `ConfusionMatrixDisplay.plot` creates its own figure when no axes is given,
# so a preceding `plt.figure(figsize=...)` only leaves an empty figure behind
# and its size is never applied. Create the axes explicitly and draw onto it.
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax)
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Predictions on Test Data
# Reuse the already-fitted vectorizer (transform only) so the test features
# share the training vocabulary, then densify to float32 as for training.
test_tfidf = vectorizer.transform(df_test['cleaned_text'])
X_test = test_tfidf.toarray().astype(np.float32)

# The model predicts zero-based classes; add 1 to map back to the score scale.
test_classes = model.predict(X_test)
df_test['predicted_score'] = test_classes + 1

# Save Final Predictions
# Only the essay identifier and its predicted score are written out.
submission = df_test[['essay_id', 'predicted_score']]
submission.to_csv('final_predictions.csv', index=False)
