In [None]:
import re
import ipywidgets as widgets

import numpy as np
from scipy import sparse
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import gensim
import optuna
import string

from nltk.corpus import stopwords
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from lightgbm import LGBMRegressor
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from scipy import spatial
from tqdm import tqdm
from time import time
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

In [None]:
import warnings

# Silence every warning category for the remainder of the notebook session.
warnings.simplefilter("ignore")

In [None]:
# Folder holding the competition CSV files, relative to this notebook.
DATA_DIR = '../../data/commonlit_evaluate_student_summaries'

# Student summaries (train / test) and the prompts they answer (train / test).
SUMMARIES_TRAIN_FILE = f'{DATA_DIR}/summaries_train.csv'
SUMMARIES_TEST_FILE = f'{DATA_DIR}/summaries_test.csv'
PROMPTS_TRAIN_FILE = f'{DATA_DIR}/prompts_train.csv'
PROMPTS_TEST_FILE = f'{DATA_DIR}/prompts_test.csv'

In [None]:
# Read the four CSV files into DataFrames: summaries first, then prompts.
summaries_train_df, summaries_test_df = (
    pd.read_csv(csv_path) for csv_path in (SUMMARIES_TRAIN_FILE, SUMMARIES_TEST_FILE)
)
prompts_train_df, prompts_test_df = (
    pd.read_csv(csv_path) for csv_path in (PROMPTS_TRAIN_FILE, PROMPTS_TEST_FILE)
)

In [None]:
# Fetch the NLTK stop-word corpus so that `stopwords.words` can be used below.
nltk.download('stopwords')

# NLTK's English stop words plus a handful of extra words to drop.
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use', 'good', 'bad', 'people',
]
def preprocess_hard_base(text, join_back=True):
    """Tokenise `text` and drop stop words.

    Tokens come from `gensim.utils.simple_preprocess`; a token is kept only
    when it appears neither in gensim's `STOPWORDS` nor in the notebook-level
    `stop_words` list.

    Args:
        text: The raw text to clean.
        join_back: When true, return the kept tokens joined by single spaces;
            otherwise return them as a list.

    Returns:
        A space-joined string or a list of tokens, depending on `join_back`.
    """
    # One combined lookup set instead of testing two collections per token.
    excluded = gensim.parsing.preprocessing.STOPWORDS.union(stop_words)
    kept_tokens = [
        tok for tok in gensim.utils.simple_preprocess(text) if tok not in excluded
    ]
    return " ".join(kept_tokens) if join_back else kept_tokens

def preprocess_hard_stemming(text, join_back=True, stemmer = PorterStemmer()):
    """Run `preprocess_hard_base` on `text` and stem every surviving token.

    Args:
        text: The raw text to clean and stem.
        join_back: When true, return the stems joined by single spaces;
            otherwise return them as a list.
        stemmer: Object exposing a `stem(word)` method; the default instance
            is created once, when the function is defined.

    Returns:
        A space-joined string or a list of stems, depending on `join_back`.
    """
    stems = list(map(stemmer.stem, preprocess_hard_base(text, join_back=False)))
    if not join_back:
        return stems
    return " ".join(stems)

In [None]:
def collapse_dots(text):
    """Collapse every run of dots into a single dot.

    A run is two or more dots that are adjacent or separated only by space
    characters, e.g. ``"..."``, ``". ."`` or ``".. . ."``. Each run is
    replaced by one ``"."``; dots separated by anything other than spaces are
    left untouched.

    The previous implementation re-applied its substitution to the original
    `text` on every loop iteration instead of to the intermediate result, so
    it effectively performed one non-overlapping pass and left runs such as
    ``"..."`` only partially collapsed.

    Args:
        text: The string to clean.

    Returns:
        `text` with every dot run reduced to a single dot.
    """
    # A dot followed by one or more "(optional spaces) dot" groups is matched
    # greedily, so each maximal run is consumed and replaced in a single pass.
    return re.sub(r"\.(?: *\.)+", ".", text)

# TODO: check how this cleaning influences the different ML models.
def process_soft(text):
    """Lightly clean `text` while keeping its wording and punctuation.

    Non-string inputs are returned unchanged. For strings the steps are, in
    order: re-join the NLTK sentence split with single spaces, remove
    ``http...`` tokens, turn newline runs into ``". "``, drop a dot that
    directly follows one of ``! , : ; ?``, replace every run of characters
    outside the allowed set (Cyrillic and Latin letters, digits, ASCII
    punctuation; whitespace is not in the set) with one space, remove
    ``#...`` tokens, collapse dot runs via `collapse_dots`, and strip the ends.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or `text` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    cleaned = " ".join(tokenize.sent_tokenize(text))
    cleaned = re.sub(r"http\S+", "", cleaned)
    cleaned = re.sub(r"\n+", ". ", cleaned)
    # "!." -> "!", ",." -> "," and so on: keep the mark, drop the dot after it.
    for mark in ("!", ",", ":", ";", "?"):
        cleaned = cleaned.replace(mark + ".", mark)
    cleaned = re.sub("[^а-яА-Яa-zA-Z0-9!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ё]+", " ", cleaned)
    cleaned = re.sub(r"#\S+", "", cleaned)
    return collapse_dots(cleaned).strip()

In [None]:
# Add the soft-cleaned and the hard-cleaned (stop words removed, stemmed)
# variants of the summary text to both the train and the test frame.
for summaries_df in (summaries_train_df, summaries_test_df):
    summaries_df['text_soft_preprocessed'] = summaries_df['text'].apply(process_soft)
    summaries_df['text_hard_preprocessed_stemmed'] = summaries_df['text'].apply(preprocess_hard_stemming)

summaries_test_df.head()

In [None]:
# For every prompt column, add a `<column>_soft_preprocessed` and a
# `<column>_hard_preprocessed_stemmed` variant to both prompt frames.
# Soft columns are created before hard ones so the column order stays the same.
prompt_columns = ('prompt_question', 'prompt_title', 'prompt_text')
preprocessing_variants = (
    ('soft_preprocessed', process_soft),
    ('hard_preprocessed_stemmed', preprocess_hard_stemming),
)

for variant_suffix, preprocess in preprocessing_variants:
    for prompts_df in (prompts_train_df, prompts_test_df):
        for prompt_column in prompt_columns:
            prompts_df[f'{prompt_column}_{variant_suffix}'] = prompts_df[prompt_column].apply(preprocess)

prompts_test_df

In [None]:
def count_stopwords(text: str) -> int:
    """Count the stop words in `text`.

    The text is split on whitespace and each piece is lower-cased before being
    looked up in NLTK's English stop-word list.

    Args:
        text: The text to inspect.

    Returns:
        Number of whitespace-separated pieces that are English stop words.
    """
    english_stopwords = set(stopwords.words('english'))
    return len([piece for piece in text.split() if piece.lower() in english_stopwords])

def count_punctuation(text: str) -> int:
    """Count the punctuation characters in `text`.

    A character counts when it is a member of `string.punctuation`, i.e. one
    of the ASCII punctuation characters.

    Args:
        text: The text to inspect.

    Returns:
        Number of characters of `text` found in `string.punctuation`.
    """
    punctuation_chars = frozenset(string.punctuation)
    return len([ch for ch in text if ch in punctuation_chars])

def count_numbers(text: str) -> int:
    """Count the numbers in `text`.

    A number is a maximal run of consecutive digits, so ``"2023"`` counts once
    and ``"3.14"`` counts twice.

    Args:
        text: The text to inspect.

    Returns:
        Number of digit runs found in `text`.
    """
    return sum(1 for _ in re.finditer(r'\d+', text))

def streamlit_feature_engineer(dataframe: pd.DataFrame, feature: str = 'text', preprocessed_hard: bool = False) -> pd.DataFrame:
    """Add simple count features derived from the text column `feature`.

    The columns are added to `dataframe` in place and the same object is
    returned. `<feature>_word_cnt` (pieces obtained by splitting on a single
    space, so consecutive spaces produce extra empty pieces) and
    `<feature>_length` (character count) are always created. Unless
    `preprocessed_hard` is true, `<feature>_stopword_cnt`,
    `<feature>_punct_cnt` and `<feature>_number_cnt` are created as well.

    Args:
        dataframe: Frame holding the text column; it is modified in place.
        feature: Name of the text column to derive features from.
        preprocessed_hard: Skip the stop-word, punctuation and number counts.

    Returns:
        The same `dataframe` object with the new columns.
    """
    texts = dataframe[feature]
    dataframe[f'{feature}_word_cnt'] = texts.apply(lambda value: len(value.split(' ')))
    dataframe[f'{feature}_length'] = texts.apply(len)
    if preprocessed_hard:
        return dataframe

    extra_counters = (
        ('stopword_cnt', count_stopwords),
        ('punct_cnt', count_punctuation),
        ('number_cnt', count_numbers),
    )
    for suffix, counter in extra_counters:
        dataframe[f'{feature}_{suffix}'] = texts.apply(counter)
    return dataframe

In [None]:
# Count features for the soft-cleaned text (all counts) and for the
# hard-cleaned, stemmed text (word count and length only).
for text_feature, is_hard in (
    ("text_soft_preprocessed", False),
    ("text_hard_preprocessed_stemmed", True),
):
    summaries_train_df = streamlit_feature_engineer(summaries_train_df, feature=text_feature, preprocessed_hard=is_hard)
    summaries_test_df = streamlit_feature_engineer(summaries_test_df, feature=text_feature, preprocessed_hard=is_hard)

In [None]:
# Display the test summaries frame, now including the engineered count columns.
summaries_test_df

In [None]:
# Map each prompt_id to the index label of its row in the matching prompts
# frame and store it per summary as `prompt_i`; an unknown id raises KeyError.
# NOTE(review): `prompt_i` is later used as a positional row index into the
# prompt vector matrices, which presumably relies on the default RangeIndex.
prompts_ids_to_is = dict(zip(prompts_train_df['prompt_id'], prompts_train_df.index))
summaries_train_df['prompt_i'] = summaries_train_df['prompt_id'].apply(prompts_ids_to_is.__getitem__)

prompts_ids_to_is_test = dict(zip(prompts_test_df['prompt_id'], prompts_test_df.index))
summaries_test_df['prompt_i'] = summaries_test_df['prompt_id'].apply(prompts_ids_to_is_test.__getitem__)

summaries_test_df.head()

In [None]:
# Adapter that exposes a SentenceTransformer through sklearn's fit/transform calls
class SentenceTransformerVectorizer:
    """Vectorizer-like wrapper around a `SentenceTransformer` model."""

    def __init__(self, model='all-MiniLM-L6-v2', device="cuda"):
        """Create the underlying model.

        Args:
            model: Model name handed to `SentenceTransformer`.
            device: Device string handed to `SentenceTransformer`.
        """
        self.sent_tr = SentenceTransformer(model, device=device)

    def fit(self, texts):
        """Do nothing and return `self`; present only so callers can chain `fit`."""
        return self

    def transform(self, texts):
        """Encode `texts` and return the result as a CSR sparse matrix.

        Args:
            texts: Object with a `to_numpy()` method (e.g. a pandas Series)
                holding the texts to encode.

        Returns:
            `scipy.sparse.csr_matrix` built from the output of the model's `encode`.
        """
        raw_texts = texts.to_numpy()
        embeddings = self.sent_tr.encode(raw_texts)
        return sparse.csr_matrix(embeddings)


In [None]:
# ---------------------------------------------------------------------------
# `content` model: sentence-embedding features + LightGBM regressor.
# ---------------------------------------------------------------------------
features = [
    'text_soft_preprocessed',
    'text_soft_preprocessed_word_cnt',
    'text_soft_preprocessed_length',
    'text_soft_preprocessed_stopword_cnt',
    'text_soft_preprocessed_punct_cnt',
    'text_soft_preprocessed_number_cnt',
]
features_to_scale = [
    'text_soft_preprocessed_word_cnt',
    'text_soft_preprocessed_length',
    'text_soft_preprocessed_stopword_cnt',
    'text_soft_preprocessed_punct_cnt',
    'text_soft_preprocessed_number_cnt',
]
vectorizer = SentenceTransformerVectorizer()
vectorizer_feature = "text_soft_preprocessed"
prompt_processed_features = {
  'prompt_question': 'prompt_question_soft_preprocessed',
  'prompt_title': 'prompt_title_soft_preprocessed',
  'prompt_text': 'prompt_text_soft_preprocessed'
}
target = "content"
lightgbm_content_params = {
    'lambda_l1': 0.1296119760419726,
    'lambda_l2': 0.049549534856476465,
    'learning_rate': 0.03336396209486242,
    'num_leaves': 140,
    'max_depth': 1,
    'n_estimators': 186,
    'feature_fraction': 0.981443658599143,
    'bagging_fraction': 0.6007548488211474,
    # NOTE(review): 'bagging_freq_1' is not a documented LightGBM parameter
    # name (the documented one is 'bagging_freq'), so this entry is presumably
    # ignored and bagging stays off - confirm against the tuning run before
    # renaming it.
    'bagging_freq_1': 12,
    'min_child_samples': 64
}


def _rowwise_cosine(left, right):
    """Cosine similarity between row i of `left` and row i of `right`.

    Both inputs must have the same shape. The computation stays sparse, so no
    dense copy of the matrices is made.

    Returns:
        Float array of shape (n_rows, 1); a row pair in which either vector is
        all zeros gets similarity 0.0.
    """
    left = sparse.csr_matrix(left, dtype=np.float64)
    right = sparse.csr_matrix(right, dtype=np.float64)
    dots = np.asarray(left.multiply(right).sum(axis=1)).ravel()
    left_norms = np.sqrt(np.asarray(left.multiply(left).sum(axis=1)).ravel())
    right_norms = np.sqrt(np.asarray(right.multiply(right).sum(axis=1)).ravel())
    denominators = left_norms * right_norms
    scores = np.zeros_like(dots)
    # Leave 0.0 where a norm is zero instead of dividing by zero.
    np.divide(dots, denominators, out=scores, where=denominators > 0)
    return scores.reshape(-1, 1)


X_train, y_train = summaries_train_df.loc[:, ['prompt_i', *features]], summaries_train_df.loc[:, target]
X_test = summaries_test_df.loc[:, ['prompt_i', *features]]

# Vectorise the summaries.
vectorizer = vectorizer.fit(X_train[vectorizer_feature])
train_summaries_vectors = vectorizer.transform(X_train[vectorizer_feature])
test_summaries_vectors = vectorizer.transform(X_test[vectorizer_feature])

# Vectorise the prompts. Train and test summaries each need the vectors of
# their OWN prompts frame: `prompt_i` of a test row indexes `prompts_test_df`,
# so looking it up in the train prompt vectors would compare the summary with
# an unrelated prompt.
prompts_texts_vectors = vectorizer.transform(prompts_train_df[prompt_processed_features['prompt_text']])
prompts_titles_vectors = vectorizer.transform(prompts_train_df[prompt_processed_features['prompt_title']])
prompts_questions_vectors = vectorizer.transform(prompts_train_df[prompt_processed_features['prompt_question']])

test_prompts_texts_vectors = vectorizer.transform(prompts_test_df[prompt_processed_features['prompt_text']])
test_prompts_titles_vectors = vectorizer.transform(prompts_test_df[prompt_processed_features['prompt_title']])
test_prompts_questions_vectors = vectorizer.transform(prompts_test_df[prompt_processed_features['prompt_question']])

# Scale the numeric count features with statistics from the training data only.
scaler = RobustScaler().fit(X_train[features_to_scale])
X_train[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# Scale the target; the scaler works on 2-D input, hence the reshape.
y_scaler = RobustScaler().fit(y_train.to_numpy().reshape(-1, 1))
y_train_scaled = y_scaler.transform(y_train.to_numpy().reshape(-1, 1))

# Similarity of every summary to the text, title and question of its prompt.
train_prompt_rows = X_train['prompt_i'].to_numpy()
test_prompt_rows = X_test['prompt_i'].to_numpy()

cosine_scores_train_prompts_texts = _rowwise_cosine(train_summaries_vectors, prompts_texts_vectors[train_prompt_rows])
cosine_scores_train_prompts_titles = _rowwise_cosine(train_summaries_vectors, prompts_titles_vectors[train_prompt_rows])
cosine_scores_train_prompts_questions = _rowwise_cosine(train_summaries_vectors, prompts_questions_vectors[train_prompt_rows])

cosine_scores_test_prompts_texts = _rowwise_cosine(test_summaries_vectors, test_prompts_texts_vectors[test_prompt_rows])
cosine_scores_test_prompts_titles = _rowwise_cosine(test_summaries_vectors, test_prompts_titles_vectors[test_prompt_rows])
cosine_scores_test_prompts_questions = _rowwise_cosine(test_summaries_vectors, test_prompts_questions_vectors[test_prompt_rows])

# Final design matrices: summary vectors | 3 cosine scores | scaled counts.
X_train = sparse.hstack((
    train_summaries_vectors,
    sparse.coo_matrix(cosine_scores_train_prompts_texts),
    sparse.coo_matrix(cosine_scores_train_prompts_titles),
    sparse.coo_matrix(cosine_scores_train_prompts_questions),
    sparse.coo_matrix(X_train[features_to_scale].to_numpy()),
), format="csr")
X_test = sparse.hstack((
    test_summaries_vectors,
    sparse.coo_matrix(cosine_scores_test_prompts_texts),
    sparse.coo_matrix(cosine_scores_test_prompts_titles),
    sparse.coo_matrix(cosine_scores_test_prompts_questions),
    sparse.coo_matrix(X_test[features_to_scale].to_numpy()),
), format="csr")

model = LGBMRegressor(**lightgbm_content_params, verbose = -1)
# The regressor expects a 1-D target, so flatten the (n, 1) scaled column.
model.fit(X_train, y_train_scaled.ravel())
y_train_pred_scaled = model.predict(X_train)
y_test_pred_scaled = model.predict(X_test)

# `predict` returns a 1-D array but the scaler requires 2-D input; reshape for
# the inverse transform, then flatten so the result fits a DataFrame column.
y_test_pred_content = y_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).ravel()

In [None]:
# ---------------------------------------------------------------------------
# `wording` model: n-gram count features + LightGBM regressor.
# ---------------------------------------------------------------------------
features = [
    'text_hard_preprocessed_stemmed',
    'text_hard_preprocessed_stemmed_word_cnt',
    'text_hard_preprocessed_stemmed_length',
    'text_soft_preprocessed_stopword_cnt',
    'text_soft_preprocessed_punct_cnt',
    'text_soft_preprocessed_number_cnt',
]
features_to_scale = [
    'text_hard_preprocessed_stemmed_word_cnt',
    'text_hard_preprocessed_stemmed_length',
    'text_soft_preprocessed_stopword_cnt',
    'text_soft_preprocessed_punct_cnt',
    'text_soft_preprocessed_number_cnt',
]
vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    lowercase=True,
    min_df=1,
    max_features=30000
)
vectorizer_feature = "text_hard_preprocessed_stemmed"
prompt_processed_features = {
  'prompt_question': 'prompt_question_hard_preprocessed_stemmed',
  'prompt_title': 'prompt_title_hard_preprocessed_stemmed',
  'prompt_text': 'prompt_text_hard_preprocessed_stemmed'
}
target = "wording"
lightgbm_wording_params={
    'lambda_l1': 0.5499102131489506,
    'lambda_l2': 0.13682389299339068,
    'learning_rate': 0.179791006382902,
    'num_leaves': 2,
    'max_depth': 1,
    'n_estimators': 121,
    'feature_fraction': 0.7792526204040188,
    'bagging_fraction': 0.41045226793565176,
    # NOTE(review): 'bagging_freq_1' is not a documented LightGBM parameter
    # name (the documented one is 'bagging_freq'), so this entry is presumably
    # ignored and bagging stays off - confirm against the tuning run before
    # renaming it.
    'bagging_freq_1': 20,
    'min_child_samples': 45
}


def _rowwise_cosine(left, right):
    """Cosine similarity between row i of `left` and row i of `right`.

    Both inputs must have the same shape. The computation stays sparse, so the
    wide count matrices are never converted to dense arrays.

    Returns:
        Float array of shape (n_rows, 1); a row pair in which either vector is
        all zeros gets similarity 0.0.
    """
    left = sparse.csr_matrix(left, dtype=np.float64)
    right = sparse.csr_matrix(right, dtype=np.float64)
    dots = np.asarray(left.multiply(right).sum(axis=1)).ravel()
    left_norms = np.sqrt(np.asarray(left.multiply(left).sum(axis=1)).ravel())
    right_norms = np.sqrt(np.asarray(right.multiply(right).sum(axis=1)).ravel())
    denominators = left_norms * right_norms
    scores = np.zeros_like(dots)
    # Leave 0.0 where a norm is zero instead of dividing by zero.
    np.divide(dots, denominators, out=scores, where=denominators > 0)
    return scores.reshape(-1, 1)


X_train, y_train = summaries_train_df.loc[:, ['prompt_i', *features]], summaries_train_df.loc[:, target]
X_test = summaries_test_df.loc[:, ['prompt_i', *features]]

# Learn the vocabulary on the training summaries, then vectorise both splits.
vectorizer = vectorizer.fit(X_train[vectorizer_feature])
train_summaries_vectors = vectorizer.transform(X_train[vectorizer_feature])
test_summaries_vectors = vectorizer.transform(X_test[vectorizer_feature])

# Vectorise the prompts. Train and test summaries each need the vectors of
# their OWN prompts frame: `prompt_i` of a test row indexes `prompts_test_df`,
# so looking it up in the train prompt vectors would compare the summary with
# an unrelated prompt.
prompts_texts_vectors = vectorizer.transform(prompts_train_df[prompt_processed_features['prompt_text']])
prompts_titles_vectors = vectorizer.transform(prompts_train_df[prompt_processed_features['prompt_title']])
prompts_questions_vectors = vectorizer.transform(prompts_train_df[prompt_processed_features['prompt_question']])

test_prompts_texts_vectors = vectorizer.transform(prompts_test_df[prompt_processed_features['prompt_text']])
test_prompts_titles_vectors = vectorizer.transform(prompts_test_df[prompt_processed_features['prompt_title']])
test_prompts_questions_vectors = vectorizer.transform(prompts_test_df[prompt_processed_features['prompt_question']])

# Scale the numeric count features with statistics from the training data only.
scaler = RobustScaler().fit(X_train[features_to_scale])
X_train[features_to_scale] = scaler.transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# Scale the target; the scaler works on 2-D input, hence the reshape.
y_scaler = RobustScaler().fit(y_train.to_numpy().reshape(-1, 1))
y_train_scaled = y_scaler.transform(y_train.to_numpy().reshape(-1, 1))

# Similarity of every summary to the text, title and question of its prompt.
train_prompt_rows = X_train['prompt_i'].to_numpy()
test_prompt_rows = X_test['prompt_i'].to_numpy()

cosine_scores_train_prompts_texts = _rowwise_cosine(train_summaries_vectors, prompts_texts_vectors[train_prompt_rows])
cosine_scores_train_prompts_titles = _rowwise_cosine(train_summaries_vectors, prompts_titles_vectors[train_prompt_rows])
cosine_scores_train_prompts_questions = _rowwise_cosine(train_summaries_vectors, prompts_questions_vectors[train_prompt_rows])

cosine_scores_test_prompts_texts = _rowwise_cosine(test_summaries_vectors, test_prompts_texts_vectors[test_prompt_rows])
cosine_scores_test_prompts_titles = _rowwise_cosine(test_summaries_vectors, test_prompts_titles_vectors[test_prompt_rows])
cosine_scores_test_prompts_questions = _rowwise_cosine(test_summaries_vectors, test_prompts_questions_vectors[test_prompt_rows])

# Final design matrices: summary vectors | 3 cosine scores | scaled counts.
X_train = sparse.hstack((
    train_summaries_vectors,
    sparse.coo_matrix(cosine_scores_train_prompts_texts),
    sparse.coo_matrix(cosine_scores_train_prompts_titles),
    sparse.coo_matrix(cosine_scores_train_prompts_questions),
    sparse.coo_matrix(X_train[features_to_scale].to_numpy()),
), format="csr")
X_test = sparse.hstack((
    test_summaries_vectors,
    sparse.coo_matrix(cosine_scores_test_prompts_texts),
    sparse.coo_matrix(cosine_scores_test_prompts_titles),
    sparse.coo_matrix(cosine_scores_test_prompts_questions),
    sparse.coo_matrix(X_test[features_to_scale].to_numpy()),
), format="csr")

model = LGBMRegressor(**lightgbm_wording_params, verbose = -1)
# The regressor expects a 1-D target, so flatten the (n, 1) scaled column.
model.fit(X_train, y_train_scaled.ravel())
y_train_pred_scaled = model.predict(X_train)
y_test_pred_scaled = model.predict(X_test)

# `predict` returns a 1-D array but the scaler requires 2-D input; reshape for
# the inverse transform, then flatten so the result fits a DataFrame column.
y_test_pred_wording = y_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).ravel()

In [None]:
# Store the predictions of the two models on the test frame, using the
# target names `content` and `wording` as column names.
summaries_test_df['content'] = y_test_pred_content
summaries_test_df['wording'] = y_test_pred_wording

In [None]:
# Preview the first rows of the test frame, now including both prediction columns.
summaries_test_df.head()

In [None]:
# Write student_id together with the two predicted columns to submission.csv,
# leaving out the DataFrame index.
summaries_test_df[['student_id', 'content', 'wording']].to_csv('submission.csv',index=False)