## Imports and constants

In [2]:
import re
import ipywidgets as widgets

import numpy as np
from scipy import sparse
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import gensim
import string

from nltk.corpus import stopwords
from nltk import tokenize
from wordcloud import STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm

%matplotlib inline

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
SUMMARIES_TRAIN_FILE = '../../data/commonlit_evaluate_student_summaries/summaries_train.csv'
SUMMARIES_TEST_FILE = '../../data/commonlit_evaluate_student_summaries/summaries_test.csv'
PROMPTS_TRAIN_FILE = '../../data/commonlit_evaluate_student_summaries/prompts_train.csv'
PROMPTS_TEST_FILE = '../../data/commonlit_evaluate_student_summaries/prompts_test.csv'

## Datasets import

In [6]:
summaries_train_df = pd.read_csv(SUMMARIES_TRAIN_FILE)
summaries_test_df = pd.read_csv(SUMMARIES_TEST_FILE)
prompts_train_df = pd.read_csv(PROMPTS_TRAIN_FILE)
prompts_test_df = pd.read_csv(PROMPTS_TEST_FILE)

## Data preparation

In [7]:
nltk.download('stopwords')

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'good', 'bad', 'people']) #stopwords extended a bit
def preprocess_hard(text, join_back=True):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        # Stop Words Cleaning
        if (
            token not in gensim.parsing.preprocessing.STOPWORDS and 
            token not in stop_words
        ):
            result.append(token)
    if join_back:
        result = " ".join(result)
    return result

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrii\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
summaries_train_df['clean_text'] = summaries_train_df['text'].apply(preprocess_hard)
summaries_test_df['clean_text'] = summaries_test_df['text'].apply(preprocess_hard)
summaries_train_df

Unnamed: 0,student_id,prompt_id,text,content,wording,clean_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,wave experimentto reacted new leader governmen...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,rub soda smell away wouldnt smell meat tossed ...
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,egypt occupations social classes involved day ...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,highest class pharaohs gods nd highest class g...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,wave developed rapidly students genuinly belie...
...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538,sorts chemical concoctions meat fine shown quo...
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171,lowest classes slaves farmers slaves taken war...
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603,sorta start working structour theyed barley go...
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128,ideal tragety elements ideal start great trage...


In [11]:
prompts_train_df['clean_prompt_question'] = prompts_train_df['prompt_question'].apply(preprocess_hard)
prompts_train_df['clean_prompt_title'] = prompts_train_df['prompt_title'].apply(preprocess_hard)
prompts_train_df['clean_prompt_text'] = prompts_train_df['prompt_text'].apply(preprocess_hard)
prompts_test_df['clean_prompt_question'] = prompts_test_df['prompt_question'].apply(preprocess_hard)
prompts_test_df['clean_prompt_title'] = prompts_test_df['prompt_title'].apply(preprocess_hard)
prompts_test_df['clean_prompt_text'] = prompts_test_df['prompt_text'].apply(preprocess_hard)

prompts_train_df

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,clean_prompt_question,clean_prompt_title,clean_prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,summarize elements ideal tragedy described ari...,tragedy,chapter sequel said proceed consider poet aim ...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,complete sentences summarize structure ancient...,egyptian social structure,egyptian society structured like pyramid gods ...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,summarize wave developed short period time exp...,wave,background wave experiment took place cubberle...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",summarize ways factory cover spoiled meat cite...,excerpt jungle,member trimming beef cannery working sausage f...


In [12]:
summaries_train_df['clean_text_length_words'] = summaries_train_df['clean_text'].apply(lambda x: len(nltk.tokenize.word_tokenize(x)))
summaries_train_df["clean_text_length_symbols"] = summaries_train_df["clean_text"].apply(len)
summaries_train_df["clean_text_mean_word_length"] = (summaries_train_df["clean_text_length_symbols"] - summaries_train_df['clean_text_length_words'] + 1) / summaries_train_df['clean_text_length_words']
summaries_test_df['clean_text_length_words'] = summaries_test_df['clean_text'].apply(lambda x: len(nltk.tokenize.word_tokenize(x)))
summaries_test_df["clean_text_length_symbols"] = summaries_test_df["clean_text"].apply(len)
summaries_test_df["clean_text_mean_word_length"] = (summaries_test_df["clean_text_length_symbols"] - summaries_train_df['clean_text_length_words'] + 1) / summaries_train_df['clean_text_length_words']

summaries_train_df

Unnamed: 0,student_id,prompt_id,text,content,wording,clean_text,clean_text_length_words,clean_text_length_symbols,clean_text_mean_word_length
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,wave experimentto reacted new leader governmen...,28,203,6.285714
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,rub soda smell away wouldnt smell meat tossed ...,14,84,5.071429
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,egypt occupations social classes involved day ...,101,737,6.306931
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,highest class pharaohs gods nd highest class g...,14,98,6.071429
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,wave developed rapidly students genuinly belie...,87,670,6.712644
...,...,...,...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538,sorts chemical concoctions meat fine shown quo...,25,175,6.040000
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171,lowest classes slaves farmers slaves taken war...,23,165,6.217391
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603,sorta start working structour theyed barley go...,25,161,5.480000
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128,ideal tragety elements ideal start great trage...,26,165,5.384615


In [13]:
# Count the stop words in the text.
def count_stopwords(text: str) -> int:
    stopword_list = set(stopwords.words('english'))
    words = text.split()
    stopwords_count = sum(1 for word in words if word.lower() in stopword_list)
    return stopwords_count

# Count the punctuations in the text.
# punctuation_set -> !"#$%&'()*+, -./:;<=>?@[\]^_`{|}~
def count_punctuation(text: str) -> int:
    punctuation_set = set(string.punctuation)
    punctuation_count = sum(1 for char in text if char in punctuation_set)
    return punctuation_count

# Count the digits in the text.
def count_numbers(text: str) -> int:
    numbers = re.findall(r'\d+', text)
    numbers_count = len(numbers)
    return numbers_count

# This function applies all the above preprocessing functions on a text feature.
def streamlit_feature_engineer(dataframe: pd.DataFrame, feature: str = 'text') -> pd.DataFrame:
    dataframe[f'{feature}_stopword_cnt'] = dataframe[feature].apply(lambda x: count_stopwords(x))
    dataframe[f'{feature}_punct_cnt'] = dataframe[feature].apply(lambda x: count_punctuation(x))
    dataframe[f'{feature}_number_cnt'] = dataframe[feature].apply(lambda x: count_numbers(x))
    return dataframe

In [14]:
summaries_train_df = streamlit_feature_engineer(summaries_train_df)
summaries_test_df = streamlit_feature_engineer(summaries_test_df)
summaries_train_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,clean_text,clean_text_length_words,clean_text_length_symbols,clean_text_mean_word_length,text_stopword_cnt,text_punct_cnt,text_number_cnt
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,wave experimentto reacted new leader governmen...,28,203,6.285714,25,3,0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,rub soda smell away wouldnt smell meat tossed ...,14,84,5.071429,30,2,0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,egypt occupations social classes involved day ...,101,737,6.306931,98,38,0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,highest class pharaohs gods nd highest class g...,14,98,6.071429,11,6,2
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,wave developed rapidly students genuinly belie...,87,670,6.712644,92,30,3


In [15]:
prompts_train_df['merged_text'] = prompts_train_df['clean_prompt_title'] + '\n' + prompts_train_df['clean_prompt_question'] + '\n' + prompts_train_df['clean_prompt_text']
prompts_test_df['merged_text'] = prompts_test_df['clean_prompt_title'] + '\n' + prompts_test_df['clean_prompt_question'] + '\n' + prompts_test_df['clean_prompt_text']

In [16]:
prompts_ids_to_is_train = {prompt_id: i for i, prompt_id in zip(prompts_train_df.index, prompts_train_df['prompt_id'])}
summaries_train_df['prompt_i'] = summaries_train_df['prompt_id'].apply(lambda prompt_id: prompts_ids_to_is_train[prompt_id])
prompts_ids_to_is_test = {prompt_id: i for i, prompt_id in zip(prompts_test_df.index, prompts_test_df['prompt_id'])}
summaries_test_df['prompt_i'] = summaries_test_df['prompt_id'].apply(lambda prompt_id: prompts_ids_to_is_test[prompt_id])
summaries_train_df.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,clean_text,clean_text_length_words,clean_text_length_symbols,clean_text_mean_word_length,text_stopword_cnt,text_punct_cnt,text_number_cnt,prompt_i
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,wave experimentto reacted new leader governmen...,28,203,6.285714,25,3,0,2
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,rub soda smell away wouldnt smell meat tossed ...,14,84,5.071429,30,2,0,3
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,egypt occupations social classes involved day ...,101,737,6.306931,98,38,0,1
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,highest class pharaohs gods nd highest class g...,14,98,6.071429,11,6,2,1
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,wave developed rapidly students genuinly belie...,87,670,6.712644,92,30,3,2


## Modelling

In [17]:
FEATURES_TO_SCALE = ['clean_text_length_words', 'clean_text_mean_word_length', 'text_stopword_cnt', 'text_punct_cnt', 'text_number_cnt']

In [21]:
alpha1, alpha2 = 500, 500

In [22]:
X_train, y_train = summaries_train_df.loc[:, ['prompt_i', 'clean_text', *FEATURES_TO_SCALE]], summaries_train_df.loc[:, ['content', 'wording']]
X_test = summaries_test_df.loc[:, ['prompt_i', 'clean_text', *FEATURES_TO_SCALE]]

vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    lowercase=True,
    min_df=1,
    max_features=30000
)
vectorizer = vectorizer.fit(X_train['clean_text'])
train_summaries_vectors = vectorizer.transform(X_train['clean_text'])
test_summaries_vectors = vectorizer.transform(X_test['clean_text'])
train_merged_prompts_vectors = vectorizer.transform(prompts_train_df['merged_text'])
test_merged_prompts_vectors = vectorizer.transform(prompts_test_df['merged_text'])

scaler = RobustScaler().fit(X_train[FEATURES_TO_SCALE])
X_train[FEATURES_TO_SCALE] = scaler.transform(X_train[FEATURES_TO_SCALE])
X_test[FEATURES_TO_SCALE] = scaler.transform(X_test[FEATURES_TO_SCALE])

y_content_scaler = RobustScaler().fit(y_train['content'].to_numpy().reshape(-1, 1))
y_content_train_scaled = y_content_scaler.transform(y_train['content'].to_numpy().reshape(-1, 1))
y_wording_scaler = RobustScaler().fit(y_train['wording'].to_numpy().reshape(-1, 1))
y_wording_train_scaled = y_wording_scaler.transform(y_train['wording'].to_numpy().reshape(-1, 1))

cosine_scores_train = (train_merged_prompts_vectors[X_train['prompt_i']] @ train_summaries_vectors.T).diagonal().reshape(-1, 1)
cosine_scores_test = (test_merged_prompts_vectors[X_test['prompt_i']] @ test_summaries_vectors.T).diagonal().reshape(-1, 1)

X_train = sparse.hstack((
    train_summaries_vectors,
    sparse.coo_matrix(cosine_scores_train),
    sparse.coo_matrix(X_train[FEATURES_TO_SCALE].to_numpy()),
))
X_test = sparse.hstack((
    test_summaries_vectors,
    sparse.coo_matrix(cosine_scores_test),
    sparse.coo_matrix(X_test[FEATURES_TO_SCALE].to_numpy()),
))

model_content = Ridge(alpha=alpha1)
model_content.fit(X_train, y_content_train_scaled)
y_content_test_pred_scaled = model_content.predict(X_test)

y_content_test_pred = y_content_scaler.inverse_transform(y_content_test_pred_scaled)

model_wording = Ridge(alpha=alpha2)
model_wording.fit(X_train, y_wording_train_scaled)
y_wording_test_pred_scaled = model_wording.predict(X_test)

y_wording_test_pred = y_wording_scaler.inverse_transform(y_wording_test_pred_scaled)

In [24]:
summaries_test_df['content'] = y_content_test_pred
summaries_test_df['wording'] = y_wording_test_pred

In [25]:
summaries_test_df.head()

Unnamed: 0,student_id,prompt_id,text,clean_text,clean_text_length_words,clean_text_length_symbols,clean_text_mean_word_length,text_stopword_cnt,text_punct_cnt,text_number_cnt,prompt_i,content,wording
0,000000ffffff,abc123,Example text 1,example text,2,12,-0.535714,0,0,1,0,-2.063028,-2.264019
1,111111eeeeee,def789,Example text 2,example text,2,12,-0.071429,0,0,1,1,-1.996206,-2.167202
2,222222cccccc,abc123,Example text 3,example text,2,12,-0.871287,0,0,1,0,-2.111325,-2.333996
3,333333dddddd,def789,Example text 4,example text,2,12,-0.071429,0,0,1,1,-1.996206,-2.167202


In [26]:
summaries_test_df[['student_id', 'content', 'wording']].to_csv('submission.csv',index=False)