**Prepare and setup environment**

In [148]:
#import libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn import metrics
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

**Load the Datasets**

In [149]:
prompts_train_dataset = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
print("Full prompts train dataset shape is ",prompts_train_dataset.shape)
summaries_train_dataset = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
print("Full summaries train dataset shape is ",summaries_train_dataset.shape)

Full prompts train dataset shape is  (4, 4)
Full summaries train dataset shape is  (7165, 5)


In [150]:
prompts_train_dataset.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [151]:
prompts_train_dataset.keys()

Index(['prompt_id', 'prompt_question', 'prompt_title', 'prompt_text'], dtype='object')

In [152]:
summaries_train_dataset.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


In [153]:
summaries_train_dataset.keys()

Index(['student_id', 'prompt_id', 'text', 'content', 'wording'], dtype='object')

**Merge the Prompts and Summaries Datasets**

In [154]:
train_dataset = summaries_train_dataset.merge(prompts_train_dataset,on='prompt_id')
train_dataset.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


**Calculate the numeric features from the dataset.**

In [155]:
def count_syllables(word):
    vowels = "aeiouAEIOU"
    count = 0
    for char in word:
        if char in vowels:
            count += 1
    return count

def calculate_reading_ease(text):
    words = re.findall(r'\b\w+\b', text)
    
    total_words = len(words)
    sentences = re.split(r'[.!?]', text)
    total_sentences = len(sentences)
    total_syllables = sum(count_syllables(word) for word in words)
    
    reading_ease = 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
    
    return reading_ease

def count_punctuation(text):
    punctuation_set = set(string.punctuation)
    punctuation_count = sum(1 for char in text if char in punctuation_set)
    return punctuation_count

def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    text_without_punctuation = text.translate(translator)
    return text_without_punctuation

def count_stopwords(text):
    stopword_list = set(stopwords.words('english'))
    words = text.split()
    stopwords_count = sum(1 for word in words if word.lower() in stopword_list)
    return stopwords_count

def remove_stopwords(text):
    words = text.split()
    
    stop_words = set(stopwords.words('english'))
    
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    filtered_text = ' '.join(filtered_words)
    
    return filtered_text

def find_word_overlap_ratio(essay, summary):
    essay = remove_punctuation(essay)
    essay = remove_stopwords(essay)
    summary = remove_punctuation(summary)
    summary = remove_stopwords(summary)

    essay_words = essay.split()
    summary_words = summary.split()

    essay_word_set = set(essay_words)
    summary_word_set = set(summary_words)

    overlapping_words = essay_word_set.intersection(summary_word_set)
    overlap_count = len(overlapping_words)

    if len(essay_words) == 0:
        return 0.0
    else:
        ratio = overlap_count / len(essay_words)
        return ratio * 100

def find_disjoint_words_ratio(essay,summary):
    essay = remove_punctuation(essay)
    essay = remove_stopwords(essay)
    summary = remove_punctuation(summary)
    summary = remove_stopwords(summary)

    essay_words = essay.split()
    summary_words = summary.split()

    essay_word_set = set(essay_words)
    summary_word_set = set(summary_words)

    overlapping_words = essay_word_set.difference(summary_word_set)
    overlap_count = len(overlapping_words)

    if len(essay_words) == 0:
        return 0.0
    else:
        ratio = overlap_count / len(essay_words)
        return ratio * 100

In [156]:
def feature_engineer(dataset,feature = 'text'):
    dataset[feature+'_length'] = dataset[feature].apply(lambda x : len(x))
    dataset['punctuations_count'] = dataset[feature].apply(lambda x : count_punctuation(x))
    dataset['stop_words_count'] = dataset[feature].apply(lambda x : count_stopwords(x))
    dataset['reading_ease'] = dataset[feature].apply(lambda x : calculate_reading_ease(x))
    dataset['word_overlap_ratio'] = dataset.apply(lambda x : find_word_overlap_ratio(x['prompt_text'],x[feature]),axis=1)
    dataset['disjoint_words_ratio'] = dataset.apply(lambda x : find_disjoint_words_ratio(x['prompt_text'],x[feature]),axis=1)
    return dataset

**Extract Feature Columns from the dataset for Content and Wording**

In [157]:
train_dataset = feature_engineer(train_dataset)
train_dataset.describe()

Unnamed: 0,content,wording,text_length,punctuations_count,stop_words_count,reading_ease,word_overlap_ratio,disjoint_words_ratio
count,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0,7165.0
mean,-0.014853,-0.063072,418.776971,9.355059,36.306769,49.621101,5.260539,67.637867
std,1.043569,1.036048,307.833685,9.474397,25.813092,13.464521,4.124065,4.826268
min,-1.729859,-1.962614,114.0,0.0,5.0,-82.967208,0.0,23.322684
25%,-0.799545,-0.87272,216.0,4.0,19.0,41.553571,2.588997,65.248227
50%,-0.093814,-0.081769,320.0,6.0,28.0,50.086786,4.20712,67.633929
75%,0.49966,0.503833,513.0,12.0,45.0,58.50062,6.389776,70.212766
max,3.900326,4.310693,3940.0,162.0,315.0,97.433333,54.952077,77.955272


In [158]:
content_score = train_dataset['content']
wording_score = train_dataset['wording']
wording_feature_columns = train_dataset.drop(columns=['prompt_id','prompt_question','prompt_title','student_id','prompt_text','text','content','wording'],axis=1)
content_feature_columns = wording_feature_columns
wording_columns = wording_feature_columns
content_columns = content_feature_columns

In [159]:
scaler = MinMaxScaler()
wording_feature_columns = scaler.fit_transform(wording_feature_columns)
content_feature_columns = scaler.fit_transform(content_feature_columns)

**Split the Dataset in to Train and Test data**

In [160]:
Content_train,Content_test , content_score_train , content_score_test = train_test_split(content_feature_columns,content_score,test_size=0.2, random_state=42)
Wording_train,Wording_test , wording_score_train , wording_score_test = train_test_split(wording_feature_columns,wording_score,test_size=0.2, random_state=42)

**Train the model**

In [161]:
content_model = RandomForestRegressor()
content_model.fit(Content_train,content_score_train)
wording_model = RandomForestRegressor()
wording_model.fit(Wording_train,wording_score_train)

**Trying Predicting the Summary Scores using Test Data**

In [162]:
content_pred = content_model.predict(Content_test)
wording_pred = wording_model.predict(Wording_test)

****Calculate the Model Metrics****

In [163]:
content_mse = mean_squared_error(content_score_test,content_pred)
wording_mse = mean_squared_error(wording_score_test,wording_pred)
print('content_mse is ',content_mse)
print('wording_mse is ',wording_mse)

content_mse is  0.22276122363759246
wording_mse is  0.40127759426631654


**Submission**

In [164]:
df_test_prompt = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv')
df_test_summaries = pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv')

In [165]:
df_test = df_test_summaries.merge(df_test_prompt, on='prompt_id')

In [166]:
processed_test_df = feature_engineer(df_test)

In [167]:
test_ds = processed_test_df[content_columns.columns.to_list()]

In [168]:
processed_test_df['content'] = content_model.predict(test_ds)
processed_test_df['wording'] = wording_model.predict(test_ds)

In [169]:
processed_test_df.head()

Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text,text_length,punctuations_count,stop_words_count,reading_ease,word_overlap_ratio,disjoint_words_ratio,content,wording
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...,14,0,0,90.99,0.0,100.0,0.802849,0.400949
1,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...,14,0,0,90.99,0.0,100.0,0.802849,0.400949
2,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...,14,0,0,90.99,0.0,100.0,0.802849,0.400949
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...,14,0,0,90.99,0.0,100.0,0.802849,0.400949


In [170]:
processed_test_df[['student_id', 'content', 'wording']].to_csv('submission.csv',index=False)
display(pd.read_csv('submission.csv'))

Unnamed: 0,student_id,content,wording
0,000000ffffff,0.802849,0.400949
1,222222cccccc,0.802849,0.400949
2,111111eeeeee,0.802849,0.400949
3,333333dddddd,0.802849,0.400949
