In [1]:
!pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [2]:
!git clone "https://github.com/Sopralapanca/CommonLitChallenge.git"

Cloning into 'CommonLitChallenge'...
remote: Enumerating objects: 79, done.[K
remote: Counting objects: 100% (79/79), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 79 (delta 30), reused 44 (delta 12), pack-reused 0[K
Receiving objects: 100% (79/79), 1.99 MiB | 4.59 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [3]:
import pandas as pd
summaries_train_path = "/content/CommonLitChallenge/data/summaries_train.csv"
prompt_train_path = "/content/CommonLitChallenge/data/prompts_train.csv"

summaries_test_path = "/content/CommonLitChallenge/data/summaries_test.csv"
prompt_test_path = "/content/CommonLitChallenge/data/prompts_test.csv"

train_data = pd.read_csv(summaries_train_path, sep=',', index_col=0)
prompt_data = pd.read_csv(prompt_train_path, sep=',', index_col=0)


In [4]:
prompt_data.head()

Unnamed: 0_level_0,prompt_question,prompt_title,prompt_text
prompt_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \nAs the sequel to what has already...
3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \nThe Third Wave experiment took pl...
ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


In [5]:
from nltk.util import ngrams
from collections import Counter
import nltk
nltk.download('punkt')  # Download the required resources for tokenization

def count_ngrams(text, n):
  words = nltk.word_tokenize(text)
  ngram_counts = Counter(ngrams(words, n))
  return ngram_counts

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
del_columns = []

for n in range(2, 5):
  col = f"{n}grams-prompttext-count"
  prompt_data[col] = prompt_data.apply(lambda row: count_ngrams(row["prompt_text"], n), axis=1)

  del_columns.append(col)

  col = f"{n}grams-text-count"
  train_data[col] = train_data.apply(lambda row: count_ngrams(row["text"], n), axis=1)

  del_columns.append(col)

prompt_data["prompt_text_length"] = prompt_data["prompt_text"].apply(len)

In [7]:
training_data = train_data.merge(prompt_data, on='prompt_id')

In [8]:
def count_cooccurring_ngrams(text, prompt_text):
    cooccurring_count = sum((text & prompt_text).values())
    return cooccurring_count

In [9]:
for n in range(2,5):
  text_col = f"{n}grams-text-count"
  prompt_col = f"{n}grams-prompttext-count"

  new_col=f"{n}grams-cooccurence-count"
  training_data[new_col] = training_data.apply(lambda row: count_cooccurring_ngrams(row[text_col], row[prompt_col]), axis=1)

In [10]:
training_data.drop(columns=del_columns, inplace=True)

In [11]:
training_data["text_length"] = training_data["text"].apply(len)
training_data["length_ratio"] = training_data["text_length"] / training_data["prompt_text_length"]

In [12]:
from spellchecker import SpellChecker

spell = SpellChecker()

def misspelled_counter(text):
  words = text.split()
  misspelled = spell.unknown(words)
  return len(misspelled)

In [13]:
training_data["misspelled_counter"] = training_data["text"].apply(lambda x: misspelled_counter(x))

In [14]:
# normalize the data taking into consideration the prompt title

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

def normalize_col(training_data, col):
  # Create a new DataFrame to store the normalized text length values
  normalized_df = pd.DataFrame()

  # Group by 'prompt_title' and apply the normalization separately for each group
  for title, group in training_data.groupby('prompt_title'):
      normalized_text_length = scaler.fit_transform(group[[col]])
      new_name = "normalized_"+col
      group[new_name] = normalized_text_length
      normalized_df = pd.concat([normalized_df, group])
  training_data = normalized_df.copy()
  return training_data

In [15]:
normalize_cols = ["text_length", "misspelled_counter", "2grams-cooccurence-count", "3grams-cooccurence-count", "4grams-cooccurence-count"]
for col in normalize_cols:
  training_data = normalize_col(training_data, col)

In [16]:
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocessText(text):
    # replace newline with space
    text = text.replace("\n", "")

    text = text.replace('\r', '')
    # Replace curly apostrophe with straight single quote
    text = text.replace('’', "'")

    # Normalize spaces around punctuation marks
    #text = re.sub(r'\s+', ' ', text)
    #text = re.sub(r'\s([.,!?])', r'\1', text)
    #text = re.sub(r'([.,!?])\s', r'\1', text)
    text = text.strip()

    # lower case
    text = text.lower()

    # split text
    words = text.split()

    # stop word removal
    words = [w for w in words if not w in stop_words]

    # stemming
    #words = [stemmer.stem(w) for w in words]

    # lemmatization
    words = [lemmatizer.lemmatize(w) for w in words]

    text = ' '.join(words)

    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [17]:
def add_row(df1, df2):
  row = df2.unique().tolist()[0]
  row = preprocessText(row)
  combined_data = pd.concat([pd.Series([row]),df1.loc[:]]).reset_index(drop=True) #append row on the head of the dataframe
  return combined_data

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Group by 'prompt_id' and compute TF-IDF separately for each class
tfidf_vectorizers = {}

for class_id, group in training_data.groupby('prompt_id'):
    text_data = group['text'].apply(preprocessText)


    prompt_question_data = group['prompt_question']
    prompt_title_data = group['prompt_title']
    prompt_text_data = group['prompt_text']

    # Concatenate the preprocessed data for TF-IDF calculation
    combined_data = add_row(text_data, prompt_question_data)
    combined_data = add_row(combined_data, prompt_title_data)
    combined_data = add_row(combined_data, prompt_text_data)


    # Compute TF-IDF
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(combined_data)
    tfidf_vectorizers[class_id] = {'vectorizer': tfidf_vectorizer, 'matrix': tfidf_matrix}

In [21]:
# VERSIONE CHRIS
# import simpy

# average_tfidf_scores = {}

# # Calculate TF-IDF scores for each document
# for class_id, group in training_data.groupby('prompt_id'):
#     tfidf_vectorizer = tfidf_vectorizers[class_id]['vectorizer']
#     tfidf_matrix = tfidf_vectorizers[class_id]['matrix']
#     tfidf_matrix = tfidf_matrix[3:] #remove first 3 rows f the matrix since they belongs to prompt_text, prompt_question, prompt_title
#     modulus = sympy.randprime(tfidf_matrix.shape[0]*(10**-2), tfidf_matrix.shape[0])
#     # Iterate through documents and calculate TF-IDF scores
#     for index, row in group.iterrows():
#         doc_tfidf = tfidf_matrix[index - group.index[0]].toarray()[0]

#         doc_tfidf = doc_tfidf[doc_tfidf>0]

#         # Calculate the average TF-IDF score for the document
#         gamma = 1e-2
#         single_tfidf_score = sum([t**(gamma*i) for i, t in enumerate(doc_tfidf)]) % modulus

#         average_tfidf_scores[index] = single_tfidf_score

average_tfidf_scores = {}

# Calculate TF-IDF scores for each document
for class_id, group in training_data.groupby('prompt_id'):
    tfidf_vectorizer = tfidf_vectorizers[class_id]['vectorizer']
    tfidf_matrix = tfidf_vectorizers[class_id]['matrix']
    tfidf_matrix = tfidf_matrix[3:] #remove first 3 rows f the matrix since they belongs to prompt_text, prompt_question, prompt_title

    # Iterate through documents and calculate TF-IDF scores
    for index, row in group.iterrows():
        doc_tfidf = tfidf_matrix[index - group.index[0]].toarray()[0]

        doc_tfidf = doc_tfidf[doc_tfidf>0]

        # Calculate the average TF-IDF score for the document
        average_tfidf_score = sum(doc_tfidf) / len(doc_tfidf)

        average_tfidf_scores[index] = average_tfidf_score

In [22]:
# Add the calculated average TF-IDF scores as a new column to the DataFrame
training_data['average_tfidf_score'] = [average_tfidf_scores[index] for index in training_data.index]
training_data.head()

Unnamed: 0,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,prompt_text_length,2grams-cooccurence-count,3grams-cooccurence-count,4grams-cooccurence-count,text_length,length_ratio,misspelled_counter,normalized_text_length,normalized_misspelled_counter,normalized_2grams-cooccurence-count,normalized_3grams-cooccurence-count,normalized_4grams-cooccurence-count,average_tfidf_score
3099,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,3329,57,23,11,1370,0.411535,32,0.32828,0.316832,0.143216,0.058376,0.028205,0.088198
3100,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,3329,6,5,4,157,0.047161,5,0.011239,0.049505,0.015075,0.01269,0.010256,0.239143
3101,3b9047,The Egyptian society is really different from ...,0.205683,0.380538,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,3329,8,0,0,453,0.136077,7,0.088604,0.069307,0.020101,0.0,0.0,0.167005
3102,3b9047,We have the gods and then Logan and If Logan ...,-1.547163,-1.461245,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,3329,1,0,0,131,0.039351,3,0.004443,0.029703,0.002513,0.0,0.0,0.257462
3103,3b9047,The social classes are different because they ...,-0.066112,-0.715083,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,3329,41,35,34,437,0.131271,6,0.084422,0.059406,0.103015,0.088832,0.087179,0.16287


In [23]:
training_data.corrwith(training_data["content"])

  training_data.corrwith(training_data["content"])


content                                1.000000
wording                                0.751380
prompt_text_length                    -0.038532
2grams-cooccurence-count               0.498155
3grams-cooccurence-count               0.363915
4grams-cooccurence-count               0.319285
text_length                            0.797244
length_ratio                           0.777115
misspelled_counter                     0.694396
normalized_text_length                 0.781679
normalized_misspelled_counter          0.725544
normalized_2grams-cooccurence-count    0.533596
normalized_3grams-cooccurence-count    0.385057
normalized_4grams-cooccurence-count    0.336164
average_tfidf_score                   -0.814088
dtype: float64

In [24]:
training_data.corrwith(training_data["wording"])

  training_data.corrwith(training_data["wording"])


content                                0.751380
wording                                1.000000
prompt_text_length                    -0.121241
2grams-cooccurence-count               0.133163
3grams-cooccurence-count               0.012620
4grams-cooccurence-count              -0.016339
text_length                            0.540138
length_ratio                           0.546272
misspelled_counter                     0.412939
normalized_text_length                 0.556444
normalized_misspelled_counter          0.492186
normalized_2grams-cooccurence-count    0.204767
normalized_3grams-cooccurence-count    0.052393
normalized_4grams-cooccurence-count    0.015281
average_tfidf_score                   -0.577712
dtype: float64

In [25]:
training_data.to_csv('dataset.csv', index=False)