In [None]:
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')


In [31]:
df = pd.read_csv("/kaggle/input/scientsummarize/main.csv")
df["text"] = df["text"].apply(lambda x: x.replace("summarize: ", ""))
df.head()

Unnamed: 0,text,summary
0,Human evaluations of machine translation (MT) ...,Bleu: A Method For Automatic Evaluation Of Mac...
1,A large number of current language processing ...,TnT - A Statistical Part-Of-Speech Tagger\nTri...
2,Current automatic summarizers usually rely on ...,Sentence Reduction For Automatic Text Summariz...
3,Even moderately long documents typically addre...,Advances In Domain Independent Linear Text Seg...
4,Word sense disambiguation is often cast as a p...,A Simple Approach To Building Ensembles Of Nai...


In [43]:
def corr(
    s: str,
    add_space_when_numerics=False,
    exceptions=["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."],
) -> str:
    """corrects spacing in a string
    Args:
        s (str): the string to correct
        add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
        exceptions (list, optional): [do not change these substrings]. Defaults to ['e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.'].
    Returns:
        str: the corrected string
    """
    if add_space_when_numerics:
        s = re.sub(r"(\d)\.(\d)", r"\1. \2", s)

    s = re.sub(r"\s+", " ", s)
    s = re.sub(r'\s([?.!"](?:\s|$))', r"\1", s)

    # fix space before apostrophe
    s = re.sub(r"\s\'", r"'", s)
    # fix space after apostrophe
    s = re.sub(r"'\s", r"'", s)
    # fix space before comma
    s = re.sub(r"\s,", r",", s)

    for e in exceptions:
        expected_sub = re.sub(r"\s", "", e)
        s = s.replace(expected_sub, e)

    return s

def fix_punct_spaces(string):
    """
    fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
    Parameters
    ----------
    string : str, required, input string to be corrected
    Returns
    -------
    str, corrected string
    """

    fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
    string = fix_spaces.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), string)
    string = string.replace(" ' ", "'")
    string = string.replace(' " ', '"')
    return string.strip()

def remove_stopwords(text, lang='english'):
    stop_words = set(stopwords.words(lang))
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return " ".join(filtered_text)

def get_pos(tag):
    if tag.startswith('R'):
        return wordnet.ADV
    elif tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    else:
        return wordnet.NOUN

class Lemmatization:
    def __init__(self):
        self.lemmatizer=WordNetLemmatizer()
    def __call__(self, doc):
        lemma_list=[]
        doc=''.join(doc)
        word_and_pos=nltk.pos_tag(doc.split())
        for word, pos in word_and_pos:
            _pos=get_pos(pos)
            lemma_list.append(self.lemmatizer.lemmatize(
                word, pos=_pos
            ))
        return " ".join(lemma_list)

In [45]:
# lemmatize the dataset
lemmatizer = Lemmatization()
df["text"] = df["text"].apply(lemmatizer)

# correct and fixes punctuation spaces in dataset
df["text"] = df["text"].apply(corr)
df["text"] = df["text"].apply(fix_punct_spaces)
df["text"] = df["text"].apply(remove_stopwords)

df.head()

Unnamed: 0,text,summary
0,Human evaluation machine translation ( MT ) we...,Bleu: A Method For Automatic Evaluation Of Mac...
1,A large number current language processing sys...,TnT - A Statistical Part-Of-Speech Tagger\nTri...
2,Current automatic summarizers usually rely sen...,Sentence Reduction For Automatic Text Summariz...
3,Even moderately long document typically addres...,Advances In Domain Independent Linear Text Seg...
4,Word sense disambiguation often cast problem s...,A Simple Approach To Building Ensembles Of Nai...


In [59]:
df.to_json("preprocessed.json", orient="records")