In [30]:
# One-time Colab environment setup (system audio libraries, Cython, NeMo).
# The commands are kept commented out; uncomment and run them on a fresh runtime.
# !apt-get update && apt-get install -y libsndfile1 ffmpeg
# !pip install Cython
# !pip install nemo_toolkit['all']

In [31]:
from google.colab import files
import pandas as pd
import random, re
# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nemo.collections.nlp.models import PunctuationCapitalizationModel
# NLTK resources used below: 'punkt' for sent_tokenize / word_tokenize,
# 'wordnet' for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
files.upload()
!mkdir /root/.kaggle/
!mv /content/kaggle.json /root/.kaggle/kaggle.json
!kaggle datasets download -d thedevastator/nlp-mental-health-conversations
!rm /root/.kaggle/kaggle.json
!unzip /content/nlp-mental-health-conversations.zip
!rm /content/nlp-mental-health-conversations.zip

Saving kaggle.json to kaggle.json
mkdir: cannot create directory ‘/root/.kaggle/’: File exists
Downloading nlp-mental-health-conversations.zip to /content
 68% 1.00M/1.48M [00:00<00:00, 1.43MB/s]
100% 1.48M/1.48M [00:00<00:00, 1.95MB/s]
Archive:  /content/nlp-mental-health-conversations.zip
replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv               


## Load dataset

In [20]:
DATA_PATH = "/content/train.csv"
N_SAMPLES = 10  # number of leading rows to evaluate; set to None to use the whole file

# Read only the "Response" column and only the first N_SAMPLES rows.
all_sents = pd.read_csv(DATA_PATH, usecols=["Response"], nrows=N_SAMPLES)["Response"].tolist()

In [21]:
# Punctuation marks that clean_text pads with spaces and remove_punc keeps.
# NOTE(review): "," and "!" are not listed, so remove_punc drops them - confirm this is intended.
PUNCTUATIONS = [
    ".", "?", "'", '"', "-",
    "[", "]", "{", "}", "(", ")",
    "...", ":", ";",
]
print("Number of punctuations:", len(PUNCTUATIONS))

Number of punctuations: 14


## Cleaning and pre-processing

In [22]:
def remove_punc(sent_in):
  """
    Drop every character that is neither alphanumeric nor explicitly allowed.

    Allowed non-alphanumeric characters are the entries of PUNCTUATIONS plus
    "<", ">" (so placeholders such as <NUM> survive) and " ". The check is done
    one character at a time, so multi-character entries of PUNCTUATIONS
    (e.g. "...") have no effect of their own here.

    sent_in: string to filter.
    Returns the kept characters, in their original order, as a new string.
  """
  # Build the allow-list once per call instead of once per character.
  allowed = set(PUNCTUATIONS) | {"<", ">", " "}
  return "".join(ch for ch in sent_in if ch.isalnum() or ch in allowed)
def clean_text(all_sent):
  """
    Clean the given texts and split them into sentences.

    Steps, applied in this order:
    - drop entries that are not strings (e.g. NaN)
    - replace non-breaking spaces, newlines, carriage returns and tabs with " "
    - lowercase
    - split each text into sentences
    - <NUM> for runs of digits
    - <PHNUM> for phone numbers [NUM-NUM-NUM-NUM], [NUM-NUM-NUM]
    - <DATE> for dates [NUM/NUM/NUM]
    - <URL> for urls
    - collapse runs of four or more periods into an ellipsis ("...")
    - ensure every entry of PUNCTUATIONS has a space before and after it
    - replace multiple spaces with a single space
    - remove all other special characters (see remove_punc)

    all_sent: iterable of raw texts; non-string items are skipped.
    Returns a flat list of cleaned sentence strings.
  """
  # Match longer marks first so "..." stays one token. Replacing "." before "..."
  # would turn every ellipsis into three separate periods.
  marks = sorted((p for p in PUNCTUATIONS if p), key=len, reverse=True)
  punct_re = re.compile("|".join(re.escape(p) for p in marks)) if marks else None

  fin = []
  for raw in all_sent:
    if not isinstance(raw, str):  # remove nan
      continue
    flat = raw.replace('\xa0', ' ').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').lower()
    for sent in nltk.tokenize.sent_tokenize(flat):
      sent = re.sub(r'\d+', "<NUM>", sent)  # Numbers
      # Ph. num (four-group form first, otherwise the three-group form would eat part of it)
      sent = sent.replace("<NUM>-<NUM>-<NUM>-<NUM>", "<PHNUM>")
      sent = sent.replace("<NUM>-<NUM>-<NUM>", "<PHNUM>")
      # Dates
      sent = sent.replace("<NUM>/<NUM>/<NUM>", "<DATE>")
      sent = re.sub(r'https?://\S+|www\.\S+', "<URL>", sent)  # URLs
      sent = re.sub(r'\.{4,}', '...', sent)  # Multiple periods
      # space around punctuations
      if punct_re is not None:
        sent = punct_re.sub(lambda m: " " + m.group(0) + " ", sent)
      sent = re.sub(r' +', " ", sent)  # Remove multiple Spaces
      # remove other special characters
      fin.append(remove_punc(sent))
  return fin

In [23]:
def lemmatize_text(all_sent):
    """
    Lemmatize every space-separated token of each sentence.

    all_sent: iterable of sentence strings.
    Returns a list with one list of lemmas per sentence. Empty lemmas are
    dropped, and sentences left with no lemmas are omitted entirely.
    """
    wnl = WordNetLemmatizer()
    lemmatized = []
    for sentence in all_sent:
        lemmas = [wnl.lemmatize(word) for word in sentence.split(" ")]
        # Splitting on " " yields empty strings for repeated/leading/trailing spaces.
        lemmas = [lemma for lemma in lemmas if lemma]
        if lemmas:
            lemmatized.append(lemmas)
    return lemmatized

In [24]:
def remove_all_punc(s_in):
  """
    Keep only alphabetic characters and plain spaces from s_in.

    Digits, punctuation, angle brackets and whitespace other than " " are
    all dropped; the remaining characters keep their original order.
  """
  kept = [ch for ch in s_in if ch == " " or ch.isalpha()]
  return "".join(kept)

In [25]:
# Clean -> lemmatize -> re-join the tokens into the reference sentences.
# Note: this rebinds all_sents, so re-running this cell alone processes already-cleaned text.
all_sents = [" ".join(tokens) for tokens in lemmatize_text(clean_text(all_sents))]
# Model input: the same sentences with everything but letters and spaces stripped.
wo_punc = [remove_all_punc(sentence) for sentence in all_sents]

## Restoration

In [26]:
# Fetch the pre-trained "punctuation_en_bert" checkpoint.
model = PunctuationCapitalizationModel.from_pretrained("punctuation_en_bert")

# Let the model re-insert punctuation, then lowercase its output
# (the reference sentences were lowercased during cleaning).
restored_punc = [sentence.lower() for sentence in model.add_punctuation_capitalization(wo_punc)]

100%|██████████| 2/2 [00:19<00:00,  9.96s/batch]


## Score

In [27]:
def calculate_jaccard_similarity(original_sentence, restored_sentence):
    """
    Jaccard similarity between the token sets of two sentences.

    Both sentences are tokenized with word_tokenize and reduced to sets, so
    token order and repetition are ignored.

    Returns |A & B| / |A | B|, or 0 when neither sentence yields any token.
    """
    vocab_original = set(word_tokenize(original_sentence))
    vocab_restored = set(word_tokenize(restored_sentence))

    combined = vocab_original | vocab_restored
    if not combined:
        return 0
    return len(vocab_original & vocab_restored) / len(combined)

In [28]:
# One restored sentence is expected per reference sentence; zip() would otherwise
# truncate silently while the mean was still divided by len(all_sents).
assert len(all_sents) == len(restored_punc), "reference and restored sentence counts differ"
pair_scores = [calculate_jaccard_similarity(orig, restored) for orig, restored in zip(all_sents, restored_punc)]
# Mean Jaccard similarity over all sentence pairs (0.0 when there is nothing to score).
fin_score = sum(pair_scores) / len(pair_scores) if pair_scores else 0.0

In [29]:
# Report fin_score, the mean per-sentence Jaccard similarity for the pretrained NeMo model.
print("Final score for pretrained-Nemo:", fin_score)

Final score for pretrained-Nemo: 0.9537062875409791
