In [1]:
!pip install sentence-transformers
!pip install nltk

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB

In [2]:
import pandas as pd
import re
import nltk
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [4]:
def calculate_similarity(sentences, unique_sentences):
    similarities = []
    for sentence in sentences:
        if unique_sentences:
            embeddings = model.encode([sentence] + unique_sentences)
            similarity_scores = cosine_similarity([embeddings[0]], embeddings[1:])[0]
            max_similarity = max(similarity_scores)
            if max_similarity < 0.8:
                unique_sentences.append(sentence)
        else:
            unique_sentences.append(sentence)

    return unique_sentences

In [5]:
def is_potential_name(word):
    return word[0].isupper() and word[1:].islower()

In [6]:
def case_formatter(review):
    review = re.sub(r'\.{2,}', '.', review)
    sentences = nltk.sent_tokenize(review)
    formatted_sentences = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        formatted_words = []
        for i, word in enumerate(words):
            if i == 0 or not is_potential_name(word):
                word = word.lower()
            formatted_words.append(word)
        formatted_sentence = ' '.join(formatted_words)
        formatted_sentence = formatted_sentence.capitalize()
        formatted_sentences.append(formatted_sentence)
    formatted_review = '. '.join(formatted_sentences)
    formatted_review += '.'
    return formatted_review

In [7]:


def clean_reviews(review):
    review = re.sub(r'\.{2,}', '.', review)
    sentences = review.split('.')
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    unique_sentences = list(set(sentences))
    cleaned_review = '. '.join(unique_sentences)
    return cleaned_review + "."

In [8]:
def summary_finalizer(dataframe, batch_size):
    dataframe = dataframe.sort_values(by='Sequence')  # Sort dataframe by sequence number

    batch_strings_list = []
    unique_strings_list = []
    final_cleaned_strings_list = []

    for i in range(0, len(dataframe), batch_size):
        batch = dataframe[i:i+batch_size]
        batch_strings = list(batch['Summeries column'])  # Get the strings from the 'Summeries column' in sequence number order
        batch_strings_list.extend(batch_strings)

        unique_strings = calculate_similarity(batch_strings, unique_strings_list)  # Pass the batch strings through the similarity calculation
        unique_strings_list.extend(unique_strings)

    for string in unique_strings_list:
        formatted_string = case_formatter(string)
        cleaned_string = clean_reviews(formatted_string)
        final_cleaned_strings_list.append(cleaned_string)

    cleaned_dataframe = pd.DataFrame(columns=['Sequence',  'Summeries column', 'Cleaned summary'])

    for i, sequence in enumerate(dataframe['Sequence']):
        # review = dataframe.loc[dataframe['Sequence'] == sequence, 'review'].iloc[0]
        summer_or_review = dataframe.loc[dataframe['Sequence'] == sequence, 'Summeries column'].iloc[0]
        cleaned_summary = final_cleaned_strings_list[i]

        cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,
                                                      # 'review': review,
                                                      'Summeries column': summer_or_review,
                                                      'Cleaned summary': cleaned_summary},
                                                     ignore_index=True)
    return cleaned_dataframe


In [9]:
import pandas as pd

summaries = [
    "a tsxi. a tsxi......... The wine...... The staff could smile... The staff could smile. The pool.. The staff could smile... Room view was of generator..",
    ". Very small. Very noisy air conditioning. Pool. The room. Location. The room. Wi Fi connection poor. Pool. The room. Location. The room. Location. The room. Location. The room. Location. The room. Location. The room. Location. The room. Location. The room. Location. The room. Location. The room. Location. The room. Location. The room. Location. The room.",
    "....... No tea coffee coffee or water in room. No tea coffee or water in room. No tea coffee or water in room. No tea coffee or water in room. No tea coffee or water in room. No tea coffee or water in room. No tea coffee or water in room. No tea coffee or water in room. No coffee. No water in room. No tea or coffee in room.. No tea coffee or water in room. No tea coffee.",
    "Poor service from staff Poor service from staff Emails not returned. No water hous. Poor service from staff Emails not returned. Poor service from staff Emails not returned. No water hous. Poor service from staff Emails not returned. Poor service from staff Emails not returned. Poor service from staff Emails not returned. Not much space in the rooms but expected that in central london.",
    "The staff were awful. The pool wifi was a bit too expensive The pool wifi was a bit too expensive. The pool wifi was a bit too expensive. Breakfast was not good.......... Bathrooms a bit small... Small bathroom. Small room. Small bathroom. Small bathroom. Small bathroom... Appalling customer service... .",
    "The staff were TERRIBLE UNHELPFUL SLOW AND WITH A COULDN t CARE LESS ATITUDE PARKING OF THE LACK OF IT WAS HORRENDOUS. Rooms were small. No wi fi in room when states on ad that it was standard size of room and ridiculous prices in bar.",
    "The staff were TERRIBLE UNHELPFUL SLOW AND WITH A COULDN t CARE LESS ATITUDE PARKING OF THE LACK OF IT WAS HORRENDOUS. Rooms were small. No wi fi in room when states on ad that it was standard size of room and ridiculous prices in bar.",
    ".. Bathroom to small shower was a Trickle of water. Bathroom to small shower was a Trickle of water. Bathroom very small....... Bathroom is small and dated........... Bathroom was tiny. Not applicable.. Bathroom was tiny.. Location..... Bathroom is small."
]

df = pd.DataFrame({'Summeries column': summaries})
df['Sequence'] =  [3,2,7,4,5,6,1,8]


In [10]:
batch_size = 9  # Adjust the batch size as per your requirement
cleaned_df = summary_finalizer(df, batch_size)
display(cleaned_df)

  cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,
  cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,
  cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,
  cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,
  cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,
  cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,
  cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,
  cleaned_dataframe = cleaned_dataframe.append({'Sequence': sequence,


Unnamed: 0,Sequence,Summeries column,Cleaned summary
0,1,The staff were TERRIBLE UNHELPFUL SLOW AND WIT...,The staff were terrible unhelpful slow and wit...
1,2,. Very small. Very noisy air conditioning. Poo...,Very small. Location. Pool. The room. Wi fi co...
2,3,a tsxi. a tsxi......... The wine...... The sta...,Room view was of generator. The wine. The pool...
3,4,Poor service from staff Poor service from staf...,Poor service from staff poor service from staf...
4,5,The staff were awful. The pool wifi was a bit ...,Appalling customer service. The pool wifi was ...
5,6,The staff were TERRIBLE UNHELPFUL SLOW AND WIT...,No tea coffee or water in room. No water in ro...
6,7,....... No tea coffee coffee or water in room....,Location. Bathroom very small. Bathroom is sma...
7,8,.. Bathroom to small shower was a Trickle of w...,The staff were terrible unhelpful slow and wit...


In [11]:
display(cleaned_df)

Unnamed: 0,Sequence,Summeries column,Cleaned summary
0,1,The staff were TERRIBLE UNHELPFUL SLOW AND WIT...,The staff were terrible unhelpful slow and wit...
1,2,. Very small. Very noisy air conditioning. Poo...,Very small. Location. Pool. The room. Wi fi co...
2,3,a tsxi. a tsxi......... The wine...... The sta...,Room view was of generator. The wine. The pool...
3,4,Poor service from staff Poor service from staf...,Poor service from staff poor service from staf...
4,5,The staff were awful. The pool wifi was a bit ...,Appalling customer service. The pool wifi was ...
5,6,The staff were TERRIBLE UNHELPFUL SLOW AND WIT...,No tea coffee or water in room. No water in ro...
6,7,....... No tea coffee coffee or water in room....,Location. Bathroom very small. Bathroom is sma...
7,8,.. Bathroom to small shower was a Trickle of w...,The staff were terrible unhelpful slow and wit...
