### Importing necessary libraries 

In [1]:
import time
import warnings
import itertools
import numpy as np
import pandas as pd
from IPython.display import display 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Wall-clock timestamp taken before any work is done; the last cell of the
# notebook subtracts it from the current time to report the total run time.
start = time.time()

In [3]:
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by
# numpy / pandas / scikit-learn calls below are hidden as well — consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

### Importing the DataFrame 

In [4]:
# Load the news articles from a CSV in the notebook's working directory.
# The cells below rely on two text columns: 'clean_data' and 'structured_data'.
df = pd.read_csv("./clean_and_structured_news.csv")
# Preview the first rows to confirm the columns were read as expected.
df.head()

Unnamed: 0,clean_data,structured_data
0,"After reaching his hotel in the city, RM revea...",reaching hotel city rm revealed stay would fou...
1,RM aka Kim Namjoon was the first member to joi...,rm aka kim namjoon first member join bts group...
2,"Billie Eilish's concert was held in Seoul, Sou...",billie eilishs concert held seoul south korea ...
3,BTS ARMY you all would be missing the members ...,bts army would missing members lot right well ...
4,BTS member Kim Seokjin aka Jin has the capacit...,bts member kim seokjin aka jin capacity create...


### Identifying duplicate sentences

In [5]:
class RemoveDuplicates:
    """
    Scores every sentence of a DataFrame with a TF-IDF based value and flags the
    lowest scoring sentences as duplicates.

    The DataFrame must provide the text columns 'clean_data' and 'structured_data';
    both are split into sentences on '.'.  The class keeps a reference to the
    DataFrame it was given (no copy), and the later steps read columns that the
    caller is expected to add to that same DataFrame:

    1. ``train()``                          -> fitted vectorizer
    2. ``embeddings_for_sentence(vec)``     -> store as df['sentence_embeddings']
    3. ``duplicate_sentence_indices()``     -> store as df['duplicate_sentence_indices']
    4. ``duplicate_sentences()``            -> flagged sentences joined per row

    Rows are addressed by position (``.iloc``), so the DataFrame does not need a
    default 0..n-1 index.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        """
        :param df: DataFrame holding the 'clean_data' and 'structured_data' columns
        """
        self.__df = df
        # word n-grams of length 2 to 5, English stop words removed, vocabulary capped at 5000
        self.__vectorizer = TfidfVectorizer(analyzer='word',
                                            ngram_range=(2, 5),
                                            stop_words='english',
                                            max_features=5000)

    @staticmethod
    def __midpoint_percentile(values, q: float) -> float:
        """
        this function computes the q-th percentile using 'midpoint' interpolation
        :param values: sequence of numbers
        :param q: percentile to compute, in the range 0-100
        :return: the q-th percentile of values
        """
        try:
            # 'method' replaces the deprecated 'interpolation' keyword of np.percentile
            return np.percentile(values, q, method='midpoint')
        except TypeError:
            # older NumPy releases only know the 'interpolation' keyword
            return np.percentile(values, q, interpolation='midpoint')

    def __combine_text(self) -> list[str]:
        """
        this function takes 'structured_data' column and converts it into list of string
        :return: list of string of sentences
        """
        # joining with '.' and splitting on '.' yields one flat list of sentences for all rows
        return '.'.join(self.__df['structured_data'].to_list()).split(".")

    def train(self) -> "TfidfVectorizer":
        """
        this function fits the tfidf vectorizer on the sentences under 'structured_data' column
        :return: the fitted vectorizer
        """
        self.__corpus = self.__combine_text()  # corpus: collection of texts
        # only the fitted state is needed here, so fit() replaces the discarded fit_transform()
        self.__vectorizer.fit(self.__corpus)
        return self.__vectorizer

    def __get_sentence_embeddings(self, vectorizer, row_no: int, no_of_sentences: int) -> list:
        """
        this function scores every sentence of one DataFrame row
        :param vectorizer: fitted vectorizer used to transform the sentences; it replaces the stored one
        :param row_no: position of the DataFrame row on which processing is to be done
        :param no_of_sentences: number of sentences of that row to score
        :return: list of [score, sentence offset, row_no, clean sentence], sorted ascending
        """
        self.__vectorizer = vectorizer
        structured_data = self.__df['structured_data'].iloc[row_no].split(".")
        # 'clean_data' must split into at least no_of_sentences parts, otherwise indexing below fails
        cleaned_data = self.__df['clean_data'].iloc[row_no].split(".")

        # transform the row once instead of once per sentence
        tfidf_matrix = self.__vectorizer.transform(structured_data)
        # score of a sentence = mean of exp(sqrt(tfidf)) over the whole vocabulary
        scored = [
            [np.exp(np.sqrt(tfidf_matrix[i].toarray())).mean(), i, row_no, cleaned_data[i]]
            for i in range(no_of_sentences)
        ]
        scored.sort()
        return scored

    def embeddings_for_sentence(self, vectorizer) -> list:
        """
        this function scores the sentences of every row of the DataFrame
        :param vectorizer: fitted vectorizer, e.g. the one returned by train()
        :return: one list per row, each holding [score, sentence offset, row_no, clean sentence] entries
        """
        rows = []
        for row_no in range(self.__df.shape[0]):
            no_of_sentences = len(self.__df['structured_data'].iloc[row_no].split("."))
            rows.append(self.__get_sentence_embeddings(vectorizer, row_no, no_of_sentences))
        return rows

    def __compute_threshold(self) -> float:
        """
        this function computes the threshold as the first quartile (25th percentile,
        midpoint interpolation) of all sentence scores under 'sentence_embeddings' column
        :return: returns the calculated threshold (NaN when there are no scores)
        :raises KeyError: if the 'sentence_embeddings' column is not present
        """
        if "sentence_embeddings" not in self.__df.columns:
            raise KeyError("'sentence_embeddings' column not found; store the result of "
                           "embeddings_for_sentence() under that name first")
        all_entries = itertools.chain.from_iterable(self.__df['sentence_embeddings'].tolist())
        scores = [entry[0] for entry in all_entries]
        if not scores:
            # every comparison against NaN is False, so nothing gets flagged
            return float('nan')
        return self.__midpoint_percentile(scores, 25)

    def duplicate_sentence_indices(self) -> list:
        """
        this function gives the entries of the duplicate sentences
        :return: one list per row with the [score, sentence offset, row_no, clean sentence]
                 entries whose score lies strictly between 0 and the threshold
        :raises KeyError: if the 'sentence_embeddings' column is not present
        """
        threshold = self.__compute_threshold()
        return [
            [[score, offset, row_no, sentence]
             for [score, offset, row_no, sentence] in row
             if 0 < score < threshold]
            for row in self.__df['sentence_embeddings']
        ]

    def duplicate_sentences(self) -> list:
        """
        this function returns list of duplicate sentences
        :return: one string per row: the flagged sentences of that row joined with '.'
        """
        joined_rows = []
        for row_no in range(self.__df.shape[0]):
            cleaned_data = self.__df['clean_data'].iloc[row_no].split(".")
            flagged = self.__df['duplicate_sentence_indices'].iloc[row_no]
            # the sentence text is looked up again through its offset within the row
            joined_rows.append('.'.join(cleaned_data[offset] for [_, offset, _, _] in flagged))
        return joined_rows

In [6]:
# Wrap the loaded DataFrame; the object keeps a reference to `df` (not a copy),
# so columns added to `df` in later cells are visible to its methods.
rd = RemoveDuplicates(df)

In [7]:
%%time
# Fit the TF-IDF vectorizer on the sentences of 'structured_data' and keep the fitted object.
vectorizer = rd.train()

CPU times: total: 1.25 s
Wall time: 1.49 s


In [8]:
%%time
# Score every sentence of every row; each cell of the new column holds a sorted list of
# [score, sentence offset, row number, clean sentence] entries. This adds a column to `df` in place.
df['sentence_embeddings'] = rd.embeddings_for_sentence(vectorizer)

CPU times: total: 4.41 s
Wall time: 8.76 s


In [9]:
%%time
# Keep, per row, only the entries whose score is below the threshold computed from
# 'sentence_embeddings' (that column must already exist on `df`).
df['duplicate_sentence_indices'] = rd.duplicate_sentence_indices()

CPU times: total: 0 ns
Wall time: 14 ms


In [10]:
%%time
# Join the flagged sentences of each row with '.' into a single string
# (reads the 'duplicate_sentence_indices' column added in the previous step).
df['duplicate_sentences'] = rd.duplicate_sentences()

CPU times: total: 0 ns
Wall time: 10 ms


In [11]:
# Preview the frame with the three columns added above.
df.head()

Unnamed: 0,clean_data,structured_data,sentence_embeddings,duplicate_sentence_indices,duplicate_sentences
0,"After reaching his hotel in the city, RM revea...",reaching hotel city rm revealed stay would fou...,"[[1.0, 0, 0, After reaching his hotel in the c...","[[1.0, 0, 0, After reaching his hotel in the c...","After reaching his hotel in the city, RM revea..."
1,RM aka Kim Namjoon was the first member to joi...,rm aka kim namjoon first member join bts group...,"[[1.0, 4, 1, In my journey with BTS, I drifted...","[[1.0, 4, 1, In my journey with BTS, I drifted...","In my journey with BTS, I drifted further and ..."
2,"Billie Eilish's concert was held in Seoul, Sou...",billie eilishs concert held seoul south korea ...,"[[1.0005180427568212, 7, 2, RM captioned the p...","[[1.0005180427568212, 7, 2, RM captioned the p...","RM captioned the post saying, ""bad guys Well, ..."
3,BTS ARMY you all would be missing the members ...,bts army would missing members lot right well ...,"[[1.0003436563656918, 3, 3, He performed on h...","[[1.0003436563656918, 3, 3, He performed on h...",He performed on his latest solo album and als...
4,BTS member Kim Seokjin aka Jin has the capacit...,bts member kim seokjin aka jin capacity create...,"[[1.000518070901389, 3, 4, He will soon turn ...","[[1.000518070901389, 3, 4, He will soon turn ...",He will soon turn 31 Jungkook is also gettin...


In [12]:
def compute_threshold(series: pd.Series, percentiles=range(1, 26)) -> list:
    """
    For each candidate percentile, list the sentences whose score falls below it.

    Despite its name, this function does not return a single threshold; it returns
    one result per candidate percentile so the caller can compare them.

    :param series: Series whose cells are lists of
                   [score, sentence offset, row number, sentence] entries
    :param percentiles: percentiles (0-100) to evaluate; defaults to 1..25
    :return: one ``[entries_below_threshold, count, percentile]`` list per percentile,
             in the order given; ``entries_below_threshold`` is sorted ascending
    """
    def midpoint_percentile(values, q):
        try:
            # 'method' replaces the deprecated 'interpolation' keyword of np.percentile
            return np.percentile(values, q, method='midpoint')
        except TypeError:
            # older NumPy releases only know the 'interpolation' keyword
            return np.percentile(values, q, interpolation='midpoint')

    # Sort once up front: filtering a sorted list keeps it sorted, so there is
    # no need to re-sort after every append.
    entries = sorted(list(entry) for entry in itertools.chain.from_iterable(series.tolist()))
    scores = [entry[0] for entry in entries]

    if not scores:
        # nothing to rank: no sentence can fall below any threshold
        return [[[], 0, m] for m in percentiles]

    res = []
    for m in percentiles:
        thres = midpoint_percentile(scores, m)
        removed_sentences_indices = [[i, j, k, l] for (i, j, k, l) in entries if i < thres]
        res.append([removed_sentences_indices, len(removed_sentences_indices), m])
    return res

In [13]:
# Report, for every candidate percentile, how many sentences fall below it.
to_find_avg = []
res = compute_threshold(df['sentence_embeddings'])
# Each entry of `res` is [entries, count, percentile]; reading the count and the label from
# the same entry keeps the collected counts and the printed lines aligned and includes
# the last percentile as well.
for _entries, count, pct in res:
    to_find_avg.append(count)
    print(f"When {pct}th percentile is set as threshold, no. of duplicate sentences found are = {count}")

When 1th percentile is set as threshold, no. of duplicate sentences found are = 0
When 2th percentile is set as threshold, no. of duplicate sentences found are = 0
When 3th percentile is set as threshold, no. of duplicate sentences found are = 0
When 4th percentile is set as threshold, no. of duplicate sentences found are = 0
When 5th percentile is set as threshold, no. of duplicate sentences found are = 0
When 6th percentile is set as threshold, no. of duplicate sentences found are = 0
When 7th percentile is set as threshold, no. of duplicate sentences found are = 0
When 8th percentile is set as threshold, no. of duplicate sentences found are = 0
When 9th percentile is set as threshold, no. of duplicate sentences found are = 560
When 10th percentile is set as threshold, no. of duplicate sentences found are = 560
When 11th percentile is set as threshold, no. of duplicate sentences found are = 560
When 12th percentile is set as threshold, no. of duplicate sentences found are = 560
When 

In [14]:
def create_dataframe(data: list[str], tdidf: list[float], row: list[int], offset: list[int]) -> pd.DataFrame:
    """
    Assemble the selected sentences into a DataFrame, one sentence per row.

    :param data: sentence texts -> column 'data'
    :param tdidf: score of each sentence -> column 'tfidf_value'
                  (the parameter name keeps its original spelling so keyword callers keep working)
    :param row: row number each sentence came from -> column 'row_no'
    :param offset: position of the sentence within its row -> column 'sentence_offset'
    :return: DataFrame with the four columns above, in that order
    """
    # Use the `tdidf` argument itself; the body previously read a name `tfidf`
    # that is not a parameter and only resolved if a global of that name existed.
    return pd.DataFrame({
        'data': data,
        'tfidf_value': tdidf,
        'row_no': row,
        'sentence_offset': offset,
    })

In [15]:
# Ask which candidate percentile to inspect, then export the sentences found below it.
percentile = int(input("Enter percentile: "))
data, tfidf, row, offset = [], [], [], []
# Each entry of `res` is [entries, count, percentile]; only entries of the chosen percentile are kept.
for entries, _count, pct in res:
    if pct != percentile:
        continue
    # every entry is laid out as [score, sentence offset, row number, sentence]
    for entry in entries:
        data.append(entry[3])
        tfidf.append(entry[0])
        row.append(entry[2])
        offset.append(entry[1])
result = create_dataframe(data, tfidf, row, offset)
# Persist the selection next to the notebook, then show it.
result.to_csv('./using_exp_sqrt_mean.csv', index=False)
display(result)

Enter percentile: 24


Unnamed: 0,data,tfidf_value,row_no,sentence_offset
0,"After reaching his hotel in the city, RM revea...",1.00000,0,0
1,"In the pictures, several new designs and colou...",1.00000,19,0
2,While the craze for K-Pop and K-Dramas apparen...,1.00000,23,0
3,"After Jimin met J-Hope, he spoke to the camera...",1.00000,49,0
4,"Charlie replied, ""I heard that too, and everyb...",1.00000,64,0
...,...,...,...,...
1584,BTS member Jeon Jungkook will be turning a yea...,1.00052,440,0
1585,waves internationally with his fervent music a...,1.00052,721,10
1586,He also called the group 'the most popular and...,1.00052,709,1
1587,"Narcissistic, my god, I love it, I HAVE has ha...",1.00052,646,4


In [16]:
# Spot-check a single score: the 'tfidf_value' of the row labelled 1.
result['tfidf_value'][1]

1.0

In [17]:
# Show the selection ordered by sentence position within its row, highest offset first.
# The sorted frame is only displayed; `result` itself is left unchanged.
result.sort_values(by='sentence_offset', ascending=False)

Unnamed: 0,data,tfidf_value,row_no,sentence_offset
559,"Cha In-ha Cha In-ha, aged 27, was a young newb...",1.000000,529,52
1445,Not only did he reveal his struggle with menta...,1.000344,529,49
1444,"Thankfully, his fans quickly sent their suppor...",1.000344,529,44
558,"""I was more apprehensive about others and did ...",1.000000,529,41
557,"When we are promoting, we do not have a break ...",1.000000,529,40
...,...,...,...,...
573,She posted the photos of her soaking up the su...,1.000344,123,0
574,He was recently featured on the comedy chat sh...,1.000344,135,0
575,Jungkook revealed that if he could speak in En...,1.000344,137,0
576,"The singer, while walking, smiled and made hea...",1.000344,138,0


In [18]:
# Print up to the first 100 selected sentences, numbered from 1.
# head(100) stops at the end of the frame, so a selection with fewer than
# 100 rows no longer fails on a missing label.
for position, sentence in enumerate(result['data'].head(100), start=1):
    print(f"{position}. {sentence}")

1. After reaching his hotel in the city, RM revealed that his stay would be for four days and added that he would step out for dinner
2. In the pictures, several new designs and colours were also seen added to Jungkook's existing tattoos  The artist also added his 'nickname' on Jungkook's arm 
3. While the craze for K-Pop and K-Dramas apparently originated from the Northeast, the BTS wave is spreading all over the nation Some months back, we told you how Jungkook's pic was used to advertise an English-speaking class in Sonepat, Haryana, here is a new incident
4. After Jimin met J-Hope, he spoke to the camera in the US, "I bet he is super nervous I think I will be nervous too He will do fine J-Hope is the most professional one out of us He will do great I have no doubts"
5. Charlie replied, "I heard that too, and everybody in my camp does not know the date it comes out, we legitimately have no idea the day it comes out, we do, but we just figured it out"
6. On the show, when RM was aske

In [19]:
# Sample the clock once so that minutes and seconds describe the same instant.
elapsed = time.time() - start
minutes, seconds = divmod(int(elapsed), 60)
print(f"Total time taken in complete program execution: {minutes} mins {seconds} secs")

Total time taken in complete program execution: 0 mins 26 secs
