## Importing necessary libraries

In [1]:
import re
import os
import time
import spacy
import string
import warnings
import itertools
import numpy as np
import contractions
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from IPython.display import display # just to display

In [2]:
start = time.time()

In [3]:
warnings.filterwarnings('ignore')

## Importing the DataFrame 

In [4]:
df = pd.read_csv("./news.csv")
df.head()

Unnamed: 0,title,content,published_at,source,topic
0,BTS: RM is reminded of Bon Voyage as he travel...,"After reaching his hotel in the city, RM revea...",2022-07-30T07:00:00Z,2,13
1,RM recalls wondering if he 'made right decisio...,RM aka Kim Namjoon was the first member to joi...,2022-12-22T15:57:55Z,2,13
2,BTS: J-Hope and RM go bonkers at Billie Eilish...,"Billie Eilish's concert was held in Seoul, Sou...",2022-08-16T07:00:00Z,1,7
3,"BTS: J-Hope proudly states he raised Jungkook,...",BTS ARMY y'all would be missing the members a ...,2022-12-18T13:08:40Z,1,7
4,BTS: Jin aka Kim Seokjin takes us through the ...,BTS member Kim Seokjin aka Jin has the capacit...,2022-11-21T08:00:00Z,1,8


## Analyzing the DataFrame 

In [5]:
df.isna().sum()

title           0
content         4
published_at    0
source          0
topic           0
dtype: int64

In [6]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [7]:
df.isna().sum()

title           0
content         0
published_at    0
source          0
topic           0
dtype: int64

In [8]:
df.drop(['title', 'published_at', 'source', 'topic'], axis=1, inplace=True)

In [9]:
df.head()

Unnamed: 0,content
0,"After reaching his hotel in the city, RM revea..."
1,RM aka Kim Namjoon was the first member to joi...
2,"Billie Eilish's concert was held in Seoul, Sou..."
3,BTS ARMY y'all would be missing the members a ...
4,BTS member Kim Seokjin aka Jin has the capacit...


## Performing the DataCleaning

In [10]:
small_model = spacy.load('en_core_web_sm')
medium_model = spacy.load('en_core_web_md')
large_model = spacy.load('en_core_web_lg')

In [11]:
class DataCleaning:
    def __init__(self) -> None:
        self.__stopwords = stopwords.words("english")
        self.__punctuations = string.punctuation.replace(".", "")
    
    
    def __remove_html_tags_entities(self, data: str) -> str:
        """
        to remove HTML tags, entities and links if any from the data
        :param data: string with HTML tags, entities and links
        :return: string without HTML tags, entities and links
        """
        data = BeautifulSoup(str(data), "lxml").get_text(strip=True)
        data = data.encode("ascii", "ignore").decode()
        return data.strip()
    
    
    def __remove_last_fullstop(self, data: str) -> str:
        """
        to remove the last fullstop if present, because in later stages 
        while splitting the data on the basis of fullstop, the last 
        fullstop leads to generation of empty string
        :param data: string which may have fullstop at end
        :return: string without fullstop at the end
        """
        data = self.__remove_html_tags_entities(data)
        if data[-1] == '.':
            data = data[:-1]
            return data
        else:
            return data
        
        
    def __remove_links(self, data: str) -> str: 
        """
        to remove any website links that might get scrapped with text
        :param data: string with or without links
        :return: string without links
        """
        data = self.__remove_last_fullstop(data)
        return re.sub(r"(http(s)?:\/\/.)+(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}(\.)*[ a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/=]*)", '', data)
    
    
    def __general_preprocessing(self, data: str) -> str:
        """
        to remove any \n, \t, ]r characters
        :param data: string with any irrelevant characters
        :return: string without any irrelevant characters
        """
        data = self.__remove_links(data)
        data = re.sub(r"(\n)+", ".", data)
        data = re.sub("[\t\r]", ' ', data)
        data = re.sub("(  )+", " ", data)
        return data
    
    
    def __retain_fullstops_within_quotes(self, data: str) -> str:
        """
        to retain the fullstops without the double quotes
        :param data: string with fullstops inside double quotes
        :return: string in which fullstops inside double quotes are replaced with ''
        """
        data = self.__general_preprocessing(data)
        return re.sub(r'"[^"]*"', lambda m: m.group(0).replace('.', '') + ".", data)

    
    
    def __remove_contractions(self, data: str) -> str:
        """
        to convert contractions to expanded words
        :param data: string with contractions, ex: "It's"
        :return: string without contractions, ex: "It's" -> "It is"
        """
        data = self.__retain_fullstops_within_quotes(data)
        return " ".join([contractions.fix(i) for i in data.split(" ")])
    
    
    def __remove_empty_sentences(self, data: str):
        """
        to remove any sentence that has length zero or it's just more than 1 consecutive fullstops
        :param data: string with or without consecutive fullstops
        :return: string without empty sentence
        """
        data = self.__remove_contractions(data)
        data = re.sub(" \.", ".", data)
        data = re.sub("\. ", ".", data)
        data = re.sub(" \. ", ".", data)
        data = re.sub("(\.)+", ".", data)
        return ".".join(sentence for sentence in data.split(".") if len(sentence) != 0)
   

    def __remove_punctuations(self, data: str) -> str:
        """
        to remove punctuations
        Note: Here, we are neither inserting nor removing any fullstop. Therefore no. of sentences 
        in clean data as well as structured data remains same after applying the pre-processing.
        :param data: string which has punctuations
        :return: string without punctuations
        """
        data = data.lower()
        return data.translate(data.maketrans('', '', self.__punctuations))
    
    
    def __remove_stopwords(self, data: str) -> str:
        """
        to remove stopwords from the sentence
        :param data: string with stopwords, e.g.: 'This is great'
        :return: string without stopwords, e.g.: 'This great'
        """
        data = self.__remove_punctuations(data)
        return " ".join([i for i in data.split(" ") if i not in self.__stopwords])
        
        
    def clean_data_nf(self, data: str) -> str:
        """
        to perform the appropriate preprocessing on the data
        :param data: raw data
        :return: clean data which can be used for further processing
        """
        data = self.__remove_empty_sentences(data)
        return data
    
    def structured_data_nf(self, data: str) -> str:
        """
        to remove the punctuations and stopwords from the cleaned data
        :param data: cleaned data
        :return: structured data
        """
        data = self.__remove_stopwords(data)
        return data
    
    
    def __sentence_formatting(self, data: list[str]) -> str:
        flag, sentences = 0, []
        for i in range(len(data)):
            if flag == 1:
                flag = 0
                continue
            sentence = data[i]
            total_words = len(sentence.split(" "))
            if total_words > 25 and sentence not in sentences:
                sentences.append(sentence)
            elif 0 < total_words <= 25 and sentence not in sentences:
                if i == 0:
                    sentences.append(data[i] + " " + data[i+1])
                elif i == len(data)-1:
                    sentences.append(data[i-1] + " " + data[i])
                elif len(data[i-1].split(" ")) <= len(data[i+1].split(" ")):
                    sentences.pop(-1)
                    sentences.append(data[i-1] + " " + data[i])
                elif len(data[i-1].split(" ")) > len(data[i+1].split(" ")):
                    sentences.append(data[i] + " " + data[i+1])
                    flag = 1
        return '.'.join(sentences)
    

    
    def __sentence_boundary_detection_small_model(self, data: str) -> str:
        doc = small_model(data)
        sentences = list(doc.sents)
        data = ""
        for i in sentences:
            data += str(i).strip() + "."
        data = data.split(".")
        sentences = self.__sentence_formatting(data)
        data = []
        for i in sentences.split("."):
            if len(i) != 0:
                data.append(i)
        sentences = ".".join(data)
        sentences = re.sub("(  )+", " ", sentences)
        savg = len(sentences.split(" "))/len(sentences.split("."))
        return sentences.strip(), savg
    
    def __sentence_boundary_detection_medium_model(self, data: str) -> str:
        doc = medium_model(data)
        sentences = list(doc.sents)
        data = ""
        for i in sentences:
            data += str(i).strip() + "."
        data = data.split(".")
        sentences = self.__sentence_formatting(data)
        data = []
        for i in sentences.split("."):
            if len(i) != 0:
                data.append(i)
        sentences = ".".join(data)
        sentences = re.sub("(  )+", " ", sentences)
        mavg = len(sentences.split(" "))/len(sentences.split("."))
        return sentences.strip(), mavg
    
    
    def __sentence_boundary_detection_large_model(self, data: str) -> str:
        doc = large_model(data)
        sentences = list(doc.sents)
        data = ""
        for i in sentences:
            data += str(i).strip() + "."
        data = data.split(".")
        sentences = self.__sentence_formatting(data)
        data = []
        for i in sentences.split("."):
            if len(i) != 0:
                data.append(i)
        sentences = ".".join(data)
        sentences = re.sub("(  )+", " ", sentences)
        lavg = len(sentences.split(" "))/len(sentences.split("."))
        return sentences.strip(), lavg
        
    
    def __use_appropriate_model(self, data: str) -> str:
        small, savg = self.__sentence_boundary_detection_small_model(data)
        # print(f"SMALL:\n{small}")
        medium, mavg = self.__sentence_boundary_detection_medium_model(data)
        # print(f"MEDIUM:\n{medium}")
        large, lavg = self.__sentence_boundary_detection_large_model(data)
        # print(f"LARGE:\n{large}")
        selection_criteia = [[savg, small], [mavg, medium], [lavg, large]]
        selection_criteia.sort()
        final_selected_sentence = self.__sentence_formatting(selection_criteia[0][1].split("."))
        # print(final_selected_sentence)
        return final_selected_sentence
    
    
    def sentence_boundary_detection(self) -> None:
        for no_of_sentence in range(1, 20):
            indices = df[df['no_of_sentence'] == no_of_sentence].index
            for i in indices:
                data = df['clean_data'][i]
                res = len(data.split("."))
                print(f"Index:{i} Before Cleaning: {res}")
                data = self.__use_appropriate_model(data)
                df['clean_data'][i] = data
                res, res1 = len(data.split(".")), len(df['clean_data'][i].split("."))
                print(f"After Cleaning {res} and {res1}")
                df['structured_data'][i] = self.structured_data_nf(df['clean_data'][i])
                print("-"*60)
            df['no_of_sentence'] = df['structured_data'].apply(lambda x: len(x.split(".")))
        
    def remove_unnecessary_fullstops(self) -> None:
        max_sentences_in_a_row = df.sort_values(by='no_of_sentence').max()[2]+1
        for no_of_sentence in range(max_sentences_in_a_row, 9, -1):
            indices = df[df['no_of_sentence'] == no_of_sentence].index
            for i in indices:
                data = df['clean_data'][i]
                res = len(data.split("."))
                print(f"Index:{i} Before Cleaning: {res}")
                data = self.__sentence_formatting(data.split("."))
                df['clean_data'][i] = data
                res, res1 = len(data.split(".")), len(df['clean_data'][i].split("."))
                print(f"After Cleaning {res} and {res1}")
                df['structured_data'][i] = self.structured_data_nf(df['clean_data'][i])
                print("-"*60)
            df['no_of_sentence'] = df['structured_data'].apply(lambda x: len(x.split(".")))
    
    
    def clean_data_f(self, data: str) -> str:
        return self.__sentence_formatting(data.split("."))

In [12]:
dc = DataCleaning()

In [13]:
clean_data = dc.clean_data_nf(df['content'][0])
clean_data.split(".")

['After reaching his hotel in the city, RM revealed that his stay would be for four days and added that he would step out for dinner',
 'As he sat at a roadside open-air restaurant, RM feasted on beer, burgers and fries',
 'He said, "I am starving right now I am out to grab some food It is much quieter than I expected and feels like a rural town I like the familiar atmosphere"',
 'RM attended Art Basel and explained on camera the details of the art fair',
 'He also gave a glimpse as he had noodles and beer which was followed by soup noodles and wrap',
 'Showing the pattern of a ping pong table, RM said, "The table looks like our (BTS) symbol"',
 'He also spoke about the art pieces as he viewed them',
 'After that, RM took a tram to visit the Foundation Beyeler, a museum',
 'He later took a walk through the city',
 'On his third day, RM visited the Kunstmuseum Basel, the Vitra Design Museum and the gallery',
 'As he walked around, RM showed a chair to his fans and said, "I have breaking

In [14]:
df['clean_data'] = df['content'].apply(dc.clean_data_nf)
df.head()

Unnamed: 0,content,clean_data
0,"After reaching his hotel in the city, RM revea...","After reaching his hotel in the city, RM revea..."
1,RM aka Kim Namjoon was the first member to joi...,RM aka Kim Namjoon was the first member to joi...
2,"Billie Eilish's concert was held in Seoul, Sou...","Billie Eilish's concert was held in Seoul, Sou..."
3,BTS ARMY y'all would be missing the members a ...,BTS ARMY you all would be missing the members ...
4,BTS member Kim Seokjin aka Jin has the capacit...,BTS member Kim Seokjin aka Jin has the capacit...


In [15]:
structured_data = dc.structured_data_nf(clean_data)
structured_data.split(".")

['reaching hotel city rm revealed stay would four days added would step dinner',
 'as sat roadside openair restaurant rm feasted beer burgers fries',
 'he said starving right grab food much quieter expected feels like rural town like familiar atmosphere',
 'rm attended art basel explained camera details art fair',
 'he also gave glimpse noodles beer followed soup noodles wrap',
 'showing pattern ping pong table rm said table looks like bts symbol',
 'he also spoke art pieces viewed them',
 'after rm took tram visit foundation beyeler museum',
 'he later took walk city',
 'on third day rm visited kunstmuseum basel vitra design museum gallery',
 'as walked around rm showed chair fans said breaking news guys coldplays chris martin made chair displayed vitra design museum see chris give call amazing',
 'rm next visited lucerne hiked mount rigi',
 'recalling previous visit lucerne rm added remember day crossing bridge buying souvenirs',
 'he also reminded bon voyage reality show featuring b

In [16]:
df['structured_data'] = df['clean_data'].apply(dc.structured_data_nf)
df.head()

Unnamed: 0,content,clean_data,structured_data
0,"After reaching his hotel in the city, RM revea...","After reaching his hotel in the city, RM revea...",reaching hotel city rm revealed stay would fou...
1,RM aka Kim Namjoon was the first member to joi...,RM aka Kim Namjoon was the first member to joi...,rm aka kim namjoon first member join bts.the g...
2,"Billie Eilish's concert was held in Seoul, Sou...","Billie Eilish's concert was held in Seoul, Sou...",billie eilishs concert held seoul south korea ...
3,BTS ARMY y'all would be missing the members a ...,BTS ARMY you all would be missing the members ...,bts army would missing members lot right well ...
4,BTS member Kim Seokjin aka Jin has the capacit...,BTS member Kim Seokjin aka Jin has the capacit...,bts member kim seokjin aka jin capacity create...


In [17]:
df.drop(['content'], axis=1, inplace=True)

In [18]:
df.head()

Unnamed: 0,clean_data,structured_data
0,"After reaching his hotel in the city, RM revea...",reaching hotel city rm revealed stay would fou...
1,RM aka Kim Namjoon was the first member to joi...,rm aka kim namjoon first member join bts.the g...
2,"Billie Eilish's concert was held in Seoul, Sou...",billie eilishs concert held seoul south korea ...
3,BTS ARMY you all would be missing the members ...,bts army would missing members lot right well ...
4,BTS member Kim Seokjin aka Jin has the capacit...,bts member kim seokjin aka jin capacity create...


In [19]:
df['no_of_sentence'] = df['structured_data'].apply(lambda x: len(x.split(".")))
df.head()

Unnamed: 0,clean_data,structured_data,no_of_sentence
0,"After reaching his hotel in the city, RM revea...",reaching hotel city rm revealed stay would fou...,19
1,RM aka Kim Namjoon was the first member to joi...,rm aka kim namjoon first member join bts.the g...,16
2,"Billie Eilish's concert was held in Seoul, Sou...",billie eilishs concert held seoul south korea ...,23
3,BTS ARMY you all would be missing the members ...,bts army would missing members lot right well ...,16
4,BTS member Kim Seokjin aka Jin has the capacit...,bts member kim seokjin aka jin capacity create...,17


In [20]:
df.sort_values(by='no_of_sentence')

Unnamed: 0,clean_data,structured_data,no_of_sentence
634,BTS member J-Hope will be the first singer fro...,bts member jhope first singer group officially...,1
595,SUGA breaks new ground on Spotify and extended...,suga breaks new ground spotify extended record...,1
424,The dominion of K-pops this masterpiece is one...,dominion kpops masterpiece one spread across s...,1
599,"BTS V or Kim Taehyung is, without a doubt, one...",bts v kim taehyung without doubt one handsome ...,1
602,With BTS announcing their upcoming anthology a...,bts announcing upcoming anthology album proof ...,1
...,...,...,...
431,Spotify has finally released Spotify Wrapped 2...,spotify finally released spotify wrapped 2022 ...,71
774,The rankings for the brand reputation of girl ...,rankings brand reputation girl group members b...,76
379,We are in 2023 and we already have some amazin...,2023 already amazing comebacks let us take loo...,85
529,One of the best things about today's generatio...,one best things todays generation vocal mental...,96


In [21]:
df['clean_data'][150].split(".")

['BTS members RM, Jin, Suga, J-Hope, Jimin, V and Jungkook featured in a new Bangtan Bomb video',
 "The clip, released on Sunday on Bangtan TV's YouTube channel, showed them during their lunch break at the Crosswalk Concert on The Late Late Show With James Corden",
 'As the video started, BTS members came to have their lunch after their performance on the show',
 'As Jungkook took a peek inside his packed lunch of chipotle, he asked, " How do you eat this?"',
 'Later, as he mixed his food with a sauce, he was seen asking a crew member, "What is this, chicotle?"',
 'As he asked, "chipotle?"',
 'J-Hope said, "I thought they said chocolate at first"',
 'Jungkook announced that "the sour cream is good"',
 'to which J-Hope asked, "Fight? Fight what?"',
 "He joked about the English words 'sour cream' and Korean word 'fight'",
 'Jungkook then said, "It is that sour sauce that tastes like yoghurt"',
 'RM not seen in the video, was heard asking, "Someone is fighting?"',
 'to which Jungkook repl

In [22]:
df[df['no_of_sentence'] == 1].index

Int64Index([364, 366, 367, 368, 370, 371, 372, 373, 374, 375, 386, 410, 411,
            417, 418, 422, 424, 427, 433, 444, 451, 452, 465, 469, 472, 482,
            486, 487, 502, 503, 508, 514, 516, 526, 527, 536, 541, 551, 555,
            556, 583, 584, 588, 593, 594, 595, 599, 602, 613, 616, 625, 629,
            632, 634, 642, 648, 651, 653, 655, 673, 687, 693, 696, 698, 701,
            712, 718, 729, 730, 739, 759, 760, 779, 780, 784, 786, 799, 802],
           dtype='int64')

In [23]:
%%time
dc.sentence_boundary_detection()

Index:364 Before Cleaning: 1
After Cleaning 9 and 9
------------------------------------------------------------
Index:366 Before Cleaning: 1
After Cleaning 4 and 4
------------------------------------------------------------
Index:367 Before Cleaning: 1
After Cleaning 10 and 10
------------------------------------------------------------
Index:368 Before Cleaning: 1
After Cleaning 3 and 3
------------------------------------------------------------
Index:370 Before Cleaning: 1
After Cleaning 6 and 6
------------------------------------------------------------
Index:371 Before Cleaning: 1
After Cleaning 4 and 4
------------------------------------------------------------
Index:372 Before Cleaning: 1
After Cleaning 5 and 5
------------------------------------------------------------
Index:373 Before Cleaning: 1
After Cleaning 5 and 5
------------------------------------------------------------
Index:374 Before Cleaning: 1
After Cleaning 4 and 4
------------------------------------------

In [24]:
df['clean_data'][367].split(".")

['Born Min Yoongi, SUGA debuted as a part of BTS in June 2013 SUGA went on to release his first solo mixtape in 2016, titled Agust D, which is also another one of his stage names A rapper, songwriter and record producer, SUGA is particularly loved for his songwriting and producing skills, and his hard-hitting flow Todays lovely letter has been penned by Virginia Schleich from the US to BTS SUGA Read the letter, below Dear Yoongi, Suga, Agust D,I do not know if this is a good idea or not but if there is any chance you will see this, here it goes Ill have to admit you were not my initial bias; it was RM',
 'But as I explored more and more of BTS videos, I of course was lead into Agust Ds world and I fell hard and fast down the rabbit hole Honestly, you are my favorite rapper I did more and more research on you as time went on and discovered how much alike we may be I Am from a small town, and did not grow up with much privilege or opportunity so reading about you and how you have worked 

In [25]:
print(df.sort_values(by='no_of_sentence').max()[2])

235


In [26]:
df['clean_data'][534].split(".")

["BTS has made history at the Billboard Music Awards The K-pop group had broken its own record of the highest number of nominations that any group or duo has received in any year, after it was nominated seven times in six categories at the 2022 BBMAs It broke another record after it scored three wins at the award ceremony held in Las Vegas on Sunday Also Read| BTS to skip Billboard Music Awards despite making history with nominations; ARMY is 'low-key happy' Here is whyBTS members RM, Jin, Jimin, V, Suga, J-Hope, and Jungkook, who skipped the Billboard Music Awards to focus on their upcoming album, won the awards for Top Duo/Group, Top Song Sales Artist, and Top Selling Song for Butter They have now become the music band with the most wins in the Billboard Music Awards' history, with a total of 12 awards, a record which was set by Destiny's Child with 11 wins, 17 years ago Beyonc, Kelly Rowland, and Michelle Williams were member of the former band In 2017, BTS became the first K-pop gr

In [27]:
%%time
dc.remove_unnecessary_fullstops()

Index:530 Before Cleaning: 235
After Cleaning 60 and 60
------------------------------------------------------------
Index:529 Before Cleaning: 96
After Cleaning 65 and 65
------------------------------------------------------------
Index:379 Before Cleaning: 85
After Cleaning 28 and 28
------------------------------------------------------------
Index:774 Before Cleaning: 76
After Cleaning 23 and 23
------------------------------------------------------------
Index:431 Before Cleaning: 71
After Cleaning 16 and 16
------------------------------------------------------------
Index:434 Before Cleaning: 66
After Cleaning 18 and 18
------------------------------------------------------------
Index:529 Before Cleaning: 65
After Cleaning 60 and 60
------------------------------------------------------------
Index:407 Before Cleaning: 60
After Cleaning 22 and 22
------------------------------------------------------------
Index:529 Before Cleaning: 60
After Cleaning 60 and 60
----------------

In [28]:
df['clean_data'][534].split(".")

["BTS has made history at the Billboard Music Awards The K-pop group had broken its own record of the highest number of nominations that any group or duo has received in any year, after it was nominated seven times in six categories at the 2022 BBMAs It broke another record after it scored three wins at the award ceremony held in Las Vegas on Sunday Also Read| BTS to skip Billboard Music Awards despite making history with nominations; ARMY is 'low-key happy' Here is whyBTS members RM, Jin, Jimin, V, Suga, J-Hope, and Jungkook, who skipped the Billboard Music Awards to focus on their upcoming album, won the awards for Top Duo/Group, Top Song Sales Artist, and Top Selling Song for Butter They have now become the music band with the most wins in the Billboard Music Awards' history, with a total of 12 awards, a record which was set by Destiny's Child with 11 wins, 17 years ago Beyonc, Kelly Rowland, and Michelle Williams were member of the former band In 2017, BTS became the first K-pop gr

In [29]:
df.sort_values(by='no_of_sentence')

Unnamed: 0,clean_data,structured_data,no_of_sentence
243,Kim Seokjin aka Jin is getting noticed at qui...,kim seokjin aka jin getting noticed quite mov...,2
634,BTS member J-Hope will be the first singer fro...,bts member jhope first singer group officially...,2
673,Just when we thought nothing can push us out o...,thought nothing push us warm cosy heated rooms...,2
69,"But, whatever the case might be, one can sure...",whatever case might one surely say art pieces...,3
568,It was recently that his brother revealed that...,recently brother revealed jin personally deliv...,3
...,...,...,...
401,"J-Hope of BTS, earlier released his debut solo...",jhope bts earlier released debut solo album ja...,21
415,BTS' V's record was published in the 2023 edit...,bts vs record published 2023 edition guinness ...,21
362,Run BTS is BTS fame variety show which used to...,run bts bts fame variety show used weekly catc...,25
399,"On December 30, BTS V celebrated his birthday ...",december 30 bts v celebrated birthday armys jo...,31


In [30]:
df['clean_data'][594].split(".")

["BTS' Jungkook recovered from Covid-19 just in time to arrive with the other members at the Grammys red carpet this Monday (IST) at MGM Grand Garden Arena in Las Vegas RM, Jin, Suga, J-Hope, Jimin, V and JK stepped out for the music's biggest night, looking sleek in dapper suits and proving why they are named leaders in great men's fashion The",
 "members coordinated their 'Dynamite' red carpet looks like always They chose Louis Vuitton suits in tan, white, brown, and slate blue colour palettes and embellished them with floral pins Each member looked dashing in their own way However, V went all out with a bouquet of giant flowers pinned on his tan suit We have only three words to describe them all - smooth like butter Jungkook and Jimin harmonised the colour palette of their red carpet ensembles in slate blue suits by Louis Vuitton JK wore a button-down ice blue shirt and teamed it with a cropped mandarin collar jacket and matching loose fitted pants He styled the outfit with black he

In [31]:
len(df['clean_data'][533].split("."))

8

In [32]:
len(df['structured_data'][533].split("."))

8

In [33]:
df.head()

Unnamed: 0,clean_data,structured_data,no_of_sentence
0,"After reaching his hotel in the city, RM revea...",reaching hotel city rm revealed stay would fou...,7
1,RM aka Kim Namjoon was the first member to joi...,rm aka kim namjoon first member join bts group...,9
2,"Billie Eilish's concert was held in Seoul, Sou...",billie eilishs concert held seoul south korea ...,10
3,BTS ARMY you all would be missing the members ...,bts army would missing members lot right well ...,8
4,BTS member Kim Seokjin aka Jin has the capacit...,bts member kim seokjin aka jin capacity create...,6


In [34]:
df.shape

(806, 3)

In [35]:
df.isna().sum()

clean_data         0
structured_data    0
no_of_sentence     0
dtype: int64

In [36]:
df.shape

(806, 3)

In [37]:
df.sort_values(by='no_of_sentence')

Unnamed: 0,clean_data,structured_data,no_of_sentence
243,Kim Seokjin aka Jin is getting noticed at qui...,kim seokjin aka jin getting noticed quite mov...,2
634,BTS member J-Hope will be the first singer fro...,bts member jhope first singer group officially...,2
673,Just when we thought nothing can push us out o...,thought nothing push us warm cosy heated rooms...,2
69,"But, whatever the case might be, one can sure...",whatever case might one surely say art pieces...,3
568,It was recently that his brother revealed that...,recently brother revealed jin personally deliv...,3
...,...,...,...
401,"J-Hope of BTS, earlier released his debut solo...",jhope bts earlier released debut solo album ja...,21
415,BTS' V's record was published in the 2023 edit...,bts vs record published 2023 edition guinness ...,21
362,Run BTS is BTS fame variety show which used to...,run bts bts fame variety show used weekly catc...,25
399,"On December 30, BTS V celebrated his birthday ...",december 30 bts v celebrated birthday armys jo...,31


In [38]:
df.sort_values(by='no_of_sentence', ascending=False)

Unnamed: 0,clean_data,structured_data,no_of_sentence
529,One of the best things about today's generatio...,one best things todays generation vocal mental...,60
399,"On December 30, BTS V celebrated his birthday ...",december 30 bts v celebrated birthday armys jo...,31
362,Run BTS is BTS fame variety show which used to...,run bts bts fame variety show used weekly catc...,25
415,BTS' V's record was published in the 2023 edit...,bts vs record published 2023 edition guinness ...,21
401,"J-Hope of BTS, earlier released his debut solo...",jhope bts earlier released debut solo album ja...,21
...,...,...,...
595,SUGA breaks new ground on Spotify and extended...,suga breaks new ground spotify extended record...,3
69,"But, whatever the case might be, one can sure...",whatever case might one surely say art pieces...,3
243,Kim Seokjin aka Jin is getting noticed at qui...,kim seokjin aka jin getting noticed quite mov...,2
634,BTS member J-Hope will be the first singer fro...,bts member jhope first singer group officially...,2


In [39]:
df['clean_data'][529]



In [40]:
df['clean_data'] = df['clean_data'].apply(dc.clean_data_f)

In [41]:
df['structured_data'] = df['clean_data'].apply(dc.structured_data_nf)

In [42]:
df.sort_values(by='no_of_sentence', ascending=False)

Unnamed: 0,clean_data,structured_data,no_of_sentence
529,One of the best things about today's generatio...,one best things todays generation vocal mental...,60
399,"On December 30, BTS V celebrated his birthday ...",december 30 bts v celebrated birthday armys jo...,31
362,Run BTS is BTS fame variety show which used to...,run bts bts fame variety show used weekly catc...,25
415,BTS' V's record was published in the 2023 edit...,bts vs record published 2023 edition guinness ...,21
401,"J-Hope of BTS, earlier released his debut solo...",jhope bts earlier released debut solo album ja...,21
...,...,...,...
595,SUGA breaks new ground on Spotify and extended...,suga breaks new ground spotify extended record...,3
69,"But, whatever the case might be, one can sure...",whatever case might one surely say art pieces...,3
243,Kim Seokjin aka Jin is getting noticed at qui...,kim seokjin aka jin getting noticed quite mov...,2
634,BTS member J-Hope will be the first singer fro...,bts member jhope first singer group officially...,2


In [43]:
df.drop(['no_of_sentence'], axis=1, inplace=True)

In [44]:
df.to_csv('./clean_and_structured_news.csv', index=False)

In [45]:
print(f"Total time taken in complete program execution: {(int)((time.time()-start)//60)} mins {(int)((time.time()-start)%60)} secs")

Total time taken in complete program execution: 4 mins 39 secs
