In [312]:
import numpy as np
import pandas as pd
import nltk
import string

In [313]:
data =pd.read_csv("/content/sample_data/news.csv")

In [314]:
df=pd.DataFrame(data["content"]).rename(columns={"content":"original content"})
df.dropna(inplace=True)
df

Unnamed: 0,original content
0,"After reaching his hotel in the city, RM revea..."
1,RM aka Kim Namjoon was the first member to joi...
2,"Billie Eilish's concert was held in Seoul, Sou..."
3,BTS ARMY y'all would be missing the members a ...
4,BTS member Kim Seokjin aka Jin has the capacit...
...,...
805,BTS has conquered the world with their group r...
806,Today marks 700 days since BTS' worldwide hand...
807,BTS' youngest member Jungkook came online on W...
808,BTS' eldest member Jin has shared pictures and...


# **Step1: convert to lower case**

In [315]:
df["lowercase"]=df["original content"].apply(lambda row : " ".join(word.lower() for word in row.split()))
df["lowercase"]

0      after reaching his hotel in the city, rm revea...
1      rm aka kim namjoon was the first member to joi...
2      billie eilish's concert was held in seoul, sou...
3      bts army y'all would be missing the members a ...
4      bts member kim seokjin aka jin has the capacit...
                             ...                        
805    bts has conquered the world with their group r...
806    today marks 700 days since bts' worldwide hand...
807    bts' youngest member jungkook came online on w...
808    bts' eldest member jin has shared pictures and...
809    after a lot of teasing, benny blanco’s collabo...
Name: lowercase, Length: 806, dtype: object

In [316]:
df["lowercase"][0]

'after reaching his hotel in the city, rm revealed that his stay would be for four days and added that he would step out for dinner. as he sat at a roadside open-air restaurant, rm feasted on beer, burgers and fries. he said, "i\'m starving right now. i\'m out to grab some food. it\'s much quieter than i expected and feels like a rural town. i like the familiar atmosphere." rm attended art basel and explained on camera the details of the art fair. he also gave a glimpse as he had noodles and beer which was followed by soup noodles and wrap. showing the pattern of a ping pong table, rm said, "the table looks like our (bts) symbol." he also spoke about the art pieces as he viewed them. after that, rm took a tram to visit the foundation beyeler, a museum. he later took a walk through the city. on his third day, rm visited the kunstmuseum basel, the vitra design museum and the gallery. as he walked around, rm showed a chair to his fans and said, "i have breaking news for you guys. coldplay

# **Removing Stopwords**

In [317]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [318]:
from nltk.corpus import stopwords
lsw=stopwords.words("english")

In [319]:
df["stopwords"]=df["lowercase"].apply(lambda x: " ".join(word for word in x.split() if word not in lsw ))
df["stopwords"]

0      reaching hotel city, rm revealed stay would fo...
1      rm aka kim namjoon first member join bts. grou...
2      billie eilish's concert held seoul, south kore...
3      bts army y'all would missing members lot, righ...
4      bts member kim seokjin aka jin capacity create...
                             ...                        
805    bts conquered world group releases ever since ...
806    today marks 700 days since bts' worldwide hand...
807    bts' youngest member jungkook came online weve...
808    bts' eldest member jin shared pictures message...
809    lot teasing, benny blanco’s collaborative song...
Name: stopwords, Length: 806, dtype: object

# **Lemmatization:**

In [320]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [321]:
from nltk.stem import WordNetLemmatizer
lmz=WordNetLemmatizer()

In [323]:
df["lemmatization"]=df["stopwords"].apply(lambda x: " ".join(lmz.lemmatize(word,pos="v") for word in x.split() ))
df["lemmatization"]

0      reach hotel city, rm reveal stay would four da...
1      rm aka kim namjoon first member join bts. grou...
2      billie eilish's concert hold seoul, south kore...
3      bts army y'all would miss members lot, right? ...
4      bts member kim seokjin aka jin capacity create...
                             ...                        
805    bts conquer world group release ever since deb...
806    today mark 700 days since bts' worldwide hands...
807    bts' youngest member jungkook come online weve...
808    bts' eldest member jin share picture message f...
809    lot teasing, benny blanco’s collaborative song...
Name: lemmatization, Length: 806, dtype: object

# **step2: Tokenization**

In [324]:
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [325]:
df["tokenize"]=df["lemmatization"].apply(lambda row:nltk.sent_tokenize(row))
df["tokenize"][0]

['reach hotel city, rm reveal stay would four days add would step dinner.',
 'sit roadside open-air restaurant, rm feast beer, burgers fries.',
 'said, "i\'m starve right now.',
 "i'm grab food.",
 'much quieter expect feel like rural town.',
 'like familiar atmosphere."',
 'rm attend art basel explain camera detail art fair.',
 'also give glimpse noodles beer follow soup noodles wrap.',
 'show pattern ping pong table, rm said, "the table look like (bts) symbol."',
 'also speak art piece view them.',
 'that, rm take tram visit foundation beyeler, museum.',
 'later take walk city.',
 'third day, rm visit kunstmuseum basel, vitra design museum gallery.',
 'walk around, rm show chair fan said, "i break news guys.',
 "coldplay's chris martin make chair display vitra design museum.",
 'see chris, give call.',
 'amazing."',
 'rm next visit lucerne hike mount rigi.',
 'recall previous visit lucerne, rm added, "i remember day cross bridge buy souvenirs."',
 'also remind bon voyage, reality sho

# **step3: Remove Punctuation**

In [326]:
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [327]:
import re 
regex=re.compile("[%s]"% re.escape(string.punctuation))
def remove_punc(row): 
    new_list=[]
    for sent_token in row:
        new_token=regex.sub(u"",sent_token)
        if not new_token==u"":
            new_list.append(new_token)  
    return new_list

df["punctuation"]=df["tokenize"].apply(remove_punc)
df["punctuation"][0]

['reach hotel city rm reveal stay would four days add would step dinner',
 'sit roadside openair restaurant rm feast beer burgers fries',
 'said im starve right now',
 'im grab food',
 'much quieter expect feel like rural town',
 'like familiar atmosphere',
 'rm attend art basel explain camera detail art fair',
 'also give glimpse noodles beer follow soup noodles wrap',
 'show pattern ping pong table rm said the table look like bts symbol',
 'also speak art piece view them',
 'that rm take tram visit foundation beyeler museum',
 'later take walk city',
 'third day rm visit kunstmuseum basel vitra design museum gallery',
 'walk around rm show chair fan said i break news guys',
 'coldplays chris martin make chair display vitra design museum',
 'see chris give call',
 'amazing',
 'rm next visit lucerne hike mount rigi',
 'recall previous visit lucerne rm added i remember day cross bridge buy souvenirs',
 'also remind bon voyage reality show feature bts members rm jin suga jhope jimin v ju

In [328]:
df["final content"]=df["punctuation"].str.join(" ")
df["final content"]

0      reach hotel city rm reveal stay would four day...
1      rm aka kim namjoon first member join bts group...
2      billie eilishs concert hold seoul south korea ...
3      bts army yall would miss members lot right wel...
4      bts member kim seokjin aka jin capacity create...
                             ...                        
805    bts conquer world group release ever since deb...
806    today mark 700 days since bts worldwide handso...
807    bts youngest member jungkook come online wever...
808    bts eldest member jin share picture message fa...
809    lot teasing benny blanco’s collaborative song ...
Name: final content, Length: 806, dtype: object

In [329]:
df

Unnamed: 0,original content,lowercase,stopwords,lemmatization,tokenize,punctuation,final content
0,"After reaching his hotel in the city, RM revea...","after reaching his hotel in the city, rm revea...","reaching hotel city, rm revealed stay would fo...","reach hotel city, rm reveal stay would four da...","[reach hotel city, rm reveal stay would four d...",[reach hotel city rm reveal stay would four da...,reach hotel city rm reveal stay would four day...
1,RM aka Kim Namjoon was the first member to joi...,rm aka kim namjoon was the first member to joi...,rm aka kim namjoon first member join bts. grou...,rm aka kim namjoon first member join bts. grou...,"[rm aka kim namjoon first member join bts., gr...","[rm aka kim namjoon first member join bts, gro...",rm aka kim namjoon first member join bts group...
2,"Billie Eilish's concert was held in Seoul, Sou...","billie eilish's concert was held in seoul, sou...","billie eilish's concert held seoul, south kore...","billie eilish's concert hold seoul, south kore...","[billie eilish's concert hold seoul, south kor...",[billie eilishs concert hold seoul south korea...,billie eilishs concert hold seoul south korea ...
3,BTS ARMY y'all would be missing the members a ...,bts army y'all would be missing the members a ...,"bts army y'all would missing members lot, righ...","bts army y'all would miss members lot, right? ...","[bts army y'all would miss members lot, right?...","[bts army yall would miss members lot right, w...",bts army yall would miss members lot right wel...
4,BTS member Kim Seokjin aka Jin has the capacit...,bts member kim seokjin aka jin has the capacit...,bts member kim seokjin aka jin capacity create...,bts member kim seokjin aka jin capacity create...,[bts member kim seokjin aka jin capacity creat...,[bts member kim seokjin aka jin capacity creat...,bts member kim seokjin aka jin capacity create...
...,...,...,...,...,...,...,...
805,BTS has conquered the world with their group r...,bts has conquered the world with their group r...,bts conquered world group releases ever since ...,bts conquer world group release ever since deb...,[bts conquer world group release ever since de...,[bts conquer world group release ever since de...,bts conquer world group release ever since deb...
806,Today marks 700 days since BTS' worldwide hand...,today marks 700 days since bts' worldwide hand...,today marks 700 days since bts' worldwide hand...,today mark 700 days since bts' worldwide hands...,[today mark 700 days since bts' worldwide hand...,[today mark 700 days since bts worldwide hands...,today mark 700 days since bts worldwide handso...
807,BTS' youngest member Jungkook came online on W...,bts' youngest member jungkook came online on w...,bts' youngest member jungkook came online weve...,bts' youngest member jungkook come online weve...,[bts' youngest member jungkook come online wev...,[bts youngest member jungkook come online weve...,bts youngest member jungkook come online wever...
808,BTS' eldest member Jin has shared pictures and...,bts' eldest member jin has shared pictures and...,bts' eldest member jin shared pictures message...,bts' eldest member jin share picture message f...,[bts' eldest member jin share picture message ...,[bts eldest member jin share picture message f...,bts eldest member jin share picture message fa...


# **feature extraction**

In [253]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [301]:
vec=TfidfVectorizer(max_features=500)

In [302]:
sample_df=df["punctuation"].str.join(" ")[:5]
sample_df

0    reach hotel city rm reveal stay would four day...
1    rm aka kim namjoon first member join bts group...
2    billie eilishs concert hold seoul south korea ...
3    bts army yall would miss members lot right wel...
4    bts member kim seokjin aka jin capacity create...
Name: punctuation, dtype: object

In [None]:
sample_df.values

In [304]:
vec.fit(sample_df)

In [None]:
vec.get_feature_names_out()

In [None]:
vec.vocabulary_

In [307]:
vectors=vec.fit_transform(sample_df)

In [308]:
vectors

<5x489 sparse matrix of type '<class 'numpy.float64'>'
	with 636 stored elements in Compressed Sparse Row format>

In [309]:
vectors.shape

(5, 489)

In [310]:
sample_df[:2]

0    reach hotel city rm reveal stay would four day...
1    rm aka kim namjoon first member join bts group...
Name: punctuation, dtype: object

In [311]:
vectors.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0681383 , 0.0681383 , 0.0681383 , ..., 0.        , 0.0681383 ,
        0.0681383 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.08221287, 0.        ,
        0.        ]])