In [2]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [5]:
# Load the scraped news articles from a CSV resolved relative to the working directory.
news_df = pd.read_csv('Final_News_Scrapped.csv')
# Bare trailing expression so the notebook renders the frame.
news_df

Unnamed: 0,Title,Content
0,Eight dead in Japan earthquake as search for s...,Thousands of people have spent the night in ev...
1,Watch: How earthquake shook Japan on New Year'...,"Dozens of buildings collapse in several towns,..."
2,Australian jailed in Iraq reaches grim milestone,"Robert Pether has been detained for 1,000 days..."
3,Nobel laureate Muhammad Yunus sentenced to jail,Supporters of the Bangladeshi economist say th...
4,Israel Supreme Court strikes down judicial ref...,The controversial plans triggered nationwide p...
...,...,...
498,Taiwan's President Tsai urges China to seek 'p...,Taiwan is less than two weeks from an election...
499,UN peacekeeping mission wraps up Mali deployme...,"The UN stabilisation mission MINUSMA, had been..."
500,Guinea junta announces constitutional referendum,The junta also said that people appointed by t...
501,U.S. Supreme Court Chief Justice urges ‘cautio...,The Chief Justice’s commentary is his most sig...


In [7]:
# Print the column names, non-null counts, and dtypes of the loaded frame.
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      503 non-null    object
 1     Content  503 non-null    object
dtypes: object(2)
memory usage: 8.0+ KB


In [9]:
# (rows, columns) of the loaded frame.
news_df.shape

(503, 2)

In [18]:
# Replace the space-padded '  Content' header with a clean 'Content' label.
# NOTE(review): the displayed result still lists 'Title ' with a trailing
# space -- confirm whether that header should be normalised as well.
news_df = news_df.rename(
    columns={'  Content': 'Content'},
)
news_df.columns

Index(['Title ', 'Content'], dtype='object')

### Remove HTML tags, advertisements, and non-text content:

In [12]:
from bs4 import BeautifulSoup
import re

def clean_html_tags(raw_html):
    """Return the text content of an HTML fragment with all tags removed.

    Args:
        raw_html: Markup (or plain text) to parse with the built-in
            'html.parser' backend.

    Returns:
        str: The document's text nodes joined by single spaces.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')
    # Join text nodes with a space: the default empty separator glues the last
    # word of one element onto the first word of the next
    # ("<p>one</p><p>two</p>" -> "onetwo"), which corrupts tokens downstream.
    # Input without any tags is a single text node and is returned unchanged.
    return soup.get_text(separator=' ')

def remove_non_text(content):
    """Delete every character that is neither a word character nor whitespace.

    Args:
        content: Text to filter.

    Returns:
        str: ``content`` with all characters outside ``\\w`` and ``\\s``
        removed (nothing is inserted in their place).
    """
    # Anything matched by the negated class is dropped outright.
    non_text_pattern = r'[^\w\s]'
    return re.sub(non_text_pattern, '', content)


### Tokenization and stop words removal:

In [11]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the tokenizer models and the stop-word corpus used by the helpers below.
nltk.download('punkt')
# NLTK >= 3.9 loads the Punkt tokenizer from the 'punkt_tab' resource instead
# of the pickled 'punkt' models; fetch both so word_tokenize works on either.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Stored as a set so the per-token membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    """Split ``text`` into tokens and discard English stop words.

    Args:
        text: String to tokenize with ``word_tokenize``.

    Returns:
        list: Tokens in their original order and casing, excluding any whose
        lower-cased form appears in ``stop_words``.
    """
    kept = []
    for token in word_tokenize(text):
        # Compare case-insensitively, but keep the token's original casing.
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VENKA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VENKA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Lemmatization (using WordNet Lemmatizer from NLTK):

In [14]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus that the lemmatizer below relies on.
nltk.download('wordnet')

# Single shared lemmatizer instance, reused by lemmatize_words.
lemmatizer = WordNetLemmatizer()

def lemmatize_words(tokens):
    """Lemmatize each token with the shared WordNet lemmatizer.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list: One lemmatized string per input token, in the same order.
    """
    # NOTE(review): no part-of-speech is supplied, so the lemmatizer falls back
    # on its default; the saved output shows 'le' where the source text reads
    # "less" -- confirm whether POS-aware lemmatization is wanted.
    return list(map(lemmatizer.lemmatize, tokens))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VENKA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
# Drop incomplete rows BEFORE cleaning: the helpers below expect strings, so a
# missing value would raise inside them long before a trailing dropna could
# remove it.
news_df = news_df.dropna()

# Run the cleaning stages in order: strip markup -> drop non-word characters
# -> tokenize and remove stop words -> lemmatize. `assign` builds a new frame
# rather than mutating the one displayed by earlier cells.
news_df = news_df.assign(
    Content=(
        news_df['Content']
        .apply(clean_html_tags)
        .apply(remove_non_text)
        .apply(tokenize_and_remove_stopwords)
        .apply(lemmatize_words)
    )
)

# Persist the result; each token list is written to the CSV as its string
# representation, so readers of this file must parse it back into a list.
news_df.to_csv('cleaned_final_content.csv', index=False)

In [3]:
# Reload the cleaned articles from the CSV in the working directory.
# NOTE: as the displayed rows show, 'Content' comes back as the string
# representation of a token list (e.g. "['Thousands', 'people', ...]"),
# not as an actual Python list.
final_con = pd.read_csv('cleaned_final_content.csv')
final_con

Unnamed: 0,Title,Content
0,Eight dead in Japan earthquake as search for s...,"['Thousands', 'people', 'spent', 'night', 'eva..."
1,Watch: How earthquake shook Japan on New Year'...,"['Dozens', 'building', 'collapse', 'several', ..."
2,Australian jailed in Iraq reaches grim milestone,"['Robert', 'Pether', 'detained', '1000', 'day'..."
3,Nobel laureate Muhammad Yunus sentenced to jail,"['Supporters', 'Bangladeshi', 'economist', 'sa..."
4,Israel Supreme Court strikes down judicial ref...,"['controversial', 'plan', 'triggered', 'nation..."
...,...,...
498,Taiwan's President Tsai urges China to seek 'p...,"['Taiwan', 'le', 'two', 'week', 'election', 'c..."
499,UN peacekeeping mission wraps up Mali deployme...,"['UN', 'stabilisation', 'mission', 'MINUSMA', ..."
500,Guinea junta announces constitutional referendum,"['junta', 'also', 'said', 'people', 'appointed..."
501,U.S. Supreme Court Chief Justice urges ‘cautio...,"['Chief', 'Justices', 'commentary', 'significa..."


In [None]:
import ast

from sklearn.feature_extraction.text import TfidfVectorizer


def _tokens_to_text(cell):
    """Turn one 'Content' cell into a single space-separated document string."""
    # After the CSV round-trip each cell is the *string form* of a token list
    # ("['a', 'b']"). Joining that string directly would put a space between
    # every character, so parse it back into a real list first. Cells that are
    # already lists are used as they are.
    tokens = ast.literal_eval(cell) if isinstance(cell, str) else cell
    return ' '.join(tokens)


# One whitespace-joined document per article.
content_list = final_con['Content'].apply(_tokens_to_text).tolist()

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the content data
tfidf_matrix = tfidf_vectorizer.fit_transform(content_list)