Importing the required libraries

In [36]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import spacy
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the spaCy English pipeline once; `nlp` is reused by the
# named-entity extraction further down.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

Creating a DataFrame by reading the CSV file

In [23]:
# Read the articles dataset (latin-1 encoded) and preview the first rows.
ARTICLES_CSV = "articles.csv"
data = pd.read_csv(ARTICLES_CSV, encoding='latin-1')
data.head()

Unnamed: 0,Article,Title
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms
2,You must have seen the news divided into categ...,News Classification with Machine Learning
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning


Creating word clouds to visualize the most frequent words in the titles.

In [24]:
# Combine all the titles into a single space-separated string for the word cloud.
# pandas reads a missing title as NaN (a float), which would make str.join
# raise TypeError, so missing titles are dropped and the rest coerced to str.
titles_text = ' '.join(data['Title'].dropna().astype(str))

In [25]:
# Configure the word cloud canvas, then generate it from the combined titles
cloud_settings = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_settings).generate(titles_text)

In [26]:
# Render the word cloud as an image figure (no legend) and display it
fig = px.imshow(wordcloud, title='Word Cloud of Titles').update_layout(showlegend=False)
fig.show()

#### Sentiment Analysis

Analyzing the sentiment expressed in the articles to understand the overall tone or sentiment of the content

In [28]:
# Create two new columns, 'Subjectivity' and 'Polarity', from TextBlob's
# sentiment scores. Each article is analysed exactly once and both scores are
# read from that single result, instead of building and scoring a TextBlob
# twice per article.
article_sentiments = [TextBlob(article).sentiment for article in data['Article']]
data['Subjectivity'] = [sentiment.subjectivity for sentiment in article_sentiments]
data['Polarity'] = [sentiment.polarity for sentiment in article_sentiments]
def getAnalysis(score):
    """Translate a polarity score into a sentiment label.

    Parameters
    ----------
    score : number
        Polarity value to classify.

    Returns
    -------
    str
        'Negative' when ``score`` is below zero, 'Neutral' when it equals
        zero, and 'Positive' for every other value.
    """
    # Guard clause for the negative side first; whatever is neither negative
    # nor exactly zero is reported as positive.
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'
# Label every article from its polarity score, then display the frame
data['Analysis'] = data['Polarity'].map(getAnalysis)
data

Unnamed: 0,Article,Title,Subjectivity,Polarity,Analysis
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis,0.358333,0.666667,Positive
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms,0.354167,0.020833,Positive
2,You must have seen the news divided into categ...,News Classification with Machine Learning,0.9,0.6,Positive
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...,0.525,0.625,Positive
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning,0.771429,-0.101429,Negative
5,You must have seen the news divided into categ...,News Classification with Machine Learning,0.9,0.6,Positive
6,Natural language processing or NLP is a subfie...,Best Books to Learn NLP,0.5,0.283333,Positive
7,By using a third-party application or API to m...,Send Instagram Messages using Python,0.2,0.05,Positive
8,Twitter is one of the most popular social medi...,Pfizer Vaccine Sentiment Analysis using Python,0.553333,0.406667,Positive
9,The squid game is currently one of the most tr...,Squid Game Sentiment Analysis using Python,0.383333,-0.108333,Negative


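Subjectivity measures how much of the text is personal opinion rather than factual information. TextBlob reports it as a score from 0.0 (very objective) to 1.0 (very subjective), so a higher subjectivity means the text is more opinion-based. Polarity measures the direction of the sentiment on a scale from -1.0 to 1.0: a score below 0 is labelled Negative, a score of exactly 0 is labelled Neutral, and a score above 0 is labelled Positive.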

In [45]:
# One histogram per sentiment column: subjectivity scores, polarity scores,
# and the derived Positive/Neutral/Negative label.
fig1 = px.histogram(
    data,
    x='Subjectivity',
    title='Subjectivity Distribution',
    histfunc='count',
    text_auto=True,
)
fig2 = px.histogram(
    data,
    x='Polarity',
    title='polarity Distribution',
)
fig3 = px.histogram(
    data,
    x='Analysis',
    title='Complete Analysis',
    histfunc='count',
    text_auto=True,
)
for sentiment_fig in (fig1, fig2, fig3):
    sentiment_fig.show()

From the above analysis, it can be observed that most of the articles have positive polarity, and that the majority of the articles contain mostly factual information.

#### Named Entity Recognition

In [10]:
def extract_named_entities(text):
    """Group the named entities found in ``text`` by their entity label.

    The text is processed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    dict
        Maps each entity label to the list of entity texts carrying that
        label, in the order spaCy yields them. Empty when no entities are
        found.
    """
    grouped = {}
    for span in nlp(text).ents:
        # Start a new list the first time a label is seen.
        grouped.setdefault(span.label_, []).append(span.text)
    return grouped

# Store, per article, the dict of entity texts grouped by label
data['Named_Entities'] = data['Article'].map(extract_named_entities)

In [11]:
# Count how often each named entity is mentioned across all articles.
# Every value in data['Named_Entities'] is a dict of {label: [entity texts]}.
# Iterating such a dict directly yields only its keys (the labels), so the
# entity texts have to be reached through .values().
entity_counts = Counter(
    entity_text
    for entities_by_label in data['Named_Entities']
    for entity_texts in entities_by_label.values()
    for entity_text in entity_texts
)

# most_common() orders the pairs from highest to lowest count, so the first
# rows of entity_df are the most frequent entities. Building the frame from
# (entity, count) pairs with explicit column names also works when no
# entities were found at all.
entity_df = pd.DataFrame(entity_counts.most_common(), columns=['Entity', 'Count'])

In [12]:
# Select the ten rows with the largest 'Count' explicitly so the chart really
# shows the top 10, without relying on entity_df already being sorted.
top_entities = entity_df.nlargest(10, 'Count')
fig = px.bar(top_entities, x='Entity', y='Count', title='Top 10 Named Entities')
fig.show()

In [13]:
# Topic modelling: turn the articles into a term-count matrix, then fit a
# 5-topic LDA model on it. The fixed random_state keeps the topics reproducible.
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)
tf = vectorizer.fit_transform(data['Article'])

lda_model = LatentDirichletAllocation(random_state=42, n_components=5)
# One row per article, one column per topic
lda_topic_matrix = lda_model.fit_transform(tf)

In [17]:
# One display label per LDA topic: "Topic 0", "Topic 1", ...
topic_names = [f"Topic {topic_idx}" for topic_idx in range(lda_model.n_components)]
topic_names

['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4']

In [18]:
# For each article, take the topic with the highest weight in its row of the
# LDA matrix and store that topic's label.
dominant_topic_ids = lda_topic_matrix.argmax(axis=1)
data['Dominant_Topic'] = [topic_names[topic_id] for topic_id in dominant_topic_ids]

0     Topic 1
1     Topic 0
2     Topic 1
3     Topic 3
4     Topic 4
5     Topic 1
6     Topic 0
7     Topic 1
8     Topic 2
9     Topic 2
10    Topic 1
11    Topic 1
12    Topic 1
13    Topic 1
14    Topic 1
15    Topic 1
16    Topic 1
17    Topic 4
18    Topic 1
19    Topic 2
20    Topic 3
21    Topic 3
22    Topic 3
23    Topic 3
24    Topic 3
25    Topic 1
26    Topic 1
27    Topic 1
28    Topic 0
29    Topic 0
30    Topic 1
31    Topic 1
32    Topic 4
33    Topic 1
Name: Dominant_Topic, dtype: object

In [19]:
# Number of articles assigned to each dominant topic
topic_counts = data['Dominant_Topic'].value_counts()
topic_counts

Dominant_Topic
Topic 1    18
Topic 3     6
Topic 0     4
Topic 4     3
Topic 2     3
Name: count, dtype: int64

In [20]:
# Same counts as a two-column table, the shape a plotly bar chart expects
topic_count_table = data['Dominant_Topic'].value_counts().reset_index()
topic_count_table

Unnamed: 0,Dominant_Topic,count
0,Topic 1,18
1,Topic 3,6
2,Topic 0,4
3,Topic 4,3
4,Topic 2,3


In [22]:
# Build the topic/count table with explicit column names. The names that
# value_counts().reset_index() produces by default differ between pandas
# versions (only pandas >= 2.0 yields 'Dominant_Topic' and 'count'), so naming
# them here keeps the x/y references below valid on either version.
topic_distribution = (
    data['Dominant_Topic']
    .value_counts()
    .rename_axis('Dominant_Topic')
    .reset_index(name='count')
)
fig = px.bar(topic_distribution, x='Dominant_Topic', y='count', title='Topic Distribution')
fig.show()