Importing the required libraries

In [1]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import spacy
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the spaCy 'en_core_web_sm' pipeline once; `nlp` is reused by the
# named-entity cells further down. The model package must already be installed.
nlp = spacy.load('en_core_web_sm')

Creating a DataFrame by reading the CSV file

In [3]:
# Read the articles dataset; the file encoding is given explicitly as latin-1.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably an
# index that was saved into the CSV -- consider index_col=0 if it is not needed.
data = pd.read_csv("articles.csv", encoding='latin-1')
data.head()

Unnamed: 0,Article,Title
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms
2,You must have seen the news divided into categ...,News Classification with Machine Learning
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning


Creating word clouds to visualize the most frequent words in the titles.

In [4]:
# Combine all titles into one space-separated string (input for the word cloud).
# NOTE(review): str.join raises TypeError on non-string items, so this assumes
# the 'Title' column has no missing values -- TODO confirm.
titles_text = ' '.join(data['Title'])

In [5]:
# Build an 800x400 word cloud on a white background from the combined titles
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(titles_text)

In [6]:
# Render the word cloud as an image figure (no legend is needed for an image)
fig = px.imshow(
    wordcloud,
    title='Word Cloud of Titles',
).update_layout(showlegend=False)
fig.show()

#### Sentiment Analysis

Analyzing the sentiment expressed in the articles to understand the overall tone or sentiment of the content

In [9]:
# Create two new columns 'Subjectivity' & 'Polarity'.
# Each article is analysed once and both scores are read from that single
# result, instead of building and analysing a TextBlob twice per article.
sentiments = [TextBlob(article).sentiment for article in data['Article']]
data['Subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]
data['Polarity'] = [sentiment.polarity for sentiment in sentiments]
def getAnalysis(score):
    """Translate a polarity score into a sentiment label.

    Parameters
    ----------
    score : float
        Polarity value to classify.

    Returns
    -------
    str
        'Negative' when the score is below zero, 'Neutral' when it is exactly
        zero, and 'Positive' in every other case.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'
# Label every article from its polarity score, then display the frame.
# NOTE(review): the saved output below lists an 'Analysis' column that no
# visible cell creates -- presumably left over from an earlier run; re-run
# from a fresh kernel to confirm.
data['Sentiment'] = data['Polarity'].apply(getAnalysis)
data

Unnamed: 0,Article,Title,Subjectivity,Polarity,Analysis,Sentiment
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis,0.358333,0.666667,Positive,Positive
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms,0.354167,0.020833,Positive,Positive
2,You must have seen the news divided into categ...,News Classification with Machine Learning,0.9,0.6,Positive,Positive
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...,0.525,0.625,Positive,Positive
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning,0.771429,-0.101429,Negative,Negative
5,You must have seen the news divided into categ...,News Classification with Machine Learning,0.9,0.6,Positive,Positive
6,Natural language processing or NLP is a subfie...,Best Books to Learn NLP,0.5,0.283333,Positive,Positive
7,By using a third-party application or API to m...,Send Instagram Messages using Python,0.2,0.05,Positive,Positive
8,Twitter is one of the most popular social medi...,Pfizer Vaccine Sentiment Analysis using Python,0.553333,0.406667,Positive,Positive
9,The squid game is currently one of the most tr...,Squid Game Sentiment Analysis using Python,0.383333,-0.108333,Negative,Negative


Subjectivity measures how much of the text is personal opinion rather than factual information: TextBlob reports it on a scale from 0 (objective) to 1 (subjective), so a higher value means the text contains more opinion. Polarity ranges from -1 to 1 and indicates whether the sentiment is negative (below 0), neutral (exactly 0), or positive (above 0).

In [10]:
# One histogram per sentiment-derived column; the two count-based charts also
# print the bar height on each bar.
fig1 = px.histogram(
    data,
    x='Subjectivity',
    title='Subjectivity Distribution',
    histfunc='count',
    text_auto=True,
)
fig2 = px.histogram(
    data,
    x='Polarity',
    title='polarity Distribution',
)
fig3 = px.histogram(
    data,
    x='Sentiment',
    title='Complete Analysis',
    histfunc='count',
    text_auto=True,
)
for sentiment_fig in (fig1, fig2, fig3):
    sentiment_fig.show()

From the above analysis, it can be observed that most of the articles have positive polarity, while the majority of the articles contain factual information.

#### Named Entity Recognition

In [17]:
# Take the article in the row labelled 0 as the sample text for the NER walkthrough
text = data.loc[0, 'Article']
text

'Data analysis is the process of inspecting and exploring data generated by a particular population to find the information needed to make decisions and draw conclusions. With the use of data in decision making, most businesses today need data analysts. So, if you want to know about the best books to learn data analysis, this article is for you. In this article, I will introduce you to some of the best books to learn data analysis.'

In [18]:
# Run the loaded spaCy pipeline over the sample article; `doc` is inspected
# (sentences, tokens, entities) in the following cells.
doc = nlp(text)

In [19]:
# Materialise the sentence spans of the parsed document into a list
sentence = [sent for sent in doc.sents]
sentence

[Data analysis is the process of inspecting and exploring data generated by a particular population to find the information needed to make decisions and draw conclusions.,
 With the use of data in decision making, most businesses today need data analysts.,
 So, if you want to know about the best books to learn data analysis, this article is for you.,
 In this article, I will introduce you to some of the best books to learn data analysis.]

In [20]:
# Print the text of every token in the parsed document, one per line
for token in doc:
    print(token.text)

Data
analysis
is
the
process
of
inspecting
and
exploring
data
generated
by
a
particular
population
to
find
the
information
needed
to
make
decisions
and
draw
conclusions
.
With
the
use
of
data
in
decision
making
,
most
businesses
today
need
data
analysts
.
So
,
if
you
want
to
know
about
the
best
books
to
learn
data
analysis
,
this
article
is
for
you
.
In
this
article
,
I
will
introduce
you
to
some
of
the
best
books
to
learn
data
analysis
.


In [21]:
# Collect (text, start offset, end offset, label) for each entity found in `doc`
ents = [
    (ent.text, ent.start_char, ent.end_char, ent.label_)
    for ent in doc.ents
]
print(ents)

[('today', 227, 232, 'DATE')]


In [22]:
def extract_named_entities(text):
    """Group the named entities found in `text` by their label.

    Parameters
    ----------
    text : str
        Raw text handed to the module-level `nlp` pipeline.

    Returns
    -------
    dict
        Maps each entity label to the list of entity texts carrying that
        label, in order of appearance. Empty when no entity is found.
    """
    grouped = {}
    for span in nlp(text).ents:
        grouped.setdefault(span.label_, []).append(span.text)
    return grouped

# Run entity extraction on every article; each cell of the new column holds a
# dict mapping entity label -> list of entity texts.
data['Named_Entities'] = data['Article'].apply(extract_named_entities)

In [23]:
# Display the per-article entity dictionaries
data['Named_Entities']

0                                   {'DATE': ['today']}
1                                                    {}
2                                                    {}
3           {'CARDINAL': ['only two', 'more than two']}
4     {'ORG': ['The Multinomial Naive Bayes', 'Naive...
5                                                    {}
6     {'ORG': ['NLP', 'NLP', 'NLP', 'NLP'], 'CARDINA...
7     {'ORDINAL': ['third'], 'ORG': ['API', 'Instagr...
8     {'PRODUCT': ['Twitter', 'Twitter'], 'CARDINAL'...
9             {'ORG': ['NetFlix'], 'CARDINAL': ['One']}
10    {'ORG': ['Computer Vision', 'Artificial Intell...
11    {'DATE': ['today', 'today'], 'ORG': ['Google',...
12    {'ORG': ['API', 'Application Programming Inter...
13                                  {'DATE': ['today']}
14                        {'DATE': ['a few years ago']}
15    {'ORG': ['Multilayer Perceptron', 'Multilayer ...
16                          {'ORG': ['Data Scientist']}
17                                              

In [None]:
# Count how often each entity *text* occurs across all articles.
# Every value in 'Named_Entities' is a dict {label: [entity texts]}; iterating
# over the dict itself would only yield its labels (once per article), so the
# lists of texts are walked explicitly.
entity_counts = Counter(
    entity_text
    for entities in data['Named_Entities']
    for entity_texts in entities.values()
    for entity_text in entity_texts
)

# most_common() returns (entity, count) pairs ordered from most to least
# frequent, so the first rows of the frame are the most frequent entities.
# Passing the column names here also keeps this working when nothing was found.
entity_df = pd.DataFrame(entity_counts.most_common(), columns=['Entity', 'Count'])

In [None]:
# Bar chart of the first ten rows of entity_df
top_entities = entity_df.head(10)
fig = px.bar(
    top_entities,
    x='Entity',
    y='Count',
    title='Top 10 Named Entities',
)
fig.show()

In [None]:
# Bag-of-words term counts for the articles: English stop words removed,
# max_df / min_df filter overly common and overly rare terms, and the
# vocabulary is capped at 1000 features.
vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
tf = vectorizer.fit_transform(data['Article'])

# Fit a 5-component LDA model with a fixed seed and keep the transformed
# matrix (later cells take the argmax of each row as the dominant topic).
lda_model = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
)
lda_topic_matrix = lda_model.fit_transform(tf)

In [None]:
# Human-readable label for every LDA component: "Topic 0", "Topic 1", ...
topic_names = [f"Topic {topic_idx}" for topic_idx in range(lda_model.n_components)]
topic_names

In [None]:
# For each row of the topic matrix take the column with the largest value and
# store the matching topic label.
dominant_topic_idx = lda_topic_matrix.argmax(axis=1)
data['Dominant_Topic'] = [topic_names[idx] for idx in dominant_topic_idx]

In [None]:
# Number of articles assigned to each dominant topic
data['Dominant_Topic'].value_counts()

In [None]:
# Same counts as a DataFrame.
# NOTE(review): the resulting column names depend on the pandas version
# (presumably 'Dominant_Topic'/'count' on 2.x, 'index'/'Dominant_Topic' on
# 1.x) -- verify against the installed version.
data['Dominant_Topic'].value_counts().reset_index()

In [None]:
# Count articles per dominant topic. rename_axis + reset_index(name=...) pin
# the column names to 'Dominant_Topic' / 'count' explicitly, because the names
# produced by a bare value_counts().reset_index() differ between pandas 1.x
# and 2.x and the chart below refers to the columns by name.
topic_counts = (
    data['Dominant_Topic']
    .value_counts()
    .rename_axis('Dominant_Topic')
    .reset_index(name='count')
)
fig = px.bar(
    topic_counts,
    x='Dominant_Topic',
    y='count',
    title='Topic Distribution',
)
fig.show()