In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
data= pd.read_excel('Input.xlsx')

In [3]:
data.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


## Scraping the Title of the Articles

In [4]:
def article_title(link):
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')
    if soup.h1 is not None:
        art_title = soup.h1.string
        return art_title
    else:
        print(f"No title found for {link}")
        return "No Title Found"

In [5]:
data['Title']= data['URL'].apply(article_title)

No title found for https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
No title found for https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


* No title found for two URLs

In [6]:
data.head()

Unnamed: 0,URL_ID,URL,Title
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp..."
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...


## Scraped the Articles content from the Links

In [7]:
def article_para(link):
    response = requests.get(link)
    soup = BeautifulSoup(response.content, "html.parser")
    art_content = soup.article  # Look for the main content container

    if art_content is not None:
        paragraphs = []
        for i in art_content.find_all(['p', 'li']):  # Find all <p> and <li> tags within the content
            paragraphs.append(i.get_text())
        para = '\n'.join(paragraphs)
        return para.replace('\n',' ')
    else:
        print("No article content found.")

In [8]:
data['Content']= data['URL'].apply(article_para)

No article content found.
No article content found.


* Since there are two links, which are not working as there's no content available in them
* Therefore, dropping these two rows

In [9]:
data.loc[data['URL']=='https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/',]

Unnamed: 0,URL_ID,URL,Title,Content
35,blackassign0036,https://insights.blackcoffer.com/how-neural-ne...,No Title Found,


In [10]:
data.loc[data['URL']=='https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/',]

Unnamed: 0,URL_ID,URL,Title,Content
48,blackassign0049,https://insights.blackcoffer.com/covid-19-envi...,No Title Found,


In [11]:
data.drop(index=[35,48],inplace=True)

In [12]:
data.shape

(98, 4)

In [13]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...


In [14]:
data['Content'][2]

'What We Think Entertainment Future Introduction In the span of just a few decades, the internet has undergone an astounding transformation, becoming an integral part of our lives. As we approach the year 2035, the demand for internet connectivity continues to surge, promising to revolutionize the way we communicate and interact with the world. It also has transformed from a limited communication tool to an all-encompassing global network that shapes our daily lives. As we hurtle toward the year 2035, the trajectory of internet demand and its profound impact on communication is poised to reshape the very foundations of how we connect and interact. This article delves into the intricate interplay between internet demand, communication dynamics, and the alternative pathways that could define our hyper-connected future. The Internet’s Unstoppable Rise The proliferation of smartphones, the Internet of Things (IoT), and the increasing reliance on digital services have driven an exponential 

# Text Preparation

## Removing HTML Tags

In [15]:
import re
def striphtml(x):
    p= re.compile(r'<.*?>')
    return p.sub('',x)

In [16]:
data.columns

Index(['URL_ID', 'URL', 'Title', 'Content'], dtype='object')

In [17]:
data['Content']= data['Content'].apply(striphtml)

In [18]:
data['Content'][0]

'Entertainment Broadcasting Future Forecasts What We Think Futurist We have seen a huge development and dependence of people on technology in recent years. We have also seen the development of AI and ChatGPT in recent years. So it is a normal thing that we will become fully dependent on technology by 2040. Information technology will be a major power for all the developing nations. As a member of a developing nation, India is rapidly growing its IT base. It has also grown some IT cities which will be the major control centres for Information technology by 2040. Rising IT cities Noida:- Noida in Uttar Pradesh near New Delhi is an emerging IT sector now. Many large companies like Google, Microsoft, IBM, Infosys and others have set up their companies here. Noida has a market base of billions of dollars and is doing a great job of boosting the national economy. The establishment of so many software companies has made Noida an information technology hub. Gurgaon:- Gurgaon in Haryana is also

## Removing Numbers

In [19]:
def remove_numb(text):
    return ''.join(i for i in text if not i.isdigit())

In [20]:
data['Content']= data['Content'].apply(remove_numb)

##  Removing Stopwords

* Importing the text Files given 

In [21]:
a= ['StopWords_Auditor.txt','StopWords_Currencies.txt','StopWords_DatesandNumbers.txt','StopWords_Generic.txt',\
    'StopWords_GenericLong.txt','StopWords_Geographic.txt','StopWords_Names.txt']
b=[]
for i in a:
    with open(i,'r') as stopfile:
        stopword= stopfile.read().lower()
        b.extend(stopword.split('\n'))
print(b)

['ernst', 'young', 'deloitte', 'touche', 'kpmg', 'pricewaterhousecoopers', 'pricewaterhouse', 'coopers', '', 'afghani  | afghanistan ', 'ariary | madagascar ', 'baht | thailand ', 'balboa | panama ', 'birr | ethiopia ', 'bolivar | venezuela ', 'boliviano  | bolivia ', 'cedi | ghana ', 'colon  | costa rica ', 'córdoba  | nicaragua ', 'dalasi | gambia ', 'denar | macedonia (former yug. rep.) ', 'dinar | algeria ', 'dirham  | morocco ', 'dobra | são tom and príncipe ', 'dong | vietnam ', 'dram | armenia ', 'escudo  | cape verde ', 'euro  | belgium ', 'florin | aruba ', 'forint | hungary ', 'gourde | haiti ', 'guarani | paraguay ', 'gulden | netherlands antilles ', 'hryvnia  | ukraine ', 'kina | papua new guinea ', 'kip | laos ', 'konvertibilna marka  | bosnia-herzegovina ', 'koruna  | czech republic ', 'krona | sweden ', 'krone | denmark ', 'kroon | estonia ', 'kuna | croatia ', 'kwacha | zambia ', 'kwanza | angola ', 'kyat | myanmar ', 'lari | georgia ', 'lats | latvia ', 'lek | albania 

* Removed stopwords and Lemmatized the text afer tokenizing the text into words

In [22]:
from nltk.stem import WordNetLemmatizer
lem= WordNetLemmatizer()
def text_corpuss(x):
    string_format= str(x).lower()
    Lower_words = re.sub('[^a-zA-Z]+', ' ', string_format).strip()
    token= word_tokenize(Lower_words)
    #p= re.compile(r'<.*?>')
    #cleaned_data= p.sub('',token)
    token_word= [t for t in token if t not in b]
    lemmatized= [lem.lemmatize(w) for w in token_word]
    return lemmatized

In [23]:
data['Contt']= data['Content'].apply(text_corpuss)

In [24]:
len(data['Contt'][2])

608

In [25]:
len(data['Content'][2].split())

1068

In [26]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast..."
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo..."
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de..."
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast..."
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ..."


* Importing the text files of Positive and Negative words

In [27]:
WordFile = 'positive-words.txt'
with open(WordFile ,'r') as positivefile:
    posword=positivefile.read().lower()
PositiveWordList=posword.split('\n')

In [28]:
WordFile = 'negative-words.txt'
with open(WordFile ,'r') as negativefile:
    negword=negativefile.read().lower()
NegativeWordList=negword.split('\n')

In [29]:
data['Contt'][0][0]

'entertainment'

In [30]:
len(data['Contt'][0])

473

* Functions defined below to calculate the count of Positive words and Negative words out of the Content

In [31]:
def positive_w(text):
    a=0
    for i in text:
        if i in PositiveWordList:
            a+=1
    return a

In [32]:
data['Positive Words']= data['Contt'].apply(positive_w)

In [33]:
def negative_w(text):
    a=0
    for i in text:
        if i in NegativeWordList:
            a-=1
    return a

In [34]:
data['Negative Words']= data['Contt'].apply(negative_w)

In [35]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9


In [36]:
data['Polarity Score']= np.nan

In [37]:
for i in data.index:
    data['Polarity Score'][i]= np.round((data['Positive Words'][i]-data['Negative Words'][i])/((data['Positive Words'][i]+data['Negative Words'][i])+0.000001),2)

In [38]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64


* Calculated Subjectivity Score with the given fomula- Subjectivity Score = (Positive Score + Negative Score)/ ((Total Words after cleaning) + 0.000001)

In [39]:
data['Subjectivity Score']= np.nan

In [40]:
len(data['Contt'][0])

473

In [41]:
for i in data.index:
    data['Subjectivity Score'][i]= np.round((data['Positive Words'][i]+data['Negative Words'][i])/((len(data['Contt'][i]))+0.000001),2)

In [42]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55,0.05
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7,0.03
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5,0.02
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64,-0.08
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64,0.03


* Sentence Tokenize the Content to calculate the number of sentences in each article

In [43]:
from nltk import sent_tokenize
data['Sentences']= data['Content'].apply(sent_tokenize)

In [44]:
len(data['Sentences'][0])

77

* Calculated Average per sentence with the given formula- Average Sentence Length = the number of words / the number of sentences

In [45]:
data['Average Sentence Length']= np.nan

In [46]:
for i in data.index:
    data['Average Sentence Length'][i]= np.round(len(data['Contt'][i])/len(data['Sentences'][i]),2)

In [47]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Sentences,Average Sentence Length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55,0.05,[Entertainment Broadcasting Future Forecasts W...,6.14
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7,0.03,[What We Think Future Prediction Throughout hi...,8.38
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5,0.02,[What We Think Entertainment Future Introducti...,10.67
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64,-0.08,[Entertainment Broadcasting What We Think Futu...,11.29
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64,0.03,[Entertainment Broadcasting What We Think Futu...,8.76


* Defined Function to calculate the complex word count- Complex words are words in the text that contain more than two syllables.

In [48]:
def Complex(text):
    x = []
    for i in text:
        count=0
        for j in i:
            if j in 'aeiou':
                count+=1
        if count>=2:
            x.append(i)
    return len(x)

In [49]:
data['Complex Word Count'] = data['Contt'].apply(Complex)

In [50]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Sentences,Average Sentence Length,Complex Word Count
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55,0.05,[Entertainment Broadcasting Future Forecasts W...,6.14,383
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7,0.03,[What We Think Future Prediction Throughout hi...,8.38,578
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5,0.02,[What We Think Entertainment Future Introducti...,10.67,564
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64,-0.08,[Entertainment Broadcasting What We Think Futu...,11.29,525
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64,0.03,[Entertainment Broadcasting What We Think Futu...,8.76,301


* Calculated Percentage of Complex words with the given formula- Percentage of Complex words = the number of complex words / the number of words 

In [51]:
data['Percentage of Complex Words']= np.nan

In [52]:
for i in data.index:
    data['Percentage of Complex Words'][i]= np.round(data['Complex Word Count'][i]/len(data['Contt'][i]),2)

In [53]:
data.tail(12)

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Sentences,Average Sentence Length,Complex Word Count,Percentage of Complex Words
88,blackassign0089,https://insights.blackcoffer.com/coronavirus-i...,COVID-19 Impact on Hospitality Industry,Blackcoffer What We Think Healthcare In Decemb...,"[blackcoffer, healthcare, coronavirus, sars, c...",14,-44,-1.93,-0.07,[Blackcoffer What We Think Healthcare In Decem...,8.53,368,0.88
89,blackassign0090,https://insights.blackcoffer.com/lessons-from-...,Lessons from the past: Some key learnings rele...,Blackcoffer What We Think Healthcare As a race...,"[blackcoffer, healthcare, kind, crisis, fire, ...",34,-55,-4.24,-0.04,[Blackcoffer What We Think Healthcare As a rac...,6.66,412,0.85
90,blackassign0091,https://insights.blackcoffer.com/estimating-th...,Estimating the impact of COVID-19 on the world...,Blackcoffer What We Think Healthcare COVID- an...,"[blackcoffer, healthcare, covid, unprecedented...",23,-28,-10.2,-0.01,[Blackcoffer What We Think Healthcare COVID- a...,11.08,383,0.86
91,blackassign0092,https://insights.blackcoffer.com/estimating-th...,Estimating the impact of COVID-19 on the world...,Will COVID END Globalization? Globalization:...,"[covid, end, globalization, globalization, glo...",27,-48,-3.57,-0.03,"[ Will COVID END Globalization?, Globalizatio...",10.02,475,0.79
92,blackassign0093,https://insights.blackcoffer.com/travel-and-to...,Travel and Tourism Outlook,Future Futurist What We Think The UN projects ...,"[future, futurist, project, decline, internati...",3,-3,6000000.0,0.0,[Future Futurist What We Think The UN projects...,6.1,54,0.89
93,blackassign0094,https://insights.blackcoffer.com/gaming-disord...,Gaming Disorder and Effects of Gaming on Health.,Blackcoffer What We Think Entertainment Perhap...,"[blackcoffer, entertainment, virtual, illusion...",27,-44,-4.18,-0.03,[Blackcoffer What We Think Entertainment Perha...,7.85,417,0.86
94,blackassign0095,https://insights.blackcoffer.com/what-is-the-r...,What is the repercussion of the environment du...,Blackcoffer What We Think Healthcare What is C...,"[blackcoffer, healthcare, covid, pandemic, cor...",7,-26,-1.74,-0.07,[Blackcoffer What We Think Healthcare What is ...,8.12,253,0.92
95,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...,Due to the COVID-19 the repercussion of the en...,Blackcoffer What We Think Healthcare Epidemics...,"[blackcoffer, healthcare, epidemic, general, d...",26,-61,-2.49,-0.07,[Blackcoffer What We Think Healthcare Epidemic...,10.66,469,0.88
96,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...,Impact of COVID-19 pandemic on office space an...,Blackcoffer What We Think Commercial & Profess...,"[blackcoffer, commercial, professional, servic...",21,-36,-3.8,-0.04,[Blackcoffer What We Think Commercial & Profes...,10.54,329,0.84
97,blackassign0098,https://insights.blackcoffer.com/contribution-...,Contribution of handicrafts (Visual Arts & Lit...,Blackcoffer Consumer Staples Household & Perso...,"[blackcoffer, consumer, household, personal, p...",6,-2,2.0,0.02,[Blackcoffer Consumer Staples Household & Pers...,8.91,170,0.87


* Calculated FOG Index with the given formula- Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

In [54]:
 data['FOG Index']= np.nan

In [55]:
for i in data.index:
    data['FOG Index'][i]= np.round(0.4*(data['Average Sentence Length'][i]+data['Percentage of Complex Words'][i]),2)

In [56]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Sentences,Average Sentence Length,Complex Word Count,Percentage of Complex Words,FOG Index
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55,0.05,[Entertainment Broadcasting Future Forecasts W...,6.14,383,0.81,2.78
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7,0.03,[What We Think Future Prediction Throughout hi...,8.38,578,0.85,3.69
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5,0.02,[What We Think Entertainment Future Introducti...,10.67,564,0.93,4.64
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64,-0.08,[Entertainment Broadcasting What We Think Futu...,11.29,525,0.91,4.88
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64,0.03,[Entertainment Broadcasting What We Think Futu...,8.76,301,0.9,3.86


* Calculated Average number per sentence with the given formula- Average Number of Words Per Sentence = the total number of words / the total number of sentences

In [57]:
data['Average number of sentence']= np.nan

In [58]:
for i in data.index:
    data['Average number of sentence'][i]= np.round(len(data['Contt'][i])/len(data['Sentences'][i]),2)

* Calculated Word Count for each Article

In [59]:
data['Word Count']= np.nan

In [60]:
for i in data.index:
    data['Word Count'][i]= len(data['Contt'][i])

In [61]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Sentences,Average Sentence Length,Complex Word Count,Percentage of Complex Words,FOG Index,Average number of sentence,Word Count
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55,0.05,[Entertainment Broadcasting Future Forecasts W...,6.14,383,0.81,2.78,6.14,473.0
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7,0.03,[What We Think Future Prediction Throughout hi...,8.38,578,0.85,3.69,8.38,679.0
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5,0.02,[What We Think Entertainment Future Introducti...,10.67,564,0.93,4.64,10.67,608.0
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64,-0.08,[Entertainment Broadcasting What We Think Futu...,11.29,525,0.91,4.88,11.29,576.0
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64,0.03,[Entertainment Broadcasting What We Think Futu...,8.76,301,0.9,3.86,8.76,333.0


* Calculated Syllable per word with some exceptions 

In [62]:
def syllables(text):
    x=[]
    for i in text:
        if i[-1][0]=='s' and ''.join(i[-2:-1])=='e':
            for j in i[:-2]:
                if j in 'aeiou':
                    x.append(j)
        elif i[-1][0]=='d' and ''.join(i[-2:-1])=='e':
            for k in i[:-2]:
                if k in 'aeiou':
                    x.append(k)
        else:
            for m in i:
                if m in 'aeiou':
                    x.append(m)
    return len(x)
    
data['Syllable Per Word'] = data['Contt'].apply(syllables)

In [63]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Sentences,Average Sentence Length,Complex Word Count,Percentage of Complex Words,FOG Index,Average number of sentence,Word Count,Syllable Per Word
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55,0.05,[Entertainment Broadcasting Future Forecasts W...,6.14,383,0.81,2.78,6.14,473.0,1154
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7,0.03,[What We Think Future Prediction Throughout hi...,8.38,578,0.85,3.69,8.38,679.0,1899
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5,0.02,[What We Think Entertainment Future Introducti...,10.67,564,0.93,4.64,10.67,608.0,1854
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64,-0.08,[Entertainment Broadcasting What We Think Futu...,11.29,525,0.91,4.88,11.29,576.0,1730
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64,0.03,[Entertainment Broadcasting What We Think Futu...,8.76,301,0.9,3.86,8.76,333.0,943


* Calculated Average word Length with the given formula- Sum of the total number of characters in each word/Total number of words

In [64]:
def average(text):
    x = []
    for i in text:
        for j in i:
            x.append(j)
    return len(x)/len(a)
data['AVG word per length'] = np.round(data['Contt'].apply(average),2)

In [65]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Sentences,Average Sentence Length,Complex Word Count,Percentage of Complex Words,FOG Index,Average number of sentence,Word Count,Syllable Per Word,AVG word per length
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55,0.05,[Entertainment Broadcasting Future Forecasts W...,6.14,383,0.81,2.78,6.14,473.0,1154,446.14
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7,0.03,[What We Think Future Prediction Throughout hi...,8.38,578,0.85,3.69,8.38,679.0,1899,725.43
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5,0.02,[What We Think Entertainment Future Introducti...,10.67,564,0.93,4.64,10.67,608.0,1854,699.86
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64,-0.08,[Entertainment Broadcasting What We Think Futu...,11.29,525,0.91,4.88,11.29,576.0,1730,659.71
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64,0.03,[Entertainment Broadcasting What We Think Futu...,8.76,301,0.9,3.86,8.76,333.0,943,358.0


* Calculated Personal pronouns by checking each word from the given list in the below function in each article
**NOTE- Using Content as under Contt column all the pronouns have been removed as they were in stopwords list**

In [66]:
from nltk.stem import WordNetLemmatizer
lem= WordNetLemmatizer()
def text_corpuss(x):
    string_format= str(x).lower()
    Lower_words = re.sub('[^a-zA-Z]+', ' ', string_format).strip()
    token= word_tokenize(Lower_words)
    #p= re.compile(r'<.*?>')
    #cleaned_data= p.sub('',token)
    lemmatized= [lem.lemmatize(w) for w in token]
    return lemmatized

In [67]:
data['Contt1']= data['Content'].apply(text_corpuss)

In [68]:
def personal_pronouns(text):
    personal_pronouns = ['I','i','you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', \
                         'my', 'your', 'his', 'her', 'its', 'our', 'their', 'mine', 'yours', 'hers', 'ours', 'theirs']
    b=[]
    c=[]
    for j in text:
        if j =='US':
            b.append(j)
        elif j in personal_pronouns:
            c.append(j)
    return len(c)
data['Personal Pronouns'] = data['Contt1'].apply(personal_pronouns)

In [69]:
data.head()

Unnamed: 0,URL_ID,URL,Title,Content,Contt,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Sentences,Average Sentence Length,Complex Word Count,Percentage of Complex Words,FOG Index,Average number of sentence,Word Count,Syllable Per Word,AVG word per length,Contt1,Personal Pronouns
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,Rising IT cities and its impact on the economy...,Entertainment Broadcasting Future Forecasts Wh...,"[entertainment, broadcasting, future, forecast...",28,-6,1.55,0.05,[Entertainment Broadcasting Future Forecasts W...,6.14,383,0.81,2.78,6.14,473.0,1154,446.14,"[entertainment, broadcasting, future, forecast...",68
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,Rising IT Cities and Their Impact on the Econo...,What We Think Future Prediction Throughout his...,"[future, prediction, history, industrial, revo...",54,-31,3.7,0.03,[What We Think Future Prediction Throughout hi...,8.38,578,0.85,3.69,8.38,679.0,1899,725.43,"[what, we, think, future, prediction, througho...",30
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,"Internet Demand’s Evolution, Communication Imp...",What We Think Entertainment Future Introductio...,"[entertainment, future, introduction, span, de...",39,-27,5.5,0.02,[What We Think Entertainment Future Introducti...,10.67,564,0.93,4.64,10.67,608.0,1854,699.86,"[what, we, think, entertainment, future, intro...",28
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,Rise of Cybercrime and its Effect in upcoming ...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, forecast...",36,-80,-2.64,-0.08,[Entertainment Broadcasting What We Think Futu...,11.29,525,0.91,4.88,11.29,576.0,1730,659.71,"[entertainment, broadcasting, what, we, think,...",20
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,OTT platform and its impact on the entertainme...,Entertainment Broadcasting What We Think Futur...,"[entertainment, broadcasting, future, poised, ...",20,-9,2.64,0.03,[Entertainment Broadcasting What We Think Futu...,8.76,301,0.9,3.86,8.76,333.0,943,358.0,"[entertainment, broadcasting, what, we, think,...",22


* Dropping the columns which are not needed anymore

In [70]:
data.drop(['Title', 'Content', 'Contt','Sentences','Contt1'],axis=1,inplace=True)

In [71]:
data.head()

Unnamed: 0,URL_ID,URL,Positive Words,Negative Words,Polarity Score,Subjectivity Score,Average Sentence Length,Complex Word Count,Percentage of Complex Words,FOG Index,Average number of sentence,Word Count,Syllable Per Word,AVG word per length,Personal Pronouns
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,28,-6,1.55,0.05,6.14,383,0.81,2.78,6.14,473.0,1154,446.14,68
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,54,-31,3.7,0.03,8.38,578,0.85,3.69,8.38,679.0,1899,725.43,30
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,39,-27,5.5,0.02,10.67,564,0.93,4.64,10.67,608.0,1854,699.86,28
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,36,-80,-2.64,-0.08,11.29,525,0.91,4.88,11.29,576.0,1730,659.71,20
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,20,-9,2.64,0.03,8.76,301,0.9,3.86,8.76,333.0,943,358.0,22


In [72]:
data.to_excel('Output Data Structure.xlsx', encoding='utf-8')