In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import nltk
import string
import numpy as np
import re

In [2]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

## 1. Objective

### The objective of this assignment is to extract textual data articles from the given URL and perform text analysis to compute variables that are explained below. 


#### Reading input file containing web links and it's attributes

In [3]:
df = pd.read_excel('Input.xlsx')

In [4]:
df.head()

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


In [5]:
df.shape

(114, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL_ID  114 non-null    int64 
 1   URL     114 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB


In [7]:
df.isnull().sum()

URL_ID    0
URL       0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   URL_ID  114 non-null    int64 
 1   URL     114 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB


## 2. Data Extraction
#### Input.xlsx
#### For each of the articles, given in the input.xlsx file, extract the article text and save the extracted article in a text file with URL_ID as its file name.
#### While extracting text, please make sure your program extracts only the article title and the article text. It should not extract the website header, footer, or anything other than the article text. 


### Data Extraction for a single link
#### - To check whether required data is getting extracted or not.

In [9]:
url = df['URL'][0]

In [10]:
content = requests.get(url).content

In [11]:
soup = BeautifulSoup(content,'lxml')

In [14]:
print(soup.prettify())

<!DOCTYPE html>
<!--[if IE 8]>    <html class="ie8" lang="en"> <![endif]-->
<!--[if IE 9]>    <html class="ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!-->
<html lang="en-US">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <link href="https://insights.blackcoffer.com/xmlrpc.php" rel="pingback"/>
  <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots"/>
  <meta content="https://insights.blackcoffer.com/wp-content/uploads/2021/06/h12-slider-image-2.jpg" property="og:image"/>
  <!-- This site is optimized with the Yoast SEO plugin v20.1 - https://yoast.com/wordpress/plugins/seo/ -->
  <title>
   AI in healthcare to Improve Patient Outcomes - Blackcoffer Insights
  </title>
  <meta content="By identifying the molecular patterns associated with disease status and its subtypes, gene expression, and protein abundance levels, machine learning methods can dete

In [15]:
title = soup.title.text

In [16]:
summary = soup.find('div',class_='td-post-content')

In [15]:
summary = [para.text for para in summary.find_all('p')]

In [16]:
summary = ' '.join([str(elem) for elem in summary])

In [17]:
summary = summary.replace('\xa0'," ")
summary = summary.replace('\n'," ")

In [18]:
summary

'Introduction “If anything kills over 10 million people in the next few decades, it will be a highly infectious virus rather than a war. Not missiles but microbes.” Bill Gates’s remarks at a TED conference in 2014, right after the world had avoided the Ebola outbreak. When the new, unprecedented, invisible virus hit us, it met an overwhelmed and unprepared healthcare system and oblivious population. This public health emergency demonstrated our lack of scientific consideration and underlined the alarming need for robust innovations in our health and medical facilities. For the past few years, artificial intelligence has proven to be of tangible potential in the healthcare sectors, clinical practices, translational medical and biomedical research. After the first case was detected in China on December 31st 2019, it was an AI program developed by BlueDot that alerted the world about the pandemic. It was quick to realise AI’s ability to analyse large chunks of data could help in detecting

### Data extraction from all links
#### - Since, the code is working fine for one link, we'll loop it for all the links

In [19]:
csv_file = open('URL_ID.csv','w',encoding='utf-8',newline='')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['TEXT'])

for i in range(0,len(df)):
    
    try:
        
        url = df.loc[i,'URL']
    
        content = requests.get(url).content

        soup = BeautifulSoup(content,'lxml')

        title = soup.title.text
        print(title)

        summary = soup.find('div',class_='td-post-content')
        summary = [para.text for para in summary.find_all('p')]
        summary = ' '.join([str(elem) for elem in summary])
        summary = summary.replace('\xa0'," ")
        summary = summary.replace('\n'," ")
    
    except Exception as e:
        summary = ''
    
    text = title+" "+summary
    csv_writer.writerow([text])    

csv_file.close()

AI in healthcare to Improve Patient Outcomes - Blackcoffer Insights
What if the Creation is Taking Over the Creator? - Blackcoffer Insights
What Jobs Will Robots Take From Humans in The Future? - Blackcoffer Insights
Will Machine Replace The Human in the Future of Work? - Blackcoffer Insights
Will AI Replace Us or Work With Us? - Blackcoffer Insights
Will machine replace the human in the future of work? - Blackcoffer Insights
How humans and machines are evolving to work together? - Blackcoffer Insights
Page not found - Blackcoffer Insights
How machine learning will affect your business? - Blackcoffer Insights
Deep learning impact on areas of e-learning? - Blackcoffer Insights
How to protect future data and its privacy? - Blackcoffer Insights
How Machines, AI, Automations, and Robo-human are Effective in Finance and Banking? - Blackcoffer Insights
How Robo Human will Impact the Future? - Blackcoffer Insights
How AI will change the World? - Blackcoffer Insights
Future of Work: How AI Has

Business Analytics In The Healthcare Industry - Blackcoffer Insights
Challenges and Opportunities of Big Data in Healthcare - Blackcoffer Insights


# Text Analysis

## 1. Sentiment Analysis

#### Loading extracted data

In [20]:
# Reading the file with extracted texts
text = pd.read_csv('URL_ID.csv')
text.head()

Unnamed: 0,TEXT
0,AI in healthcare to Improve Patient Outcomes -...
1,What if the Creation is Taking Over the Creato...
2,What Jobs Will Robots Take From Humans in The ...
3,Will Machine Replace The Human in the Future o...
4,Will AI Replace Us or Work With Us? - Blackcof...


### 1.1 Cleaning using Stop Words Lists

In [21]:
#importing stop words files that are provided
StopWords_Auditor = list(pd.read_csv("StopWords/StopWords_Auditor.txt",header=None,encoding='cp1252',sep='|')[0])
StopWords_Currencies = list(pd.read_csv("StopWords/StopWords_Currencies.txt",header=None,encoding="cp1252",on_bad_lines='skip',sep='|')[0])#
StopWords_DatesandNumbers = list(pd.read_csv("StopWords/StopWords_DatesandNumbers.txt",header=None,encoding='cp1252',sep='|')[0])
StopWords_Generic = list(pd.read_csv("StopWords/StopWords_Generic.txt",header=None,encoding='cp1252',sep='|')[0])
StopWords_GenericLong = list(pd.read_csv("StopWords/StopWords_GenericLong.txt",header=None,encoding='cp1252',sep='|')[0])
StopWords_Geographic = list(pd.read_csv("StopWords/StopWords_Geographic.txt",header=None,encoding='cp1252',sep='|')[0])
StopWords_Names = list(pd.read_csv("StopWords/StopWords_Names.txt",header=None,encoding='cp1252',sep='|')[0])

In [22]:
StopWords_Auditor = [str(word).lower() for word in StopWords_Auditor]
StopWords_Currencies = [str(word).lower() for word in StopWords_Currencies]
StopWords_DatesandNumbers = [str(word).lower() for word in StopWords_DatesandNumbers]
StopWords_Generic = [str(word).lower() for word in StopWords_Generic]
StopWords_GenericLong = [str(word).lower() for word in StopWords_GenericLong]
StopWords_Geographic = [str(word).lower() for word in StopWords_Geographic]
StopWords_Names = [str(word).lower() for word in StopWords_Names]

In [23]:
#creating func for removing stop words only
def remove_stopwords(text):
    txt=' '.join([word for word in text.split() if word.lower() not in StopWords_Auditor])
    txt1=' '.join([word for word in txt.split() if word.lower() not in StopWords_Currencies])
    txt2=' '.join([word for word in txt1.split() if word.lower() not in StopWords_DatesandNumbers])
    txt3=' '.join([word for word in txt2.split() if word.lower() not in StopWords_Generic])
    txt4=' '.join([word for word in txt3.split() if word.lower() not in StopWords_GenericLong])
    txt5=' '.join([word for word in txt4.split() if word.lower() not in StopWords_Geographic])
    txt6=' '.join([word for word in txt5.split() if word.lower() not in StopWords_Names])
    return txt6

In [24]:
text['TEXT_StopWords_removed'] = text['TEXT'].apply(remove_stopwords)

In [25]:
text['TEXT_StopWords_removed']

0      healthcare Improve Patient Outcomes - Blackcof...
1      Creation Taking Creator? - Blackcoffer Insight...
2      Jobs Robots Humans Future? - Blackcoffer Insig...
3      Machine Replace Human Future Work? - Blackcoff...
4      Replace Work Us? - Blackcoffer Insights “Machi...
5      machine replace human future work? - Blackcoff...
6      humans machines evolving work together? - Blac...
7                           found - Blackcoffer Insights
8      machine learning affect business? - Blackcoffe...
9      Deep learning impact areas e-learning? - Black...
10     protect future data privacy? - Blackcoffer Ins...
11     Machines, AI, Automations, Robo-human Effectiv...
12     Robo Human Impact Future? - Blackcoffer Insigh...
13     change World? - Blackcoffer Insights work dest...
14     Future Work: Entered Workplace - Blackcoffer I...
15     machine learning finance banking? - Blackcoffe...
16     impact future work? - Blackcoffer Insights exp...
17     online marketing - Black

### 1.2 Creating a dictionary of Positive and Negative words

In [26]:
# Importing given dictionary for reference
positive = pd.read_csv('MasterDictionary/positive-words.txt',header=None,encoding='cp1252',skip_blank_lines=True)[0]
negative = pd.read_csv('MasterDictionary/negative-words.txt',header=None,encoding='cp1252',skip_blank_lines=True)[0]

In [27]:
positive = [word for word in positive]
negative = [word for word in negative]

### 1.3	Extracting Derived variables

In [28]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [29]:
sentences = [sent_tokenize(record) for record in text['TEXT_StopWords_removed']]
count_sentences = [len(record) for record in sentences]
count_sentences_arr = np.array(count_sentences)

In [30]:
words = [word for word in text['TEXT_StopWords_removed']]
count_words = [len(word) for word in words]
count_words_arr = np.array(count_words)

#### Positive Score

In [31]:
positive_score = []
for record in text['TEXT_StopWords_removed']:
    score = 0
    for word in record.split():
        if( word in positive):
            score+=1
    positive_score.append(score)

In [32]:
positive_score_arr = np.array(positive_score)

#### Negative Score

In [33]:
negative_score = []
for record in text['TEXT_StopWords_removed']:
    score = 0
    for word in record.split():
        if( word in negative):
            score+=-1
    negative_score.append(-1*score)

In [34]:
negative_score_arr = np.array(negative_score)

#### Polarity Score
##### Polarity Score = (Positive Score – Negative Score)/ ((Positive Score + Negative Score) + 0.000001)


In [35]:
Polarity_Score=(positive_score_arr-negative_score_arr)/((positive_score_arr+negative_score_arr)+0.000001)
print('polarity_score=', Polarity_Score)

polarity_score= [ 0.35064935  0.2112676   0.31764706  0.43999999  0.36842105  0.33333333
  0.30434781  0.          0.48571427  0.34210526  0.02777778  0.26829268
  0.          0.36111111  0.42857142  0.99999994  0.32727272  0.99999993
  0.37499998 -0.99999967  0.         -0.46341463  0.21739129  0.31428571
  0.94117644  0.99999967  0.54545452  0.1111111   0.33333331  0.99999975
 -0.22222222  0.33333328  0.64102562 -0.66666656 -0.49999998 -0.21311475
  0.21428571  0.10526316  0.80952377  0.73684209  0.45454543  0.54999999
 -0.5522388   0.43999999 -0.19999999  0.15151515 -0.63380281  0.45454543
  0.24999997  0.25714285 -0.22222222  0.23076922 -0.03333333 -0.27536231
  0.33333322  0.70370368 -0.05882353  0.42857142 -0.10638298 -0.46153845
 -0.52173912 -0.11111111 -0.12195122 -0.27272727  0.         -0.25925925
 -0.67999997 -0.30434782 -0.19230769  0.          0.66666661 -0.22388059
  0.07692307 -0.68749998 -0.37142856 -0.43396226 -0.09677419 -0.02631579
 -0.37037036 -0.09090909 -0.3947368

#### SUBJECTIVITY SCORE
##### Subjectivity Score = (Positive Score + Negative Score)/ ((Total Words after cleaning) + 0.000001)


In [36]:
Subjectivity_score=(positive_score_arr+negative_score_arr)/((count_words_arr)+ 0.000001)
print('subjectivity_score',Subjectivity_score)

subjectivity_score [0.00882724 0.01398739 0.01130169 0.00923191 0.00842323 0.00948567
 0.00784447 0.         0.01338944 0.00944451 0.00917782 0.00780506
 0.00803859 0.01287554 0.00908354 0.00743332 0.01898188 0.00457816
 0.00489596 0.00413793 0.         0.0147138  0.00846522 0.01055011
 0.01258793 0.00205198 0.00740242 0.00845467 0.01261388 0.00548697
 0.01261388 0.00914634 0.01192296 0.01136364 0.02352941 0.00812467
 0.00962034 0.00708427 0.0078329  0.00878816 0.0148448  0.00613591
 0.01928058 0.01374004 0.00948509 0.00718641 0.01970033 0.00934579
 0.0071048  0.0112     0.00761744 0.00190434 0.00924927 0.01024043
 0.00628931 0.00841384 0.00835929 0.00760988 0.01150269 0.007136
 0.01202614 0.01444292 0.01042992 0.00821355 0.00671141 0.01328413
 0.00946611 0.0145202  0.01489971 0.         0.00622407 0.01751176
 0.02053712 0.00759734 0.00851996 0.01166117 0.01031957 0.01330532
 0.00886117 0.01778496 0.01059972 0.01691065 0.01061836 0.01259364
 0.01713894 0.01151543 0.01387064 0.01446945 

## 2. Analysis of Readability

#### AVG SENTENCE LENGTH
##### Average Sentence Length = the number of words / the number of sentences

In [37]:
Avg_sent_length = count_words_arr/count_sentences_arr
Avg_sent_length

array([116.30666667,  64.25316456,  88.48235294,  58.86956522,
        87.88311688,  84.71428571,  65.15555556,  28.        ,
        74.68571429,  99.34567901,  86.20879121,  90.56896552,
        63.79487179,  91.67213115,  79.03846154, 103.95454545,
       643.88888889,  67.95555556,  81.7       , 103.57142857,
        28.        , 101.32727273,  93.68965517,  60.87155963,
        62.81395349,  91.375     ,  87.41176471, 112.05263158,
        52.85185185,  72.9       ,  51.89090909,  72.88888889,
       105.51612903,  88.        ,  54.09090909, 101.45945946,
        86.88059701,  65.41463415,  43.95081967,  65.51515152,
        92.625     ,  94.47826087,  45.72368421,  47.25974026,
        85.81395349,  76.53333333,  49.36986301,  81.17241379,
       102.36363636, 135.86956522,  84.39285714,  70.74093264,
        79.1097561 , 114.20338983,  68.14285714,  82.28205128,
        50.84166667,  71.54444444,  88.82608696, 127.84210526,
        79.6875    ,  59.75342466, 109.19444444,  97.4 

#### PERCENTAGE OF COMPLEX WORDS
##### Percentage of Complex words = the number of complex words / the number of words *100

In [38]:
count_complex_words = []
for record in text['TEXT_StopWords_removed']:
    count = 0
    for word in record.split():
        d = {}.fromkeys('aeiou',0)
        haslotsvowels = False
        for x in word.lower():
            if x in d:
                d[x] += 1
        for q in d.values():
            if q > 2:
                haslotsvowels = True
        if haslotsvowels:
            count += 1
    count_complex_words.append(count)

In [39]:
count_complex_words_arr = np.array(count_complex_words)

In [40]:
percentage_complex_words = count_complex_words_arr/count_words*100

#### FOG Index
##### Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

In [41]:
Fog_Index = 0.4 * (Avg_sent_length + percentage_complex_words)
print('fog index= ',Fog_Index )

fog index=  [ 46.80697252  26.05587575  35.72268456  24.0057286   35.51973116
  34.07964346  26.29414582  11.2         30.04261012  40.09119816
  34.68236926  36.60070633  25.82341496  37.0408124   32.0176555
  41.86166077 257.79024063  27.26070489  32.94927785  41.53891626
  11.2         40.8754273   37.77030447  24.81885746  25.51062397
  36.65943912  35.48960494  45.1968159   21.53317242  29.26973937
  20.93856406  29.21653117  42.68336999  35.42727273  22.03972498
  40.91942577  34.99961898  26.49396701  17.81904477  26.45582934
  37.4548583   38.1778667   18.61177584  19.17869687  34.42314237
  30.99660859  20.08090858  32.77482788  41.08755046  54.73182609
  33.95181065  28.48094788  31.90904811  45.93068808  27.59257263
  33.04993488  20.69726329  28.89111215  35.88285769  51.49913111
  32.05277778  24.20398334  44.00339467  39.20640657  26.75734526
  27.44440344  30.39489804  39.87777778  37.09214296  55.09813953
  27.70883225  49.5768376   25.42531859  36.81602048  31.83369036

## 3.Average Number of Words Per Sentence
##### Average Number of Words Per Sentence = the total number of words / the total number of sentences

In [42]:
Avg_words_per_sentance = count_words_arr/count_sentences_arr
Avg_words_per_sentance

array([116.30666667,  64.25316456,  88.48235294,  58.86956522,
        87.88311688,  84.71428571,  65.15555556,  28.        ,
        74.68571429,  99.34567901,  86.20879121,  90.56896552,
        63.79487179,  91.67213115,  79.03846154, 103.95454545,
       643.88888889,  67.95555556,  81.7       , 103.57142857,
        28.        , 101.32727273,  93.68965517,  60.87155963,
        62.81395349,  91.375     ,  87.41176471, 112.05263158,
        52.85185185,  72.9       ,  51.89090909,  72.88888889,
       105.51612903,  88.        ,  54.09090909, 101.45945946,
        86.88059701,  65.41463415,  43.95081967,  65.51515152,
        92.625     ,  94.47826087,  45.72368421,  47.25974026,
        85.81395349,  76.53333333,  49.36986301,  81.17241379,
       102.36363636, 135.86956522,  84.39285714,  70.74093264,
        79.1097561 , 114.20338983,  68.14285714,  82.28205128,
        50.84166667,  71.54444444,  88.82608696, 127.84210526,
        79.6875    ,  59.75342466, 109.19444444,  97.4 

## 4.Complex Word Count

In [43]:
count_complex_words = []
for record in text['TEXT_StopWords_removed']:
    count = 0
    for word in record.split():
        d = {}.fromkeys('aeiou',0)
        haslotsvowels = False
        for x in word.lower():
            if x in d:
                d[x] += 1
        for q in d.values():
            if q > 2:
                haslotsvowels = True
        if haslotsvowels:
            count += 1
    count_complex_words.append(count)

In [44]:
count_complex_words_arr = np.array(count_complex_words)

## 5.Word count

In [45]:
from nltk.corpus import stopwords

In [46]:
nltk_stopwords = stopwords.words('english')

In [47]:
punc = [punc for punc in string.punctuation]

In [48]:
#creating func for removing stop words,punctuations and converting to words
def remove_punc_stopwords_nltk(text):
    nopunc =[char for char in text if char not in punc]
    nopunc=''.join(nopunc)
    txt=' '.join([word for word in nopunc.split() if word.lower() not in nltk_stopwords])
    return txt

In [49]:
text['TEXT_cleaned'] = text['TEXT_StopWords_removed'].apply(remove_punc_stopwords_nltk)

In [50]:
words = [word for word in text['TEXT_cleaned']]
count_words = [len(word.split()) for word in words]
count_words_arr = np.array(count_words)

## 6. Syllable count per word

In [51]:
syllable_count = []
vowels=['a','e','i','o','u']
for record in text['TEXT_StopWords_removed']:
    count=0
    for i in record:
        x=re.compile('[es|ed]$')
        if x.match(i.lower()):
            count+=0
        else:
            for j in i:
                if(j.lower() in vowels ):
                    count+=1
    syllables = count
    syllable_count.append(syllables)

In [52]:
syllable_count_arr = np.array(syllable_count)

In [53]:
syllable_count_per_word = np.round(syllable_count_arr/count_words_arr,decimals=4)

In [54]:
syllable_count_per_word

array([1.9979, 1.8735, 2.0835, 1.8434, 1.8333, 1.9014, 1.9238, 2.    ,
       1.6728, 1.8629, 1.7247, 2.1684, 1.8517, 2.    , 1.8352, 1.9841,
       1.7748, 1.7458, 1.8511, 2.2025, 2.    , 1.8427, 1.8831, 1.8974,
       1.8239, 1.7189, 1.5794, 1.9798, 2.0654, 2.0247, 1.6182, 1.8101,
       1.8989, 1.8254, 1.8382, 1.9435, 1.8189, 1.7553, 1.9137, 1.6641,
       1.908 , 1.9073, 1.7646, 1.5043, 1.6211, 1.8154, 1.7671, 1.6351,
       1.6788, 1.7335, 1.7504, 1.5283, 1.7828, 1.9046, 1.7288, 1.648 ,
       1.9827, 1.7935, 1.8221, 1.7459, 1.8531, 1.7966, 1.9372, 1.8665,
       1.6667, 1.6714, 1.7383, 1.8822, 1.7476, 1.9383, 1.6129, 1.8668,
       1.622 , 1.6984, 1.9872, 1.8378, 1.8475, 1.7986, 1.7226, 1.4322,
       1.7631, 1.6333, 2.0492, 1.8105, 1.7176, 1.792 , 1.7628, 1.7239,
       1.8556, 1.7081, 1.8253, 1.9843, 1.7872, 1.7705, 1.8464, 1.9468,
       2.0812, 2.0739, 1.5175, 1.8466, 1.6481, 2.0185, 1.8265, 1.6782,
       1.724 , 1.8991, 1.8706, 2.    , 1.902 , 1.8268, 1.8253, 1.8669,
      

## 7. Personal Pronouns

In [56]:
count_pronouns = []
pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us)|(?-i:Us))\b',re.I)
for record in text['TEXT']:
    pronouns = pronounRegex.findall(record)
    count_pronouns.append(len(pronouns))

In [57]:
count_pronouns

[1,
 7,
 3,
 17,
 18,
 21,
 7,
 0,
 2,
 11,
 1,
 10,
 6,
 32,
 11,
 0,
 15,
 0,
 1,
 2,
 0,
 2,
 4,
 10,
 4,
 0,
 4,
 1,
 0,
 1,
 19,
 1,
 0,
 4,
 6,
 16,
 20,
 18,
 1,
 4,
 8,
 8,
 13,
 5,
 2,
 9,
 15,
 5,
 0,
 0,
 5,
 0,
 10,
 1,
 2,
 2,
 6,
 5,
 3,
 1,
 5,
 37,
 12,
 3,
 2,
 13,
 4,
 4,
 7,
 0,
 4,
 3,
 42,
 0,
 0,
 3,
 8,
 4,
 3,
 46,
 11,
 7,
 1,
 6,
 36,
 12,
 11,
 9,
 2,
 3,
 7,
 4,
 1,
 3,
 0,
 8,
 0,
 4,
 7,
 5,
 3,
 8,
 0,
 0,
 3,
 1,
 2,
 0,
 2,
 9,
 2,
 2,
 0,
 8]

## 8. AVERAGE WORD LENGTH
##### AVERAGE WORD LENGTH = Sum of the total number of characters in each word/Total number of words

In [58]:
count_characters = np.array([len(record) for record in text['TEXT_cleaned']])

In [59]:
average_word_length = (count_characters/count_words_arr)

In [60]:
average_word_length

array([8.67487179, 8.06913997, 8.66945107, 8.03255814, 8.17676768,
       8.09139785, 8.1143695 , 8.66666667, 7.77160494, 8.19514768,
       8.04086022, 8.73883162, 7.88632619, 8.30461538, 8.23324022,
       8.79761905, 8.11473088, 8.17877095, 8.83988764, 8.56962025,
       8.66666667, 8.11649017, 8.01538462, 8.28947368, 8.16037736,
       7.68108108, 7.54497354, 8.23790323, 8.8627451 , 8.72839506,
       7.63247863, 7.79746835, 8.65027322, 7.96825397, 8.10294118,
       8.70552885, 7.79387187, 7.76899696, 8.00958466, 8.0546875 ,
       8.18965517, 8.44354839, 7.93203883, 7.41452991, 7.47789474,
       8.45192308, 7.96705882, 7.91929825, 7.89781022, 8.29945055,
       7.96637168, 7.38843931, 8.15474642, 8.28753181, 7.59322034,
       7.77295918, 8.37266187, 8.0627451 , 7.98159509, 8.14965197,
       8.06578947, 8.        , 8.26406926, 7.98757764, 7.49333333,
       7.89516129, 7.73831776, 8.39855072, 7.79716981, 8.13580247,
       7.40725806, 8.33182844, 7.27235772, 7.92023346, 8.44468

In [63]:
Output_Data = pd.DataFrame({"URL_ID":df['URL_ID'],\
                            "URL":df['URL'],\
                            "POSITIVE SCORE":positive_score_arr,\
                            "NEGATIVE SCORE":negative_score_arr,\
                            "POLARITY SCORE":Polarity_Score,\
                            "SUBJECTIVITY SCORE":Subjectivity_score,\
                            "AVG SENTENCE LENGTH":Avg_sent_length,\
                            "PERCENTAGE OF COMPLEX WORDS":percentage_complex_words,\
                            "FOG INDEX":Fog_Index,\
                            "AVG NUMBER OF WORDS PER SENTENCE":Avg_words_per_sentance,\
                            "COMPLEX WORD COUNT":count_complex_words_arr,\
                            "WORD COUNT":count_words_arr,\
                            "SYLLABLE PER WORD":syllable_count_per_word,\
                            "PERSONAL PRONOUNS":count_pronouns,\
                            "AVG WORD LENGTH":average_word_length
                           })

In [64]:
Output_Data

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,52,25,0.350649,0.008827,116.306667,0.710765,46.806973,116.306667,62,975,1.9979,1,8.674872
1,38,https://insights.blackcoffer.com/what-if-the-c...,43,28,0.211268,0.013987,64.253165,0.886525,26.055876,64.253165,45,593,1.8735,7,8.06914
2,39,https://insights.blackcoffer.com/what-jobs-wil...,56,29,0.317647,0.011302,88.482353,0.824358,35.722685,88.482353,62,838,2.0835,3,8.669451
3,40,https://insights.blackcoffer.com/will-machine-...,36,14,0.44,0.009232,58.869565,1.144756,24.005729,58.869565,62,645,1.8434,17,8.032558
4,41,https://insights.blackcoffer.com/will-ai-repla...,39,18,0.368421,0.008423,87.883117,0.916211,35.519731,87.883117,62,792,1.8333,18,8.176768
