## Import Library

In [47]:
import plotly.graph_objs
import requests
import pandas as pd
from bs4 import BeautifulSoup
import string
import re
import urllib

# Text Analysis
## Scrap Data

In [48]:
url="https://insights.blackcoffer.com/how-is-login-logout-time-tracking-for-employees-in-office-done-by-ai/"

In [49]:
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
# Here the user agent is for Edge browser on windows 10. You can find your browser user agent from the above given link.
r = requests.get(url=url, headers=headers)

In [50]:
soup = BeautifulSoup(r.content, 'html5lib')


### Extract title from artical

In [51]:
title=soup.find('h1',class_="entry-title")
title=title.text.replace('\n'," ")
title

'How is Login Logout Time Tracking for Employees in Office done by AI?'

### Extract content from articel

In [52]:
content=soup.findAll(attrs={'class':'td-post-content'})
content=content[0].text.replace('\n'," ")
content

'          When people hear AI they often think about sentient robots and magic boxes. AI today is much more mundane and simple—but that doesn’t mean it’s not powerful. Another misconception is that high-profile research projects can be applied directly to any business situation. AI done right can create an extreme return on investments (ROIs)—for instance through automation or precise prediction. But it does take thought, time, and proper implementation. We have seen that success and value generated by AI projects are increased when there is a grounded understanding and expectation of what the technology can deliver from the C-suite down.    “Artificial Intelligence (AI) is a science and a set of computational technologies that are inspired by—but typically operate quite differently from—the ways people use their nervous systems and bodies to sense, learn, reason and take action.”3 Lately there has been a big rise in the day-to-day use of machines powered by AI. These machines are wir

### Remove punctuation from the content

In [53]:
content = content.translate(str.maketrans('', '', string.punctuation))
content

'          When people hear AI they often think about sentient robots and magic boxes AI today is much more mundane and simple—but that doesn’t mean it’s not powerful Another misconception is that highprofile research projects can be applied directly to any business situation AI done right can create an extreme return on investments ROIs—for instance through automation or precise prediction But it does take thought time and proper implementation We have seen that success and value generated by AI projects are increased when there is a grounded understanding and expectation of what the technology can deliver from the Csuite down    “Artificial Intelligence AI is a science and a set of computational technologies that are inspired by—but typically operate quite differently from—the ways people use their nervous systems and bodies to sense learn reason and take action”3 Lately there has been a big rise in the daytoday use of machines powered by AI These machines are wired using crossdiscip

### Convert Tokens

In [54]:
from nltk.tokenize import word_tokenize
text_tokens = word_tokenize(content)
print(text_tokens[0:50])

['When', 'people', 'hear', 'AI', 'they', 'often', 'think', 'about', 'sentient', 'robots', 'and', 'magic', 'boxes', 'AI', 'today', 'is', 'much', 'more', 'mundane', 'and', 'simple—but', 'that', 'doesn', '’', 't', 'mean', 'it', '’', 's', 'not', 'powerful', 'Another', 'misconception', 'is', 'that', 'highprofile', 'research', 'projects', 'can', 'be', 'applied', 'directly', 'to', 'any', 'business', 'situation', 'AI', 'done', 'right', 'can']


### Remove stopwords from the tokens

In [55]:
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

my_stop_words = stopwords.words('english')
my_stop_words.append('the')
no_stop_tokens = [word for word in text_tokens if not word in my_stop_words]
print(no_stop_tokens[0:40])

['When', 'people', 'hear', 'AI', 'often', 'think', 'sentient', 'robots', 'magic', 'boxes', 'AI', 'today', 'much', 'mundane', 'simple—but', '’', 'mean', '’', 'powerful', 'Another', 'misconception', 'highprofile', 'research', 'projects', 'applied', 'directly', 'business', 'situation', 'AI', 'done', 'right', 'create', 'extreme', 'return', 'investments', 'ROIs—for', 'instance', 'automation', 'precise', 'prediction']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Check for positive words

In [56]:
with open("positive-words.txt",'r') as pos_word:
     pos_words = pos_word.read().split("\n")
     pos_words = pos_words[5:]

### Positive Score

In [57]:
pos_count = " ".join([w for w in no_stop_tokens if w in pos_words])
pos_count = pos_count.split(" ")
Positive_score=len(pos_count)
print(Positive_score)

16


### Check for negative words

In [58]:
with open("negative-words.txt","r") as neg:
   negwords = neg.read().split("\n")

negwords = negwords[36:]
neg_count = " ".join ([w for w in no_stop_tokens if w in negwords])
neg_count=neg_count.split(" ")

### NEgative Score

In [59]:
Negative_score=len(neg_count)
print(Negative_score)

9


In [60]:
filter_content = ' '.join(no_stop_tokens)

In [61]:
data=[[url,title,content,filter_content,Positive_score,Negative_score]]

In [62]:
data=pd.DataFrame(data,columns=["url","title","content","filter_content","Positive_Score","Negative_Score"])

### Calculate Polarity Score and Subjectivity Score

In [63]:
from textblob import TextBlob
def sentiment_analysis(data):
   sentiment=TextBlob(data["content"]).sentiment
   return pd.Series([sentiment.polarity,sentiment.subjectivity])
data[["polarity","subjectivity"]]=data.apply(sentiment_analysis,axis=1)
data

Unnamed: 0,url,title,content,filter_content,Positive_Score,Negative_Score,polarity,subjectivity
0,https://insights.blackcoffer.com/how-is-login-...,How is Login Logout Time Tracking for Employee...,When people hear AI they often think...,When people hear AI often think sentient robot...,16,9,0.14304,0.478514


# Average Sentence length

In [64]:
AVG_SENTENCE_LENGTH = len(content.replace(' ',''))/len(re.split(r'[?!.]', content))
print('Word average =', AVG_SENTENCE_LENGTH)

Word average = 3673.0


## FOG Index

In [65]:
import textstat
FOG_index=(textstat.gunning_fog(content))
print(FOG_index)

289.41


## AVG Number of words per Sentence

In [66]:
AVG_NUMBER_OF_WORDS_PER_SENTENCE = [len(l.split()) for l in re.split(r'[?!.]', content) if l.strip()]
AVG_NUMBER_OF_WORDS_PER_SENTENCE=print(sum(AVG_NUMBER_OF_WORDS_PER_SENTENCE)/len(AVG_NUMBER_OF_WORDS_PER_SENTENCE))

712.0


## Complex word Count

In [67]:
def syllable_count(word):
    count = 0
    vowels = "AEIOUYaeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
            if word.endswith("es"or "ed"):
                count -= 1
    if count == 0:
        count += 1
    return count



COMPLEX_WORDS=syllable_count(content)
print(COMPLEX_WORDS)

1279


# Word Counts

In [68]:
Words_counts=len(content)
print(Words_counts)

4435


# Percentage of Complex words

In [69]:
pcw=(COMPLEX_WORDS/Words_counts)*100
print(pcw)

28.838782412626834


## Personal Pronouns

In [70]:
def ProperNounExtractor(text):
    count = 0
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        for (word, tag) in tagged:
            if tag == 'PRP': # If the word is a proper noun
                count+=1

    return(count)



# Calling the ProperNounExtractor function to extract all the proper nouns from the given text.
Personal_Pronouns=ProperNounExtractor(content)

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - 'C:\\Users\\ASUS/nltk_data'
    - 'C:\\Users\\ASUS\\anaconda3\\nltk_data'
    - 'C:\\Users\\ASUS\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\ASUS\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\ASUS\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


# Average Word Length

In [71]:
Average_Word_Length=len(content.replace(' ',''))/len(content.split())
print(Average_Word_Length)

5.158707865168539



# SYLLABLE PER WORD

In [72]:
word=content.replace(' ','')
syllable_count=0
for w in word:
      if(w=='a' or w=='e' or w=='i' or w=='o' or w=='y' or w=='u' or w=='A' or w=='E' or w=='I' or w=='O' or w=='U' or w=='Y'):
            syllable_count=syllable_count+1
print("The AVG number of syllables in the word is: ")
print(syllable_count/len(content.split()))

The AVG number of syllables in the word is: 
2.109550561797753


### For WORDCLOUD

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    # Set figure size
    plt.figure(figsize=(40, 30))
    # Display image
    plt.imshow(wordcloud)
    # No axis details
    plt.axis("off")
stopwords = STOPWORDS
stopwords.add('will')
wordcloud = WordCloud(width = 500, height = 500, background_color='black', max_words=100,colormap='Set2',stopwords=stopwords).generate(content)
# Plot
plot_cloud(wordcloud)

# Negative wordcloud

In [None]:
neg_review = " ".join ([w for w in neg_count if w in negwords])

wordcloud = WordCloud(width = 3000, height = 2000, background_color='black', max_words=100,colormap='Set2',stopwords=stopwords).generate(neg_review)
#Plot
plot_cloud(wordcloud)