In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import spacy
from textstat.textstat import textstatistics
import nltk
from nltk.corpus import stopwords,brown
from nltk.tokenize import word_tokenize
import openpyxl

nlp = spacy.load('en_core_web_sm')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [2]:
X = pd.read_excel('Input.xlsx')
X.head()

Unnamed: 0,URL_ID,URL
0,1.0,https://insights.blackcoffer.com/how-is-login-...
1,2.0,https://insights.blackcoffer.com/how-does-ai-h...
2,3.0,https://insights.blackcoffer.com/ai-and-its-im...
3,4.0,https://insights.blackcoffer.com/how-do-deep-l...
4,5.0,https://insights.blackcoffer.com/how-artificia...


In [3]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',}
def get_title(url):
    page = requests.get(url,headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    title = soup.title.get_text()
    return title

def get_raw_content(url):
    page = requests.get(url,headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    raw_content = soup.body.find_all(class_ = "td-post-content")[0].get_text()
    return str(raw_content)

def remove_stopwords(text):
    text = re.sub("[^-9A-Za-z ]", " " , text)
    tokens = word_tokenize(text)
    filtered = [ w for w in tokens if not w.lower() in stop_words]
    for j,i in enumerate(filtered):
        i = re.sub('[^-9A-Za-z ]','',i)
        filtered[j] = i
    for i in range(len(filtered)-1,-1,-1):
        if filtered[i] == '':
            filtered.pop(i)
    return filtered

master_dict = pd.read_csv('Master_Dictionary.csv')
master_dict["Word"] = master_dict["Word"].str.lower()
master_dict = master_dict[['Word','Negative','Positive']]
master_dict['P/N'] = master_dict['Positive'] - master_dict['Negative']
master_dict = master_dict[master_dict['P/N'] != 0]
master_dict = master_dict.drop(columns = ['Positive','Negative'])
master_dict['P/N'] = np.where(master_dict['P/N'] > 0, 1, 0)

def count_PN(li):
    positive = negative = 0
    for i in li:
        if i in master_dict['Word'].values:
            if master_dict.loc[master_dict['Word'] == i,'P/N'].values[0] == 1:
                positive += 1
            else:
                negative += 1
    return positive,negative

def make_sentence(text):
    sentences = nlp(text)
    return list(sentences.sents)

def word_count(text):
    sentences = make_sentence(text)
    count = 0
    for i in sentences:
        count += len(word_tokenize(i.text))
    return count

def sentence_count(text):
    sentences = make_sentence(text)
    count = len(sentences)
    return count

def avg_sentence_length(text):
    words = word_count(text)
    sentences = sentence_count(text)
    average_length = words/sentences
    return average_length

def count_syllables(word):
    count = textstatistics().syllable_count(word)
    return count

def avg_syllables(text):
    syllables = count_syllables(text)
    words = word_count(text)
    count = syllables/words
    return count

def hard_words(text):
    sentences = nlp(text)
    words = []
    sentences = make_sentence(text)
    for i in sentences:
        words += [str(token) for token in i]
    
    hard_words = set()

    for i in words:
        syllable_count = count_syllables(i)
        if i not in nlp.Defaults.stop_words and syllable_count >= 2:
            hard_words.add(i)
        
    return hard_words

def average_word_len(tokens):
    length = 0
    for i in tokens:
        length += len(i)
    count = length/len(tokens)
    return count

def Pronoun_count(text):
    token = nltk.word_tokenize(text)
    list_of_pronouns = ["i", "we", "my", "ours", "our","us"]
    tagged_token =  nltk.pos_tag(token)
    pronoun_count = 0
    for word,tag in tagged_token:
        if word.lower() in list_of_pronouns and (tag=='PRP' or tag=='PRP$'):
            pronoun_count += 1
    return pronoun_count

In [4]:
X['TITLE'] = X['URL'].apply(get_title)
X['Raw_Content'] = X['URL'].apply(get_raw_content)
X['Filtered_Content'] = X['Raw_Content'].apply(remove_stopwords)
X['URL_ID'] = X['URL_ID'].astype(int)
X.set_index('URL_ID',drop=True, inplace=True)
X.head()

Unnamed: 0_level_0,URL,TITLE,Raw_Content,Filtered_Content
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,https://insights.blackcoffer.com/how-is-login-...,How is Login Logout Time Tracking for Employee...,\nWhen people hear AI they often think about s...,"[people, hear, AI, often, think, sentient, rob..."
2,https://insights.blackcoffer.com/how-does-ai-h...,How does AI help to monitor Retail Shelf watch...,\nWith increasing computing power and more dat...,"[increasing, computing, power, data, potential..."
3,https://insights.blackcoffer.com/ai-and-its-im...,AI and its impact on the Fashion Industry – Bl...,\nIf you were a fan of the 90’s film Clueless ...,"[fan, 9, film, Clueless, back, day, remember, ..."
4,https://insights.blackcoffer.com/how-do-deep-l...,How do deep learning models predict old and ne...,"\nUnderstanding exactly how data is ingested, ...","[Understanding, exactly, data, ingested, analy..."
5,https://insights.blackcoffer.com/how-artificia...,How artificial intelligence can boost your pro...,"\nFrom the stone age to the modern world, from...","[stone, age, modern, world, hunting, gathering..."


In [5]:
X['P/N'] = X['Filtered_Content'].apply(count_PN)
X['POSITIVE SCORE'] = X['P/N'].apply(lambda x: x[0])
X['NEGATIVE SCORE'] = X['P/N'].apply(lambda x: x[1])
X['POLARITY'] = (X['POSITIVE SCORE'] - X['NEGATIVE SCORE'])/(X['POSITIVE SCORE'] + X['NEGATIVE SCORE'] + 0.000001)
X['SUBJECTIVITY'] = (X['POSITIVE SCORE'] + X['NEGATIVE SCORE'])/(len(X['Filtered_Content']) + 0.000001)
X['AVG SENTENCE LENGTH'] = X['Raw_Content'].apply(avg_sentence_length)
X['Complex Words'] = X['Raw_Content'].apply(hard_words)
X['Word Count Total'] = X['Raw_Content'].apply(word_count)
X['PERCENTAGE OF COMPLEX WORDS'] = X['Complex Words'].apply(lambda x: len(x))/X['Word Count Total']*100
X['FOG INDEX'] = 0.4*X['AVG SENTENCE LENGTH'] + 0.4*X['PERCENTAGE OF COMPLEX WORDS']
X['Sentence Count'] = X['Raw_Content'].apply(sentence_count)
X['AVG NUMBER OF WORDS PER SENTENCE'] = X['Word Count Total']/X['Sentence Count']
X['COMPLEX WORD COUNT'] = X['Complex Words'].apply(len)
X['WORD COUNT'] = X['Filtered_Content'].apply(len)
X['SYLLABLE PER WORD'] = X['Raw_Content'].apply(avg_syllables)
X['PERSONAL PRONOUN'] = X['Raw_Content'].apply(Pronoun_count)
X['AVG WORD LENGTH'] = X['Filtered_Content'].apply(average_word_len)
X.drop(columns = ['Raw_Content','Filtered_Content','Complex Words','Word Count Total','Sentence Count','P/N'],inplace = True)
X.head()

Unnamed: 0_level_0,URL,TITLE,POSITIVE SCORE,NEGATIVE SCORE,POLARITY,SUBJECTIVITY,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUN,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,https://insights.blackcoffer.com/how-is-login-...,How is Login Logout Time Tracking for Employee...,4,6,-0.2,0.058824,30.807692,23.845194,21.861154,30.807692,191,420,1.423221,8,6.614286
2,https://insights.blackcoffer.com/how-does-ai-h...,How does AI help to monitor Retail Shelf watch...,8,7,0.066667,0.088235,24.241379,27.45377,20.67806,24.241379,193,378,1.514936,7,6.939153
3,https://insights.blackcoffer.com/ai-and-its-im...,AI and its impact on the Fashion Industry – Bl...,33,21,0.222222,0.317647,24.621951,24.913323,19.81411,24.621951,503,1070,1.489846,19,7.06729
4,https://insights.blackcoffer.com/how-do-deep-l...,How do deep learning models predict old and ne...,5,1,0.666667,0.035294,31.466667,26.90678,23.349379,31.466667,127,252,1.552966,1,7.063492
5,https://insights.blackcoffer.com/how-artificia...,How artificial intelligence can boost your pro...,21,17,0.105263,0.223529,20.720588,21.788502,17.003636,20.720588,307,693,1.391767,41,6.715729


In [6]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 170 entries, 1 to 171
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   URL                               170 non-null    object 
 1   TITLE                             170 non-null    object 
 2   POSITIVE SCORE                    170 non-null    int64  
 3   NEGATIVE SCORE                    170 non-null    int64  
 4   POLARITY                          170 non-null    float64
 5   SUBJECTIVITY                      170 non-null    float64
 6   AVG SENTENCE LENGTH               170 non-null    float64
 7   PERCENTAGE OF COMPLEX WORDS       170 non-null    float64
 8   FOG INDEX                         170 non-null    float64
 9   AVG NUMBER OF WORDS PER SENTENCE  170 non-null    float64
 10  COMPLEX WORD COUNT                170 non-null    int64  
 11  WORD COUNT                        170 non-null    int64  
 12  SYLLABLE

In [7]:
X = X.round(3)
X.head()

Unnamed: 0_level_0,URL,TITLE,POSITIVE SCORE,NEGATIVE SCORE,POLARITY,SUBJECTIVITY,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUN,AVG WORD LENGTH
URL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,https://insights.blackcoffer.com/how-is-login-...,How is Login Logout Time Tracking for Employee...,4,6,-0.2,0.059,30.808,23.845,21.861,30.808,191,420,1.423,8,6.614
2,https://insights.blackcoffer.com/how-does-ai-h...,How does AI help to monitor Retail Shelf watch...,8,7,0.067,0.088,24.241,27.454,20.678,24.241,193,378,1.515,7,6.939
3,https://insights.blackcoffer.com/ai-and-its-im...,AI and its impact on the Fashion Industry – Bl...,33,21,0.222,0.318,24.622,24.913,19.814,24.622,503,1070,1.49,19,7.067
4,https://insights.blackcoffer.com/how-do-deep-l...,How do deep learning models predict old and ne...,5,1,0.667,0.035,31.467,26.907,23.349,31.467,127,252,1.553,1,7.063
5,https://insights.blackcoffer.com/how-artificia...,How artificial intelligence can boost your pro...,21,17,0.105,0.224,20.721,21.789,17.004,20.721,307,693,1.392,41,6.716


In [11]:
wb = openpyxl.load_workbook(filename = 'D:\\Project\\Jupyter\\Blackcoffer Sentiment Analysis\\Output Data Structure.xlsx')
ws = wb.active
for i in range(1,len(X)+1):
    ws.cell(row = i+1, column =  3).value = X.iloc[i-1]['POSITIVE SCORE']
    ws.cell(row = i+1, column =  4).value = X.iloc[i-1]['NEGATIVE SCORE']
    ws.cell(row = i+1, column =  5).value = X.iloc[i-1]['POLARITY']
    ws.cell(row = i+1, column =  6).value = X.iloc[i-1]['SUBJECTIVITY']
    ws.cell(row = i+1, column =  7).value = X.iloc[i-1]['AVG SENTENCE LENGTH']
    ws.cell(row = i+1, column =  8).value = X.iloc[i-1]['PERCENTAGE OF COMPLEX WORDS']
    ws.cell(row = i+1, column =  9).value = X.iloc[i-1]['FOG INDEX']
    ws.cell(row = i+1, column = 10).value = X.iloc[i-1]['AVG NUMBER OF WORDS PER SENTENCE']
    ws.cell(row = i+1, column = 11).value = X.iloc[i-1]['COMPLEX WORD COUNT']
    ws.cell(row = i+1, column = 12).value = X.iloc[i-1]['WORD COUNT']
    ws.cell(row = i+1, column = 13).value = X.iloc[i-1]['SYLLABLE PER WORD']
    ws.cell(row = i+1, column = 14).value = X.iloc[i-1]['PERSONAL PRONOUN']
    ws.cell(row = i+1, column = 15).value = X.iloc[i-1]['AVG WORD LENGTH']
wb.save('D:\\Project\\Jupyter\\Blackcoffer Sentiment Analysis\\Output Data Structure.xlsx')