#### **ContentFiles** : *Folder containing all the scraped article text files.*
#### **MasterDictionary** : *Folder containing the text files of positive and negative words.*
#### **StopWords** : *Folder containing the text files of all stop words.*
#### **Input.xlsx** : *Excel file having all the URLs.*
#### **Output Data Structure.xlsx** : *Excel file having the basic structure of the output.*
#### **Output_Data.csv** : *CSV file having all the calculated parameters as final output.*

#### **Importing all the Important Libraries**

In [33]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SUSHANT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SUSHANT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### **Loading the input file containing the URLs of the articles into a pandas dataframe**

In [34]:
# Read the list of article URLs; the columns used later are URL_ID and URL.
df = pd.read_excel('Input.xlsx')

In [35]:
# Preview the first rows of the input table.
df.head()

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


In [36]:
# (rows, columns) of the input table.
df.shape

(114, 2)

#### Iterating through each row in the df.

In [37]:

# Download every article listed in `df` and store "<title>\n<body>" in
# ContentFiles/<URL_ID>.txt. A URL that cannot be fetched, parsed, or that has
# no <h1> title is reported and skipped, so no file is written for it.

# Make sure the target folder exists before the first write.
os.makedirs('ContentFiles', exist_ok=True)

for _, row in df.iterrows():
  url = row['URL']
  url_id = row['URL_ID']

  # making a request to url
  try:
    # timeout keeps one unresponsive host from hanging the whole run
    response = requests.get(url, timeout=30)
  except requests.RequestException:
    print("can't get response of {}".format(url_id))
    # skip: otherwise `response` from the previous iteration would be reused
    # and another article's text saved under this url_id
    continue

  #creating a beautifulsoup object
  try:
    soup = BeautifulSoup(response.content, 'html.parser')
  except Exception:
    print("can't get page of {}".format(url_id))
    continue

  #for getting the title
  heading = soup.find('h1')
  if heading is None:
    print("can't get title of {}".format(url_id))
    continue
  title = heading.get_text()

  #for getting the text
  paragraphs = []
  try:
    for p in soup.find_all('p'):
      paragraphs.append(p.get_text())
  except Exception:
    # best effort: keep whatever paragraphs were collected so far
    print("can't get text of {}".format(url_id))
  # join with a newline so the last word of one paragraph does not fuse with
  # the first word of the next one
  article = '\n'.join(paragraphs)

  #writing title and article text to the file
  file_name = (r'ContentFiles/') + str(url_id) + '.txt'
  with open(file_name, 'w' , encoding="utf-8") as file:
    file.write(title + '\n' + article)

can't get title of 44
can't get title of 57
can't get title of 144


#### Here, it can be seen that the URLs with url_id 44, 57 and 144 are not accessible, as these pages no longer exist.

In [38]:
# Load every stop word from the files in the StopWords folder into one set.
# Entries are stripped and lower-cased because the lookup further down compares
# against `word.lower()`; an upper-case entry would otherwise never match.
stop_words = set()
for file_name in os.listdir(r"StopWords"):
  # explicit single-byte encoding (same as used for MasterDictionary) so the
  # read does not depend on the platform's default encoding
  with open(os.path.join(r"StopWords", file_name), 'r', encoding='ISO-8859-1') as f:
    for line in f.read().splitlines():
      entry = line.strip().lower()
      if entry:                      # ignore blank lines
        stop_words.add(entry)

print(len(stop_words))

12919


####  Sorting the files in ascending order so that the calculated parameters can be matched with the respective urls as per Output Data Structure File.


In [39]:
# Collect the scraped article files. Only "*.txt" entries are kept so that
# stray entries in the folder (e.g. a checkpoint directory) cannot break the
# numeric sort that follows.
filenumber = [text_file for text_file in os.listdir("ContentFiles")
              if text_file.endswith('.txt')]
    

def myFunc(e):
    """Sort key: return the integer URL_ID encoded in a file name.

    Parameters
    ----------
    e : str
        File name such as ``"37.txt"``.

    Returns
    -------
    int
        The numeric part before the extension (``37`` for ``"37.txt"``).

    Raises
    ------
    ValueError
        If the part before the extension is not an integer.
    """
    # splitext handles an extension of any length, unlike a fixed [0:-4] slice
    stem, _extension = os.path.splitext(e)
    return int(stem)

# Order the file names numerically by URL_ID (a plain string sort would put
# "100.txt" before "37.txt").
filenumber = sorted(filenumber, key=myFunc)
print(filenumber)

['37.txt', '38.txt', '39.txt', '40.txt', '41.txt', '42.txt', '43.txt', '45.txt', '46.txt', '47.txt', '48.txt', '49.txt', '50.txt', '51.txt', '52.txt', '53.txt', '54.txt', '55.txt', '56.txt', '58.txt', '59.txt', '60.txt', '61.txt', '62.txt', '63.txt', '64.txt', '65.txt', '66.txt', '67.txt', '68.txt', '69.txt', '70.txt', '71.txt', '72.txt', '73.txt', '74.txt', '75.txt', '76.txt', '77.txt', '78.txt', '79.txt', '80.txt', '81.txt', '82.txt', '83.txt', '84.txt', '85.txt', '86.txt', '87.txt', '88.txt', '89.txt', '90.txt', '91.txt', '92.txt', '93.txt', '94.txt', '95.txt', '96.txt', '97.txt', '98.txt', '99.txt', '100.txt', '101.txt', '102.txt', '103.txt', '104.txt', '105.txt', '106.txt', '107.txt', '108.txt', '109.txt', '110.txt', '111.txt', '112.txt', '113.txt', '114.txt', '115.txt', '116.txt', '117.txt', '118.txt', '119.txt', '120.txt', '121.txt', '122.txt', '123.txt', '124.txt', '125.txt', '126.txt', '127.txt', '128.txt', '129.txt', '130.txt', '131.txt', '132.txt', '133.txt', '134.txt', '135

#### Loading all text files from the ContentFiles folder and storing in a list named docs.

In [40]:
# docs[k] holds the stop-word-filtered tokens of the k-th file in `filenumber`.
docs = []

for file_name in filenumber:
    article_path = os.path.join("ContentFiles", file_name)
    with open(article_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    # tokenize the article text
    tokens = word_tokenize(raw_text)
    # keep only the tokens whose lower-cased form is not a stop word
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]
    docs.append(kept_tokens)

#### Accessing the positive and negative words from the provided Master Dictionary and storing them in the set variables.

In [41]:
# Sentiment word lists: `positive-words.txt` feeds `pos`; every other file in
# the MasterDictionary folder feeds `neg`.
pos = set()
neg = set()

for dictionary_file in os.listdir("MasterDictionary"):
  target = pos if dictionary_file == 'positive-words.txt' else neg
  dictionary_path = os.path.join("MasterDictionary", dictionary_file)
  with open(dictionary_path, 'r', encoding='ISO-8859-1') as handle:
    target.update(handle.read().splitlines())


print("pos-->", len(pos))
print("neg-->", len(neg))

pos--> 2006
neg--> 4783


#### Calculating *Positive Score, Negative Score, Polarity Score and Subjectivity Score* for each file respectively by counting their positive and negative words

In [42]:
# Per-document sentiment figures; index k in every list corresponds to docs[k].
positive_words = []
Negative_words = []
positive_score = []
negative_score = []
polarity_score = []
subjectivity_score = []

# walk over the token list of every document
for tokens in docs:
  pos_hits = [token for token in tokens if token.lower() in pos]
  neg_hits = [token for token in tokens if token.lower() in neg]
  n_pos = len(pos_hits)
  n_neg = len(neg_hits)

  positive_words.append(pos_hits)
  Negative_words.append(neg_hits)
  positive_score.append(n_pos)
  negative_score.append(n_neg)
  # the tiny constant keeps both denominators non-zero
  polarity_score.append((n_pos - n_neg) / ((n_pos + n_neg) + 0.000001))
  subjectivity_score.append((n_pos + n_neg) / ((len(tokens)) + 0.000001))

print("done done")

done done


In [43]:
# Sanity check: number of positive and negative matches in the first document.
print(len(positive_words[0]))
print(len(Negative_words[0]))

69
31


#### Calculating other Parameters

In [44]:
# Average Sentence Length = the number of words / the number of sentences
# Percentage of Complex words = the number of complex words / the number of words 
# Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

# Import the corpus under an alias: the name `stopwords` is rebound to a plain
# set below, and without the alias a second run of this cell would fail with
# "'set' object has no attribute 'words'".
from nltk.corpus import stopwords as nltk_stopwords

# Result lists, one entry per file in `filenumber` (reset on every run of the cell).
avg_sentence_length = []
Percentage_of_Complex_words  =  []
Fog_Index = []
complex_word_count =  []
avg_syllable_word_count =[]

# English stop words as a set; the helper functions below read this name.
stopwords = set(nltk_stopwords.words('english'))
def measure(file, stop_words=None):
  """Compute readability metrics for one article file.

  Parameters
  ----------
  file : str
      File name inside the ``ContentFiles`` folder.
  stop_words : set of str, optional
      Lower-case words to exclude before counting. When omitted, the
      module-level ``stopwords`` set is used.

  Returns
  -------
  tuple
      ``(average sentence length, share of complex words, fog index,
      complex word count, average syllables per word)``. The share of
      complex words is a fraction between 0 and 1, not multiplied by 100.
      Every ratio is 0.0 when its denominator would be zero.
  """
  if stop_words is None:
    stop_words = stopwords

  # the article files are written as UTF-8, so they must be read as UTF-8;
  # a single-byte codec would turn characters such as curly quotes into junk
  with open(os.path.join("ContentFiles", file), 'r', encoding='utf-8') as f:
    text = f.read()

  # removing punctuations (the full stop is kept for sentence splitting)
  text = re.sub(r'[^\w\s.]', '', text)

  # split into sentences, ignoring the empty fragments that split('.') yields
  # after the final full stop or between consecutive dots
  sentences = [fragment for fragment in text.split('.') if fragment.strip()]
  num_sentences = len(sentences)

  # words left after stop-word removal (tokens may still carry a trailing '.')
  words = [word for word in text.split() if word.lower() not in stop_words]
  num_words = len(words)

  vowels = 'aeiou'

  # complex words: more than two vowels, used as a proxy for syllables
  complex_words = [
      word for word in words
      if sum(1 for letter in word if letter.lower() in vowels) > 2
  ]

  # syllables per word: count vowels after dropping a trailing "es"/"ed";
  # only words with at least one vowel take part in the average
  syllable_count = 0
  syllable_word_total = 0
  for word in words:
    if word.endswith(('es', 'ed')):
      word = word[:-2]
    syllable_count_word = sum(1 for letter in word if letter.lower() in vowels)
    if syllable_count_word >= 1:
      syllable_word_total += 1
      syllable_count += syllable_count_word

  # guard every division so an empty or punctuation-only file does not crash
  avg_sentence_len = num_words / num_sentences if num_sentences else 0.0
  avg_syllables = syllable_count / syllable_word_total if syllable_word_total else 0.0
  percent_complex = len(complex_words) / num_words if num_words else 0.0
  fog_index = 0.4 * (avg_sentence_len + percent_complex)

  return avg_sentence_len, percent_complex, fog_index, len(complex_words), avg_syllables

print("done done lonmdon")

#------------------------------------------------------------------------------------------------------------

# run measure() on every article and spread its five results over the lists
for file in filenumber:
  sentence_len, complex_share, fog_value, complex_total, syllables_per_word = measure(file)
  avg_sentence_length.append(sentence_len)
  Percentage_of_Complex_words.append(complex_share)
  Fog_Index.append(fog_value)
  complex_word_count.append(complex_total)
  avg_syllable_word_count.append(syllables_per_word)

done done lonmdon


In [45]:
# Average Word Length = sum of the total number of characters in each word / total number of words
# Calculating the total cleaned words present in the text by
# (1)removing the stop words (using stopwords class of nltk package).
# (2)removing any punctuations like ? ! , . from the word before counting.

def cleaned_words(file, stop_words=None):
  """Count the cleaned words of one article and their average length.

  Parameters
  ----------
  file : str
      File name inside the ``ContentFiles`` folder.
  stop_words : set of str, optional
      Lower-case words to exclude. When omitted, the module-level
      ``stopwords`` set is used.

  Returns
  -------
  tuple
      ``(word count, average word length)``; ``(0, 0.0)`` when no word is left.
  """
  if stop_words is None:
    stop_words = stopwords

  # the article files are written as UTF-8; reading them with a single-byte
  # codec turns e.g. a curly apostrophe into an extra letter and inflates lengths
  with open(os.path.join("ContentFiles", file), 'r', encoding='utf-8') as f:
    text = f.read()

  # drop everything that is neither a word character nor whitespace
  text = re.sub(r'[^\w\s]', '', text)
  words = [word for word in text.split() if word.lower() not in stop_words]

  # avoid dividing by zero for an empty or stop-word-only file
  if not words:
    return 0, 0.0

  length = sum(len(word) for word in words)
  return len(words), length / len(words)

# Evaluate cleaned_words() once per article, then split the (count, average)
# pairs into the two result lists.
cleaned_stats = [cleaned_words(file) for file in filenumber]
word_count = [stats[0] for stats in cleaned_stats]
average_word_length = [stats[1] for stats in cleaned_stats]

In [46]:
# To calculate Personal Pronouns mentioned in the text, we use regex to find
# the counts of the words - “I,” “we,” “my,” “ours,” and “us”. Special care is taken
#  so that the country name US is not included in the list.
def count_personal_pronouns(file):
  """Count the personal pronouns I, we, my, ours and us in one article.

  "I" is matched only in upper case. The other pronouns are matched in any
  capitalisation (so sentence-initial "We", "My", "Ours", "Us" count), except
  the exact spelling "US", which is skipped as the country name.

  Parameters
  ----------
  file : str
      File name inside the ``ContentFiles`` folder.

  Returns
  -------
  int
      Total number of pronoun occurrences.
  """
  # the article files are written as UTF-8; with a single-byte codec the curly
  # apostrophe in "I’m" decodes to a letter and hides the word boundary
  with open(os.path.join("ContentFiles", file), 'r', encoding='utf-8') as f:
    text = f.read()

  # \b is used to match word boundaries
  count = len(re.findall(r"\bI\b", text))
  for match in re.finditer(r"\b(?:we|my|ours|us)\b", text, flags=re.IGNORECASE):
    if match.group() != "US":     # country name, not a pronoun
      count += 1
  return count

# personal-pronoun count for every article, in `filenumber` order
pp_count = [count_personal_pronouns(file) for file in filenumber]

# template that defines the layout of the final output table
output_df = pd.read_excel('Output Data Structure.xlsx')

print("size->", output_df.shape)

size-> (114, 15)


#### As we have already seen, the pages with url_ids 44, 57 and 144 do not exist anymore. Therefore, we drop those rows from the dataframe.

In [47]:
# Remove the rows of the pages that could not be scraped.
# NOTE(review): the row label is derived as URL_ID - 37, which assumes the
# template lists URL_IDs consecutively starting at 37 — confirm against the sheet.
FIRST_URL_ID = 37
MISSING_URL_IDS = [44, 57, 144]

# errors='ignore' makes the cell safe to re-run: labels that were already
# dropped are skipped instead of raising KeyError
output_df.drop([url_id - FIRST_URL_ID for url_id in MISSING_URL_IDS],
               axis = 0, inplace=True, errors='ignore')

print("size->",output_df.shape)


size-> (111, 15)


In [48]:
# Confirm the row count after dropping the unavailable pages.
print(output_df.shape)

(111, 15)


##### Here we can see that those urls throwing 404 errors are dropped successfully.

#### Creating a single list of all the calculated parameters whose values to be saved in final output data file.

In [49]:
# Computed metrics in the order of the output columns, starting at the third
# column (the first two columns of the template are left untouched).
# NOTE(review): avg_sentence_length appears twice; the second occurrence
# presumably fills an "average words per sentence" column — confirm against
# Output Data Structure.xlsx.
variables = [positive_score,
            negative_score,
            polarity_score,
            subjectivity_score,
            avg_sentence_length,
            Percentage_of_Complex_words,
            Fog_Index,
            avg_sentence_length,
            complex_word_count,
            word_count,
            avg_syllable_word_count,
            pp_count,
            average_word_length]

# The values are written by position, so every list must have exactly one
# entry per remaining row; check this before touching the dataframe so a
# mismatch cannot leave it half-filled.
for i, var in enumerate(variables):
  if len(var) != len(output_df):
    raise ValueError(
        "metric #{} has {} values but the output table has {} rows".format(
            i, len(var), len(output_df)))

# writing the values to the dataframe
for i, var in enumerate(variables):
  output_df.iloc[:,i+2] = var

#Finally saving the dataframe as Output_Data.csv.
# index=False: the leftover row labels are not part of the output structure
output_df.to_csv('Output_Data.csv', index=False)