In [1]:
#importing required libraries 
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import os
from tqdm import tqdm
import json

# Creating positive and negative words dictionary

In [2]:
def add_values_in_dict(word_dict,key, list_of_words):
    """Append ``list_of_words`` as one entry under ``key`` and return the dict.

    The list stored under ``key`` is created when the key is missing. The
    value is appended as a single item (it is not extended element-wise).
    ``word_dict`` is modified in place and the same object is returned.
    """
    bucket = word_dict.setdefault(key, list())
    bucket.append(list_of_words)
    return word_dict

In [3]:
# Accumulator for the sentiment words collected below, one list per polarity.
word_dict = {
    'Positive': [],
    'Negative': [],
}

In [4]:
# path_filtered: folder whose text files are scanned and tokenized below.
# path: folder searched for 'positive-words.txt' and 'negative-words.txt'.
path_filtered = "D:/Text_analysis/Filtered_data"
path = 'D:/Text_analysis/MasterDictionary-20230305T133348Z-001/MasterDictionary'

In [5]:
# Load the positive and negative lexicons as whitespace-separated word lists.
# Only the two lexicon files are opened: opening every directory entry (as
# before) fails on sub-folders and reads files that are never used.
# A lexicon file that is missing leaves its list empty.
positive = []
negative = []
for lexicon_name in ('positive-words.txt', 'negative-words.txt'):
    lexicon_path = os.path.join(path, lexicon_name)
    if not os.path.isfile(lexicon_path):
        continue
    with open(lexicon_path, 'r', encoding='latin-1') as lexicon_file:
        lexicon_words = lexicon_file.read().split()
    if lexicon_name == 'positive-words.txt':
        positive = lexicon_words
    else:
        negative = lexicon_words

In [6]:
# Collect every occurrence of a lexicon word found in the filtered texts.
# Set copies of the lexicons make each membership test O(1); testing against
# the raw lists rescans the whole lexicon for every token.
positive_lookup = set(positive)
negative_lookup = set(negative)

for filename in tqdm(os.listdir(path_filtered)):
    f = os.path.join(path_filtered, filename)
    if not os.path.isfile(f):
        continue
    with open(f, encoding='latin-1') as file1:
        words = file1.read().split()
    for r in words:
        # A word present in both lexicons is recorded as positive only.
        if r in positive_lookup:
            word_dict = add_values_in_dict(word_dict, 'Positive', r)
        elif r in negative_lookup:
            word_dict = add_values_in_dict(word_dict, 'Negative', r)

100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:05<00:00, 18.95it/s]


In [7]:
# Raw string: "\T", "\D" and "\d" are invalid escape sequences in a normal
# string literal (deprecated, a SyntaxWarning on recent Python versions).
# The resulting path is unchanged.
# The with-block closes the file even if json.dump raises.
with open(r'D:\Text_analysis\Dictionary\dictionary.json', 'w') as file1:
    json.dump(word_dict, file1)

# Word and Sentence tokenization

In [8]:
path_dest='D:/Text_analysis/Tokens'
from nltk.tokenize import word_tokenize, sent_tokenize

# Create the two output folders up front so the first write cannot fail on a
# missing directory.
for token_dir in ('word_tokens', 'sent_tokens'):
    os.makedirs(os.path.join(path_dest, token_dir), exist_ok=True)

# Write one JSON list of word tokens ('wt_<base>.json') and one JSON list of
# sentence tokens ('st_<base>.json') for every text file in path_filtered.
for filename in tqdm(os.listdir(path_filtered)):
    f = os.path.join(path_filtered, filename)
    # Same guard as the dictionary-building loop: skip sub-folders.
    if not os.path.isfile(f):
        continue
    with open(f, encoding='latin-1') as file1:
        text = file1.read()
    base, ext = os.path.splitext(filename)
    word_tokens = word_tokenize(text)
    sent_tokens = sent_tokenize(text)

    word_destination = path_dest + '/word_tokens/wt_' + base + '.json'
    sent_destination = path_dest + '/sent_tokens/st_' + base + '.json'

    # The with-blocks close the files; no explicit close() is needed.
    with open(word_destination, 'w') as a:
        json.dump(word_tokens, a)
    with open(sent_destination, 'w') as a:
        json.dump(sent_tokens, a)

100%|███████████████████████████████████████████████████████████████████████████████| 111/111 [00:00<00:00, 167.28it/s]


In [9]:
# Raw string keeps the same path while avoiding the invalid "\T", "\D" and
# "\d" escape sequences of the plain literal.
path_dict = r"D:\Text_analysis\Dictionary\dictionary.json"
# Reload the saved word dictionary; the with-block closes the file even if
# json.load raises.
with open(path_dict, 'r') as dictionary:
    word_dict = json.load(dictionary)

# Extracting derived variables:
## Positive Score: 
This score is calculated by assigning the value of +1 for each word if found in the Positive Dictionary and then adding up all the values.

## Negative Score: 
This score is calculated by assigning the value of -1 for each word if found in the Negative Dictionary and then adding up all the values. We multiply the score by -1 so that the score is a positive number.

## Polarity Score: 
This is the score that determines if a given text is positive or negative in nature. It is calculated by using the formula:

## Polarity Score = 
(Positive Score – Negative Score)/ ((Positive Score + Negative Score) + 0.000001) Range is from -1 to +1

## Subjectivity Score: 
This is the score that determines if a given text is objective or subjective. It is calculated by using the formula:

## Subjectivity Score = 
(Positive Score + Negative Score)/ ((Total Words after cleaning) + 0.000001) Range is from 0 to +1

In [10]:
# Folder of word-token JSON files that the scoring loop below iterates over.
path_wt='D:/Text_analysis/Tokens/word_tokens'


In [11]:
# Per-document sentiment metrics; the lists stay index-aligned with url_ids.
p_score = []
n_score = []
polarity_score = []
subjectivity_score = []
url_ids = []

# The stored dictionary lists hold one entry per occurrence, so they contain
# duplicates. Sets give O(1) lookups with identical membership results.
positive_words = set(word_dict['Positive'])
negative_words = set(word_dict['Negative'])

for filename in tqdm(os.listdir(path_wt)):
    f = os.path.join(path_wt, filename)
    # The with-block closes each token file; previously the handle leaked.
    with open(f, 'r') as file1:
        words = json.load(file1)
    no_of_words = len(words)

    p = 0
    n = 0
    for word in words:
        # A word found in both sets counts as positive only.
        if word in positive_words:
            p += 1
        elif word in negative_words:
            n += 1

    # Raw string avoids the invalid "\d" escape. findall returns a list, so
    # url_ids is a list of lists that is flattened in a later cell.
    url_ids.append(re.findall(r"\d+", filename))
    p_score.append(p)
    n_score.append(n)
    # The 0.000001 term keeps both ratios defined when a denominator is zero.
    polarity_score.append((p - n)/((p + n) + 0.000001))
    subjectivity_score.append((p + n)/((no_of_words) + 0.000001))

100%|████████████████████████████████████████████████████████████████████████████████| 111/111 [00:06<00:00, 17.77it/s]


In [12]:
def flat_list(list_2d):
    """Return a new list with the items of every sub-list of ``list_2d``.

    Items keep their original order; nesting is removed one level deep only.
    """
    return [item for sublist in list_2d for item in sublist]

# url_ids holds one list of digit strings per file; flatten it to one list.
flat_list1 = flat_list(url_ids)

In [13]:
# Bundle the sentiment metrics and store them as JSON.
score = {
    'url_id': flat_list1,
    'p_score': p_score,
    'n_score': n_score,
    'polarity_score': polarity_score,
    'subjectivity_score': subjectivity_score,
}
# The with-block closes the file even if json.dump raises.
with open('D:/Text_analysis/results/scores.json', 'w') as file1:
    json.dump(score, file1)

## Analysis of Readability
It is calculated using the Gunning Fog index formula described below:

Average Sentence Length = the number of words / the number of sentences

Percentage of Complex words = the number of complex words / the number of words

Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

In [14]:
# Raw strings: "\T", "\w" and "\s" are invalid escape sequences in a normal
# string literal; the resulting paths are unchanged.
word_path = r'D:\Text_analysis\Tokens\word_tokens'
sent_path = r'D:\Text_analysis\Tokens\sent_tokens'

In [15]:
# files[0] lists the word-token folder, files[1] the sentence-token folder.
files = [os.listdir(folder) for folder in (word_path, sent_path)]

In [16]:
def count_complex_words(words_list):
    """Count the words in ``words_list`` that yield more than two vowel matches.

    For every word two case-insensitive regex searches are run and their
    matches pooled:

    * runs of ``a/e/i/o/u`` that do not start at a word-final ``e``;
    * the whole word, when it consists only of ``a/e/i/o/u/y`` and ends in ``e``.

    A word is counted when the pooled list holds more than two matches.
    """
    vowel_runs = re.compile('(?!e$)[aeiou]+', re.I)
    vowel_only_word = re.compile('^[aeiouy]*e$', re.I)

    def _match_total(word):
        return len(vowel_runs.findall(word)) + len(vowel_only_word.findall(word))

    return sum(1 for word in words_list if _match_total(word) > 2)

In [17]:
# Per-document readability metrics; the lists stay index-aligned with url_id.
avg_sent_length = []
percent_of_complex_words = []
fog_index = []
url_id = []

for word_filename in tqdm(files[0]):
    # The tokenization step names the outputs 'wt_<base>.json' and
    # 'st_<base>.json'. Derive the sentence file from the word file instead
    # of pairing two os.listdir() results by index: listdir order is
    # arbitrary, so index pairing can mix up documents.
    sent_filename = 'st_' + word_filename[len('wt_'):]
    w = os.path.join(word_path, word_filename)
    s = os.path.join(sent_path, sent_filename)

    with open(w, 'r') as file1:
        words = json.load(file1)
    no_of_words = len(words)

    with open(s, 'r') as file2:
        sent = json.load(file2)
    no_of_sentences = len(sent)

    # Empty documents get 0 instead of raising ZeroDivisionError, which keeps
    # every result list the same length as url_id.
    sentence_length = int(no_of_words/no_of_sentences) if no_of_sentences else 0
    complex_share = count_complex_words(words)/no_of_words if no_of_words else 0

    # Raw string avoids the invalid "\d" escape. findall returns a list, so
    # url_id is a list of lists that is flattened in a later cell.
    url_id.append(re.findall(r"\d+", word_filename))
    avg_sent_length.append(sentence_length)
    percent_of_complex_words.append(complex_share)
    fog_index.append(0.4*(sentence_length+complex_share))

100%|███████████████████████████████████████████████████████████████████████████████| 111/111 [00:00<00:00, 308.05it/s]


In [18]:
# url_id holds one list of digit strings per file; flatten it to one list.
flat_list2 = flat_list(url_id)

In [19]:
# Column-oriented container for the readability metrics.
readability = dict(
    url_id=flat_list2,
    avg_sent_length=avg_sent_length,
    percent_of_complex_words=percent_of_complex_words,
    fog_index=fog_index,
)

In [20]:
# The with-block closes the file even if json.dump raises.
with open('D:/Text_analysis/results/readability_analysis.json', 'w') as file1:
    json.dump(readability, file1)

## Average Number of Words Per Sentence
The formula for calculating is:

Average Number of Words Per Sentence = the total number of words / the total number of sentences

## Complex Word Count
Complex words are words in the text that contain more than two syllables.

In [21]:
# Folder of text files read by the per-document loops in the cells below.
path_extracted = 'D:/Text_analysis/Filtered_data/'

In [22]:
from nltk.tokenize import word_tokenize, sent_tokenize

# Per-document results; the three lists stay index-aligned.
avg_no_words_per_sent = []
complex_word_count = []
url_id = []

for filename in tqdm(os.listdir(path_extracted)):
    f = os.path.join(path_extracted, filename)
    # Only the read needs the open file; the with-block closes it.
    with open(f, encoding='latin-1') as file1:
        text = file1.read()
    base, ext = os.path.splitext(filename)
    word_tokens = word_tokenize(text)
    sent_tokens = sent_tokenize(text)

    url_id.append(base)
    # A text without sentences gets 0 instead of raising ZeroDivisionError.
    if sent_tokens:
        avg_no_words_per_sent.append(int(len(word_tokens)/len(sent_tokens)))
    else:
        avg_no_words_per_sent.append(0)
    complex_word_count.append(count_complex_words(word_tokens))

100%|███████████████████████████████████████████████████████████████████████████████| 111/111 [00:00<00:00, 137.92it/s]


In [23]:
# Column-oriented container for the sentence-length and complex-word metrics.
# The last key was previously misspelled 'compplex_word_count'.
avg_words_and_complex_words = {
    'url_id': url_id,
    'avg_no_words_per_sent': avg_no_words_per_sent,
    'complex_word_count': complex_word_count,
}

In [24]:
# The with-block closes the file even if json.dump raises.
with open('D:/Text_analysis/results/avg_and_complex_words.json', 'w') as file1:
    json.dump(avg_words_and_complex_words, file1)

## Word Count
We count the total cleaned words present in the text by

- removing the stop words (using the `stopwords` corpus of the nltk package).
- removing any punctuation such as ? ! , . from the words before counting.

## Syllable Count Per Word
We count the number of syllables in each word of the text by counting the vowels present in each word. We also handle some exceptions, such as words ending with "es" or "ed", by not counting those endings as a syllable.

In [25]:
def count_syllables(word):
    """Return how many vowel letters of ``word`` are counted as syllables.

    Each ``a, e, i, o, u`` (any case) is counted individually, except an
    ``e`` that is the last letter of the word or that begins a word-final
    ``es`` / ``ed``.
    """
    vowels = 'aeiou'
    pattern = re.compile(f'(?!e$)(?!es$)(?!ed$)[{vowels}]', re.I)
    return sum(1 for _ in pattern.finditer(word))

In [26]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# using RegexpTokenizer to remove punctuations
tokenizer = RegexpTokenizer(r'\w+')

# Loop-invariant, so build the stop-word set once instead of once per file.
stop_words = set(stopwords.words('english'))

# Create the per-document output folder if it does not exist yet.
os.makedirs('D:/Text_analysis/results/syllable_per_word', exist_ok=True)

url_id = []
word_count = []
syllable_count = []

for filename in tqdm(os.listdir(path_extracted)):
    f = os.path.join(path_extracted, filename)
    base, ext = os.path.splitext(filename)
    with open(f, encoding='latin-1') as file1:
        text = file1.read()

    words = tokenizer.tokenize(text)
    # NLTK's English stop words are lowercase, so compare the lowercased
    # token; otherwise capitalised stop words such as "The" are kept.
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # One [word, syllable_count] pair per remaining word.
    syllable_per_word = [[word, count_syllables(word)] for word in filtered_words]

    # Use a dedicated name: assigning to `path` here would overwrite the
    # lexicon folder path defined near the top of the notebook.
    syllable_path = 'D:/Text_analysis/results/syllable_per_word' + '/' + base + '_syllable_count_per_word.json'
    with open(syllable_path, 'w') as file1:
        json.dump(syllable_per_word, file1)

    url_id.append(base)
    word_count.append(len(filtered_words))
    syllable_sum = sum(pair[1] for pair in syllable_per_word)
    # NOTE(review): int() truncates the average (e.g. 1.8 -> 1); confirm
    # that a whole-number "syllables per word" is intended.
    # Documents with no remaining words get 0 instead of ZeroDivisionError.
    if syllable_per_word:
        syllable_count.append(int(syllable_sum/len(syllable_per_word)))
    else:
        syllable_count.append(0)

100%|███████████████████████████████████████████████████████████████████████████████| 111/111 [00:00<00:00, 242.14it/s]


In [27]:
# Column-oriented container for the word-count and syllable metrics.
word_and_syllable_count = dict(
    url_id=url_id,
    word_count=word_count,
    syllable_count=syllable_count,
)

In [28]:
# The with-block closes the file even if json.dump raises.
with open('D:/Text_analysis/results/word_and_syllable_count.json', 'w') as file1:
    json.dump(word_and_syllable_count, file1)

In [29]:
def count_personal_pronouns(text):
    """Return the number of personal-pronoun matches in ``text``.

    Whole-word matches of ``I``, ``we``, ``ours``, ``my`` and ``mine`` are
    counted in any letter case. ``us`` is matched in lowercase only, so the
    upper-case token ``US`` is not counted.
    """
    matcher = re.compile(r'\b(I|we|ours|my|mine|(?-i:us))\b', re.I)
    return sum(1 for _ in matcher.finditer(text))

In [30]:
# Per-document results; the three lists stay index-aligned.
url_id = []
personal_pronouns_count = []
word_avg_length = []

for filename in tqdm(os.listdir(path_extracted)):
    f = os.path.join(path_extracted, filename)
    base, ext = os.path.splitext(filename)
    # The with-block closes the file; no explicit close() is needed.
    with open(f, encoding='latin-1') as file1:
        text = file1.read()
    words = text.split()

    # Total number of characters over all whitespace-separated words.
    c = sum(len(word) for word in words)

    url_id.append(base)
    personal_pronouns_count.append(count_personal_pronouns(text))
    # An empty file gets 0 instead of raising ZeroDivisionError.
    word_avg_length.append(round(c/len(words)) if words else 0)

100%|██████████████████████████████████████████████████████████████████████████████| 111/111 [00:00<00:00, 2174.47it/s]


In [31]:
# Column-oriented container for the pronoun and word-length metrics.
pronouns_and_word_length = dict(
    url_id=url_id,
    personal_pronouns=personal_pronouns_count,
    word_avg_length=word_avg_length,
)

In [32]:
# The with-block already closes the file, so no explicit close() follows.
results_file_path = 'D:/Text_analysis/results/pronouns_and_word_length.json'
with open(results_file_path, 'w') as file1:
    json.dump(pronouns_and_word_length, file1)

## Collecting all the results and saving in the excel sheet

In [33]:
# Folder whose JSON result files are collected into `results` below.
path_result = 'D:/Text_analysis/results/'

In [34]:
# Load every JSON result file in path_result, keyed by its base file name.
results = {}
for filename in tqdm(os.listdir(path_result)):
    f = os.path.join(path_result, filename)
    base, ext = os.path.splitext(filename)
    # Skip entries that are expected not to be JSON inputs (sub-folders and
    # other file types) explicitly, instead of relying on a bare except.
    if not os.path.isfile(f) or ext.lower() != '.json':
        continue
    try:
        with open(f, 'r') as file1:
            results[base] = json.load(file1)
    except (OSError, ValueError) as exc:
        # Stay best-effort, but report unreadable or malformed files rather
        # than dropping them silently. JSON and decoding errors are both
        # ValueError subclasses.
        print(f'Skipping {filename}: {exc}')

100%|██████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 1997.76it/s]


In [36]:
# Build one DataFrame per loaded result set, printing each name as it goes.
df = {}
for key, payload in results.items():
    print(key)
    df[key] = pd.DataFrame.from_dict(payload)

avg_and_complex_words
pronouns_and_word_length
readability_analysis
scores
word_and_syllable_count


In [38]:
# Outer join keeps ids that appear in only one of the two frames.
all_df = df['scores'].merge(df['readability_analysis'], on='url_id', how='outer')

In [39]:
# Outer join of the sentence/complex-word frame with the word-count frame.
all_df1 = df['avg_and_complex_words'].merge(
    df['word_and_syllable_count'],
    on='url_id',
    how='outer',
)

In [40]:
# Add the pronoun / word-length columns to the running frame.
all_df1 = all_df1.merge(df['pronouns_and_word_length'], on='url_id', how='outer')

In [41]:
# Combine both partial frames into one table with every metric.
all_df = all_df.merge(all_df1, on='url_id', how='outer')

In [42]:
# Cast the ids to float so that sorting is numeric, then order the rows.
all_df = all_df.astype({'url_id': float}).sort_values('url_id')

In [43]:
# Column names of the computed table, in their current order.
column1 = all_df.columns.tolist()

In [44]:
# Raw string: "\T" and "\O" are invalid escape sequences in a normal string
# literal; the resulting path is unchanged.
db = pd.read_excel(r"D:\Text_analysis\Output Data Structure.xlsx")
db.set_index(['URL_ID'], inplace = True)

In [45]:
# Remove the rows whose URL_ID index is listed as not working.
not_working_url_id = [44, 57, 122]
db = db.drop(index=not_working_url_id)

In [46]:
# Save the reduced sheet under a new name.
# NOTE(review): the next section reads the original sheet, not this file - confirm which is intended.
db.to_excel('D:/Text_analysis/new Output Data Structure.xlsx')

## Importing output structure sheet and saving final result.

In [47]:
# Raw string: "\T" and "\O" are invalid escape sequences in a normal string
# literal; the resulting path is unchanged.
main_df = pd.read_excel(r"D:\Text_analysis\Output Data Structure.xlsx")
main_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,,,,,,,,,,,,,
1,38,https://insights.blackcoffer.com/what-if-the-c...,,,,,,,,,,,,,
2,39,https://insights.blackcoffer.com/what-jobs-wil...,,,,,,,,,,,,,
3,40,https://insights.blackcoffer.com/will-machine-...,,,,,,,,,,,,,
4,41,https://insights.blackcoffer.com/will-ai-repla...,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...,,,,,,,,,,,,,
110,147,https://insights.blackcoffer.com/the-future-of...,,,,,,,,,,,,,
111,148,https://insights.blackcoffer.com/big-data-anal...,,,,,,,,,,,,,
112,149,https://insights.blackcoffer.com/business-anal...,,,,,,,,,,,,,


In [48]:
# Column names of the output template, in sheet order.
columns = main_df.columns.tolist()

In [49]:
# Drop the 'URL' column name, which has no counterpart in the computed table.
# Filtering (instead of list.remove) keeps the cell safe to re-run:
# remove() raises ValueError once 'URL' is already gone.
columns = [name for name in columns if name != 'URL']

In [50]:
# Pair of name lists: [computed-table names, template names]; matched by position below.
column_list = [column1, columns]

In [51]:
# Map each computed column name to the template name at the same position.
column_dict = {}
source_names, target_names = column_list
for position, source_name in enumerate(source_names):
    column_dict[source_name] = target_names[position]

In [52]:
# Give the computed table the template's column names.
all_df = all_df.rename(columns=column_dict)


In [65]:
# Join the template rows with the computed metrics on URL_ID.
data = main_df.merge(all_df, on='URL_ID', how='outer')

In [66]:
# The merge suffixes every metric column that exists in both frames: "_x" for
# the placeholder column from main_df and "_y" for the computed values.
# Drop the placeholders, then strip "_y" so the exported sheet uses the
# template's column names instead of e.g. "POSITIVE SCORE_y".
metric_columns = [
    'POSITIVE SCORE', 'NEGATIVE SCORE',
    'POLARITY SCORE', 'SUBJECTIVITY SCORE', 'AVG SENTENCE LENGTH',
    'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT',
    'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS',
    'AVG WORD LENGTH',
]
# errors='ignore' keeps the cell safe to re-run after the columns are gone.
data = data.drop(columns=[name + '_x' for name in metric_columns], errors='ignore')
data = data.rename(columns={name + '_y': name for name in metric_columns})

In [67]:
# Export the merged table; the DataFrame index is written as the first column.
data.to_excel('D:/Text_analysis/results/Final_Result.xlsx')