In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import nltk
import regex as re


In [2]:
# List of file names to load
file_names = [r'/kaggle/input/text-analysis/StopWords_Auditor.txt',r'/kaggle/input/text-analysis/StopWords_Currencies.txt',r'/kaggle/input/text-analysis/StopWords_DatesandNumbers.txt',r'/kaggle/input/text-analysis/StopWords_Generic.txt',r'/kaggle/input/text-analysis/StopWords_GenericLong.txt',r'/kaggle/input/text-analysis/StopWords_Geographic.txt',r'/kaggle/input/text-analysis/StopWords_Names.txt']

# Open a new file object to write the combined contents
with open('combined.txt', 'w') as combined_file:
    # Loop over the file names and append the contents to the combined file
    for file_name in file_names:
        with open(file_name, 'r',encoding='latin-1') as f:
            contents = f.read()
            combined_file.write(contents)
with open('combined.txt', 'r') as f:
    stop_words = set([line.strip() for line in f])
with open('/kaggle/input/text-analysis/positive-words.txt', 'r',encoding='latin-1') as f:
    positive_words = set([line.strip() for line in f])
with open('/kaggle/input/text-analysis/negative-words.txt', 'r',encoding='latin-1') as f:
    negative_words = set([line.strip() for line in f])


In [3]:
# Define functions to calculate metrics
def get_positive_score(text):
    return sum([1 for word in text.lower().split() if word in positive_words])


In [4]:
def get_negative_score(text):
    return sum([1 for word in text.lower().split() if word in negative_words])


In [5]:
def get_polarity_score(text):
    return (get_positive_score(text) - get_negative_score(text)) / (get_positive_score(text) + get_negative_score(text) + 0.000001)


In [6]:
def get_word_count(text):
    words = nltk.word_tokenize(text.lower())
    clean = [word for word in words if len(word) > 0 and word not in stop_words and word.isalpha()]
    return len(clean)


In [7]:
def get_subjectivity_score(text):
    return (get_positive_score(text) + get_negative_score(text))/ ((get_word_count(text)) + 0.000001)


In [8]:
def get_avg_sentence_length(text):
    sentences = nltk.sent_tokenize(text)
    return sum([len(sentence.split()) for sentence in sentences]) / len(sentences)


In [9]:
def get_percentage_complex_words(text):
    words = nltk.word_tokenize(text.lower())
    complex_words = [word for word in words if len(word) > 2 and word not in stop_words and word.isalpha()]
    return len(complex_words) / len(words)


In [10]:
def get_fog_index(text):
    return 0.4 * (get_avg_sentence_length(text) + 100 * get_percentage_complex_words(text))


In [11]:
def get_avg_words_per_sentence(text):
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)

    # Get the total number of words and sentences
    total_words = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    total_sentences = len(sentences)

    # Calculate the average number of words per sentence
    if total_sentences > 0:
        avg_words_per_sentence = total_words / total_sentences
    else:
        avg_words_per_sentence = 0

    return avg_words_per_sentence


In [12]:
def get_avg_word_length(text):
    words = nltk.word_tokenize(text.lower())
    return sum([len(word) for word in words]) / len(words)


In [13]:
def get_complex_word_count(text):
    words = nltk.word_tokenize(text.lower())
    complex_words = [word for word in words if len(word) > 2 and word not in stop_words and word.isalpha()]
    return len(complex_words)


In [14]:
def get_syllable_count(word):
    vowels = "aeiouy"
    syllables = 0
    prev_char = None
    for char in word:
        char = char.lower()
        if char in vowels and prev_char not in vowels:
            syllables += 1
        prev_char = char
    if word.endswith('es'):
        syllables -= 1
    if word.endswith('ed'):
        syllables -= 1
    if syllables == 0:
        syllables = 1
    return syllables


In [15]:
def get_syllables_per_word(text):
    words = nltk.word_tokenize(text.lower())
    return sum([get_syllable_count(word) for word in words]) / len(words)


In [16]:
def get_personal_pronouns(text):
    # Define a regular expression pattern to match personal pronouns
    pattern = r"\b(I|we|my|ours|us)\b(?!S\b)"
    # Find all matches in the text using the pattern
    matches = re.findall(pattern, text, re.IGNORECASE)
    # Count the number of matches for each pronoun and store in a dictionary
    counts = {
        "I": matches.count("I"),
        "we": matches.count("we"),
        "my": matches.count("my"),
        "ours": matches.count("ours"),
        "us": matches.count("us")
    }
    return sum(counts.values())


In [17]:
# Load the Excel file into a pandas DataFrame
Input = pd.read_excel('/kaggle/input/text-analysis/Input.xlsx')

# Extract the links into a list
links = Input['URL'].tolist()

In [18]:
text_list = []
Output=pd.DataFrame()
Output['URL_ID'] = Input['URL_ID']
Output['URL'] = Input['URL']
Output.reset_index(drop=True,inplace=True)
Output

Unnamed: 0,URL_ID,URL
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...
1,38.0,https://insights.blackcoffer.com/what-if-the-c...
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...
3,40.0,https://insights.blackcoffer.com/will-machine-...
4,41.0,https://insights.blackcoffer.com/will-ai-repla...
...,...,...
109,146.0,https://insights.blackcoffer.com/blockchain-fo...
110,147.0,https://insights.blackcoffer.com/the-future-of...
111,148.0,https://insights.blackcoffer.com/big-data-anal...
112,149.0,https://insights.blackcoffer.com/business-anal...


In [19]:
count=0
# Extract the text from each link
for link in links:
    response = requests.get(link)
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()
    text_list.append(text)
    Output.at[Output.index[count],'POSITIVE SCORE'] = get_positive_score(text)
    Output.at[Output.index[count],'NEGATIVE SCORE'] = get_negative_score(text)
    Output.at[Output.index[count],'POLARITY SCORE'] = get_polarity_score(text)
    Output.at[Output.index[count],'SUBJECTIVITY SCORE'] = get_subjectivity_score(text)
    Output.at[Output.index[count],'AVG SENTENCE LENGTH'] = get_avg_sentence_length(text)
    Output.at[Output.index[count],'PERCENTAGE OF COMPLEX WORDS'] = get_percentage_complex_words(text)
    Output.at[Output.index[count],'FOG INDEX'] = get_fog_index(text)
    Output.at[Output.index[count],'AVG NUMBER OF WORDS PER SENTENCE'] = get_avg_words_per_sentence(text)
    Output.at[Output.index[count],'COMPLEX WORD COUNT'] = get_complex_word_count(text)
    Output.at[Output.index[count],'WORD COUNT'] = get_word_count(text)
    Output.at[Output.index[count],'SYLLABLE PER WORD'] = get_syllable_count(text)
    Output.at[Output.index[count],'PERSONAL PRONOUNS'] = get_personal_pronouns(text)
    Output.at[Output.index[count],'AVG WORD LENGTH'] = get_avg_word_length(text)
    count=count+1
    print(count)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114


In [20]:
# Exporting the contents to Excel
Output.to_excel('Output.xlsx',index=False )


In [21]:
Output

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37.0,https://insights.blackcoffer.com/ai-in-healthc...,104.0,30.0,0.552239,0.086396,32.388235,0.486425,32.412278,35.964706,1487.0,1551.0,5523.0,2.0,5.301276
1,38.0,https://insights.blackcoffer.com/what-if-the-c...,89.0,32.0,0.471074,0.104491,24.989362,0.411941,26.473401,28.340426,1097.0,1158.0,4345.0,7.0,4.811866
2,39.0,https://insights.blackcoffer.com/what-jobs-wil...,98.0,37.0,0.451852,0.094538,26.380000,0.467369,29.246765,29.420000,1375.0,1428.0,5257.0,3.0,5.153977
3,40.0,https://insights.blackcoffer.com/will-machine-...,95.0,22.0,0.623932,0.093005,23.660550,0.415147,26.070111,26.165138,1184.0,1258.0,4794.0,19.0,4.845722
4,41.0,https://insights.blackcoffer.com/will-ai-repla...,92.0,24.0,0.586207,0.085357,28.645161,0.436789,28.929636,32.150538,1306.0,1359.0,5039.0,13.0,4.948495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,146.0,https://insights.blackcoffer.com/blockchain-fo...,54.0,27.0,0.333333,0.083505,30.542373,0.469995,31.016747,33.610169,932.0,970.0,3497.0,10.0,5.279879
110,147.0,https://insights.blackcoffer.com/the-future-of...,68.0,16.0,0.619048,0.063976,35.185714,0.457652,32.380356,39.371429,1259.0,1313.0,4617.0,2.0,5.009815
111,148.0,https://insights.blackcoffer.com/big-data-anal...,60.0,42.0,0.176471,0.088465,27.184211,0.487101,30.357724,30.092105,1114.0,1153.0,4021.0,4.0,5.086139
112,149.0,https://insights.blackcoffer.com/business-anal...,63.0,7.0,0.800000,0.078038,43.918919,0.494369,37.342342,48.000000,878.0,897.0,3333.0,1.0,5.488176
