#### # Step[1]:
#### Installing required libraries

In [1]:
!pip install requests beautifulsoup4 numpy pandas nltk syllables

Defaulting to user installation because normal site-packages is not writeable


####  #Step[2]:

#### Importing necessary libraries

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import syllables
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#### Download NLTK resources

In [3]:
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bipla\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bipla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bipla\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bipla\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### #Step[3]:

#### Load input data file

In [4]:
df = pd.read_excel('D:/Journey of Data Science/Projects/Data extraction/Input.xlsx')[['URL_ID', 'URL']]
df.head()

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...


#### Let explore the input file

In [5]:
print("Input data has {} rows & {} columns".format(len(df),len(df.columns)))
print("Number of null in 'URL_ID' column: {}".format(df['URL_ID'].isnull().sum()))
print("Number of null in 'URL' column: {}".format(df['URL'].isnull().sum()))

Input data has 100 rows & 2 columns
Number of null in 'URL_ID' column: 0
Number of null in 'URL' column: 0


#### #Step[4]:

#### Function to extract article text

In [6]:
def extract_article_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        article_text = " ".join([p.get_text() for p in soup.find_all('p') if not p.find('img')])
        return article_text
    except Exception as e:
        print(f"Failed to extract text from {url}: {e}")
        return ""

#### Extract and save article text

In [7]:
for i, row in df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']
    article_text = extract_article_text(url)
    output_file = f"{url_id}.txt"
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(article_text)
    print(f"Extracted article from {url} and saved to {output_file}.")


Extracted article from https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/ and saved to blackassign0001.txt.
Extracted article from https://insights.blackcoffer.com/rising-it-cities-and-their-impact-on-the-economy-environment-infrastructure-and-city-life-in-future/ and saved to blackassign0002.txt.
Extracted article from https://insights.blackcoffer.com/internet-demands-evolution-communication-impact-and-2035s-alternative-pathways/ and saved to blackassign0003.txt.
Extracted article from https://insights.blackcoffer.com/rise-of-cybercrime-and-its-effect-in-upcoming-future/ and saved to blackassign0004.txt.
Extracted article from https://insights.blackcoffer.com/ott-platform-and-its-impact-on-the-entertainment-industry-in-future/ and saved to blackassign0005.txt.
Extracted article from https://insights.blackcoffer.com/the-rise-of-the-ott-platform-and-its-impact-on-the-entertainment-industry-by-2040/

#### #Step[5]:

#### Load positive and negative words

In [8]:
positive_words = set()
negative_words = set()

with open('D:/Journey of Data Science/Projects/Data extraction/DataSet/MasterDictionary/positive-words.txt', 'r') as f:
    positive_words = set(word.strip() for word in f.readlines())

with open('D:/Journey of Data Science/Projects/Data extraction/DataSet/MasterDictionary/negative-words.txt', 'r', encoding='latin-1') as f:
    negative_words = set(word.strip() for word in f.readlines())

#### Functions to calculate positive and negative scores

In [9]:
def get_positive_score(text):
    words = text.split()
    positive_count = sum(1 for word in words if word in positive_words)
    total_words = len(words)
    return positive_count / total_words if total_words else 0

def get_negative_score(text):
    words = text.split()
    negative_count = sum(1 for word in words if word in negative_words)
    total_words = len(words)
    return negative_count / total_words if total_words else 0


#### #Step[6]:

#### Function to count syllables in a word

In [10]:
def count_syllables(word):
    vowels = 'aeiouy'
    num_vowels = 0
    for i in range(len(word)):
        if word[i].lower() in vowels:
            num_vowels += 1
            if i > 0 and word[i-1].lower() in vowels:
                num_vowels -= 1
    if word.endswith('e'):
        num_vowels -= 1
    if num_vowels == 0:
        num_vowels = 1
    return num_vowels

#### Function to count complex words

In [11]:
def count_complex_words(text):
    words = text.split()
    return sum(1 for word in words if syllables.estimate(word) > 2)

def calculate_percentage_of_complex_words(text):
    words = word_tokenize(text)
    total_words = len(words)
    num_complex_words = count_complex_words(text)
    return (num_complex_words / total_words) if total_words else 0

#### #Step[7]:

#### Function to count personal pronouns

In [12]:
def count_personal_pronouns(text):
    personal_pronouns = ['I', 'we', 'my', 'ours', 'us']                  # Provided in Text Analysis.docx
    pattern = r'\b(' + '|'.join(personal_pronouns) + r')\b'
    regex = re.compile(pattern, flags=re.IGNORECASE)
    count = len(regex.findall(text))
    return count


#### Count personal pronouns

In [13]:
personalpronouns = []
for i, row in df.iterrows():
    url_id = row['URL_ID']
    with open(f'{url_id}.txt', 'r', encoding='utf-8') as file:
        text = file.read()
        personalpronouns.append(count_personal_pronouns(text))

#### #Step[8]:

#### Function to load stop words

In [14]:
def load_stop_words(file_path):
    with open(file_path, 'r', encoding='latin-1') as f:
        stop_words = [line.strip() for line in f]
    return set(stop_words)


#### Load stop words from multiple files

In [15]:
stop_words_files = [
    'D:/Journey of Data Science/Projects/Data extraction/DataSet/StopWords/StopWords_Auditor.txt',
    'D:/Journey of Data Science/Projects/Data extraction/DataSet/StopWords/StopWords_Geographic.txt',
    'D:/Journey of Data Science/Projects/Data extraction/DataSet/StopWords/StopWords_Currencies.txt',
    'D:/Journey of Data Science/Projects/Data extraction/DataSet//StopWords/StopWords_DatesandNumbers.txt',
    'D:/Journey of Data Science/Projects/Data extraction/DataSet/StopWords/StopWords_Generic.txt',
    'D:/Journey of Data Science/Projects/Data extraction/DataSet/StopWords/StopWords_GenericLong.txt',
    'D:/Journey of Data Science/Projects/Data extraction/DataSet/StopWords/StopWords_Names.txt'
]

stop_words = set()
for file in stop_words_files:
    stop_words.update(load_stop_words(file))

#### #Step[9]:

#### Function to normalize text

In [16]:
lemmatizer = WordNetLemmatizer()

def normalize_text(text):
    text = re.sub(r'[^\w\s.]', ' ', text).lower().strip()
    text = text.replace('.', ' FULL_STOP_TOKEN ')
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

#### Normalize the text

In [17]:
for i, row in df.iterrows():
    url_id = row['URL_ID']
    with open(f'{url_id}.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    normalized_text = normalize_text(text)
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
        file.write(normalized_text)

#### #Step[10]:

#### Function to calculate average word length

In [18]:
def calculate_avg_word_length(text):
    words = text.split()
    total_length = sum(len(word) for word in words)
    return (total_length / len(words)) if words else 0

#### #Step[11]

#### Calculate various scores and metrics

In [19]:
positive = []
negative = []
polarity = []
subjectivity = []
fogIndex = []
avgnum = []
complexwords = []
syllable = []
wordcount = []
avgwordlength = []


In [20]:
for i, row in df.iterrows():
    url_id = row['URL_ID']
    with open(f'{url_id}.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    
    sentences = text.split('FULL_STOP_TOKEN')
    num_sentences = len(sentences)
    text = text.replace('FULL_STOP_TOKEN', "")
    words = text.split()
    num_words = len(words)
    
    positive_score = get_positive_score(text)
    positive.append(positive_score)
    
    negative_score = get_negative_score(text)
    negative.append(negative_score)
    
    polarity.append((positive_score - negative_score) / ((positive_score + negative_score) + 0.000001))
    subjectivity.append((positive_score + negative_score) / ((num_words) + 0.000001))
    
    avg_sentence_length = num_words / num_sentences if num_sentences else 0
    percentage_of_complex_words = calculate_percentage_of_complex_words(text)
    fogIndex.append(0.4 * (avg_sentence_length + percentage_of_complex_words))
    
    avgnum.append(avg_sentence_length)
    complexwords.append(count_complex_words(text))
    wordcount.append(num_words)
    syllable.append(sum(count_syllables(word) for word in words))
    avgwordlength.append(calculate_avg_word_length(text))

#### #Step[12]:

#### Add calculated columns to DataFrame

In [21]:


# Add calculated columns to DataFrame
df["POSITIVE SCORE"] = positive
df["NEGATIVE SCORE"] = negative
df["POLARITY SCORE"] = polarity
df["SUBJECTIVITY SCORE"] = subjectivity
df["FOG INDEX"] = fogIndex
df["AVG NUMBER OF WORDS PER SENTENCE"] = avgnum
df["COMPLEX WORD COUNT"] = complexwords
df["WORD COUNT"] = wordcount
df["SYLLABLE PER WORD"] = syllable
df["PERSONAL PRONOUNS"] = personalpronouns
df["AVG WORD LENGTH"] = avgwordlength

# Save the final DataFrame to an Excel file
df.to_excel('Output Data Structure.xlsx', index=False)
df.to_csv('Output Data Structure.csv', index=False)
# Step 21: Display the first few rows of the DataFrame
df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...,0.049505,0.006601,0.764692,0.000185,4.202376,10.1,123,303,697,6,6.762376
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...,0.071586,0.035242,0.340203,0.000118,4.508832,10.809524,420,908,2215,8,7.122247
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...,0.064516,0.037419,0.26582,0.000132,5.306999,12.704918,436,775,2050,15,7.776774
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...,0.059761,0.108898,-0.291337,0.000224,5.593711,13.446429,405,753,1950,7,7.705179
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...,0.058594,0.019531,0.499994,0.000153,4.835014,11.636364,231,512,1186,8,7.179688
