# Data Extraction and Text Analysis for Blackcoffer company

## Mounting Google Drive:

In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that the cells
# below can read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing Necessary Packages:

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models (presumably what word_tokenize needs) and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading Input Excel File:

In [3]:
# Read the input workbook into a DataFrame. The scraping loop below reads its
# 'URL_ID' and 'URL' columns, one article per row.
df = pd.read_excel('/content/drive/MyDrive/Test Assignment/Input.xlsx')

## Fetching Data from URLs and Writing to Text Files:

In [4]:
# Download every article listed in df and save it as "<title>\n<body>" in
# TitleText/<URL_ID>.txt. Rows that cannot be fetched or parsed are reported
# and skipped, so nothing is ever written for them.

# Browser-like User-Agent; it never changes, so build it once outside the loop.
header = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
title_text_dir = '/content/drive/MyDrive/Test Assignment/TitleText/'
os.makedirs(title_text_dir, exist_ok=True)

# loop through each row in the df
for url_id, url in zip(df['URL_ID'], df['URL']):
  # make a request to the url; the timeout keeps one dead host from hanging the whole run
  try:
    response = requests.get(url, headers=header, timeout=30)
  except requests.RequestException:
    print("can't get response of {}".format(url_id))
    # Skip the row: carrying on would reuse the previous row's response and
    # save that article under this URL_ID.
    continue

  # create a beautifulsoup object
  try:
    soup = BeautifulSoup(response.content, 'html.parser')
  except Exception:
    print("can't get page of {}".format(url_id))
    continue

  # find title
  heading = soup.find('h1')
  if heading is None:
    print("can't get title of {}".format(url_id))
    continue
  title = heading.get_text()

  # find text; paragraphs are joined with a newline so the last word of one
  # paragraph does not fuse with the first word of the next
  article = "\n".join(p.get_text() for p in soup.find_all('p'))

  # write title and text to the file
  file_name = title_text_dir + str(url_id) + '.txt'
  with open(file_name, 'w', encoding='utf-8') as file:
    file.write(title + '\n' + article)

## Loading Stopwords and Text Files:

In [5]:
import os
from nltk.tokenize import word_tokenize

# Directories
text_dir = "/content/drive/MyDrive/Test Assignment/TitleText"
stopwords_dir = "/content/drive/MyDrive/Test Assignment/StopWords"
sentiment_dir = "/content/drive/MyDrive/Test Assignment/MasterDictionary"

# Load all stop words from the stopwords directory and store in the set variable.
# Entries are stripped and lower-cased because tokens are compared below via
# word.lower(); without this an upper-case stop-word list never matches.
stop_words = set()
for filename in os.listdir(stopwords_dir):
    file_path = os.path.join(stopwords_dir, filename)
    if os.path.isfile(file_path):  # Ensure it's a file, not a directory
        with open(file_path, 'r', encoding='ISO-8859-1') as f:
            for line in f.read().splitlines():
                # NOTE(review): keeps only the text before a '|' in case a list
                # annotates entries as "WORD | note"; lines without '|' are unaffected.
                entry = line.split('|')[0].strip().lower()
                if entry:
                    stop_words.add(entry)

# Load all text files from the TitleText directory and store in a list (docs).
# Each element of docs is the list of tokens of one file after removing stop
# words and tokens that contain no letter or digit (pure punctuation), so that
# len(doc) counts words only.
docs = []
for text_file in os.listdir(text_dir):
    file_path = os.path.join(text_dir, text_file)
    if os.path.isfile(file_path):  # Ensure it's a file, not a directory
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        words = word_tokenize(text)
        filtered_text = [
            word for word in words
            if word.lower() not in stop_words and any(ch.isalnum() for ch in word)
        ]
        docs.append(filtered_text)



## Calculating Sentiment Scores:



In [6]:
# Store positive and negative words from the sentiment directory.
# Entries are stripped and lower-cased because document tokens are looked up
# via word.lower().
pos = set()
neg = set()

for filename in os.listdir(sentiment_dir):
    file_path = os.path.join(sentiment_dir, filename)
    if not os.path.isfile(file_path):  # Ensure it's a file, not a directory
        continue
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        entries = {line.strip().lower() for line in f.read().splitlines() if line.strip()}
    # NOTE(review): every file other than 'positive-words.txt' is treated as a
    # negative-word list, so a stray file in this folder would pollute `neg`.
    if filename == 'positive-words.txt':
        pos.update(entries)
    else:
        neg.update(entries)

# Initialize lists to store scores (one entry per document in docs)
positive_words = []
negative_words = []
positive_score = []
negative_score = []
polarity_score = []
subjectivity_score = []

# Iterate through the list of docs to calculate scores
for doc in docs:
    positive_words = [word for word in doc if word.lower() in pos]
    negative_words = [word for word in doc if word.lower() in neg]
    n_positive = len(positive_words)
    n_negative = len(negative_words)
    total_words = len(doc)

    positive_score.append(n_positive)
    negative_score.append(n_negative)
    # Polarity is normalised by the number of sentiment-bearing words, giving a
    # value between -1 and +1; the small constant only guards against 0/0.
    # Dividing by total_words (as before) merely produced a signed variant of
    # the subjectivity ratio. NOTE(review): confirm against the scoring spec.
    polarity_score.append((n_positive - n_negative) / ((n_positive + n_negative) + 0.000001))
    subjectivity_score.append((n_positive + n_negative) / (total_words + 0.000001))

# Now we have calculated our scores and stored in positive_score, negative_score, polarity_score, and subjectivity_score lists.

## Calculating Text Analysis Metrics (Average Sentence Length, Complex Word Percentage, Fog Index, etc.):

In [7]:
# Average Sentence Length = the number of words / the number of sentences
# Percentage of Complex words = the number of complex words / the number of words
# Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)

# Per-document result lists; the loop that calls measure() appends one value
# to each of them per entry of text_dir.
avg_sentence_length = []
Percentage_of_Complex_words  =  []
Fog_Index = []
complex_word_count =  []
avg_syllable_word_count =[]

# English stop words from NLTK, bound to the name `stopwords` that the helper
# functions below read. The corpus reader is imported under its own alias here
# because `stopwords = set(stopwords.words('english'))` overwrote the reader
# with the set, so running this cell a second time raised AttributeError.
from nltk.corpus import stopwords as nltk_stopwords_corpus
stopwords = set(nltk_stopwords_corpus.words('english'))
def _vowel_count(word):
  """Return the number of vowel letters (a, e, i, o, u; any case) in word."""
  return sum(1 for letter in word if letter.lower() in 'aeiou')


def measure(file, directory=None, stop_words=None):
  """Compute readability metrics for one text file.

  Parameters:
    file: name of the file inside `directory`.
    directory: folder containing the file; defaults to the notebook-level
      `text_dir`.
    stop_words: collection of lower-case words to ignore; defaults to the
      notebook-level `stopwords` set.

  Returns a 5-tuple:
    (average sentence length, fraction of complex words, fog index,
     complex word count, average syllables per word)
  where
    average sentence length = number of words / number of sentences,
    fraction of complex words = complex words / number of words,
    fog index = 0.4 * (average sentence length + fraction of complex words).
  A document with no words or no sentences yields zeros instead of raising
  ZeroDivisionError.
  """
  if directory is None:
    directory = text_dir
  if stop_words is None:
    stop_words = stopwords

  with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
    text = f.read()

  # remove punctuation but keep '.', which is the sentence delimiter
  text = re.sub(r'[^\w\s.]', '', text)

  # split into sentences; blank fragments (e.g. after the final '.') are not sentences
  sentences = [fragment for fragment in text.split('.') if fragment.strip()]
  num_sentences = len(sentences)

  # words: whitespace-separated tokens without their surrounding periods,
  # skipping stop words and tokens that consisted of periods only
  words = []
  for token in text.split():
    token = token.strip('.')
    if token and token.lower() not in stop_words:
      words.append(token)
  num_words = len(words)

  if num_words == 0 or num_sentences == 0:
    return 0.0, 0.0, 0.0, 0, 0.0

  # complex words: more than two vowels (vowels stand in for syllables)
  complex_words = [word for word in words if _vowel_count(word) > 2]

  # syllables per word: vowels are counted after dropping a trailing "es"/"ed",
  # which is not counted as a syllable; words left without a vowel are ignored
  syllable_count = 0
  syllable_word_total = 0
  for word in words:
    if word.lower().endswith(('es', 'ed')):
      word = word[:-2]
    syllable_count_word = _vowel_count(word)
    if syllable_count_word >= 1:
      syllable_word_total += 1
      syllable_count += syllable_count_word

  avg_sentence_len = num_words / num_sentences
  avg_syllables_per_word = syllable_count / syllable_word_total if syllable_word_total else 0.0
  percent_complex_words = len(complex_words) / num_words
  fog_index = 0.4 * (avg_sentence_len + percent_complex_words)

  return avg_sentence_len, percent_complex_words, fog_index, len(complex_words), avg_syllables_per_word

# Run measure() on every entry of text_dir and distribute its five results,
# in order, over the matching per-document lists.
metric_lists = (
  avg_sentence_length,
  Percentage_of_Complex_words,
  Fog_Index,
  complex_word_count,
  avg_syllable_word_count,
)
for file in os.listdir(text_dir):
  for metric_list, value in zip(metric_lists, measure(file)):
    metric_list.append(value)

## Calculating Word Count and Average Word Length:

In [8]:
# Word Count: the number of cleaned words in the text, i.e. the words left after
#   - removing the stop words (using the stopwords corpus of the nltk package), and
#   - removing punctuation such as ? ! , . from the text before counting.
# Average Word Length = sum of the number of characters in each word / total number of words

def cleaned_words(file, directory=None, stop_words=None):
  """Return (word count, average word length) for one text file.

  Punctuation (everything that is neither a word character nor whitespace) is
  removed, the text is split on whitespace and stop words are dropped.

  Parameters:
    file: name of the file inside `directory`.
    directory: folder containing the file; defaults to the notebook-level
      `text_dir`.
    stop_words: collection of lower-case words to ignore; defaults to the
      notebook-level `stopwords` set.

  Returns:
    (number of remaining words, total characters / number of words).
    A file with no remaining words yields (0, 0.0) instead of raising
    ZeroDivisionError.
  """
  if directory is None:
    directory = text_dir
  if stop_words is None:
    stop_words = stopwords

  with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
    text = f.read()

  text = re.sub(r'[^\w\s]', '' , text)
  words = [word for word in text.split() if word.lower() not in stop_words]
  if not words:
    return 0, 0.0

  total_characters = sum(len(word) for word in words)
  return len(words), total_characters / len(words)

# One (word count, average word length) pair per entry of text_dir, in listing
# order, split into the two parallel result lists.
word_stats = [cleaned_words(file) for file in os.listdir(text_dir)]
word_count = [count for count, _ in word_stats]
average_word_length = [mean_length for _, mean_length in word_stats]



## Calculating Personal Pronouns:

In [9]:
# To calculate Personal Pronouns mentioned in the text, we use regex to find
# the counts of the words - “I,” “we,” “my,” “ours,” and “us”. Special care is taken
#  so that the country name US is not included in the list.
# Whole-word matches for I / we / my / ours / us. The first letter of the
# multi-letter pronouns may be capitalised (sentence start), but the all-caps
# country name "US" can never match because its second letter must be lower case.
# "I" is matched in upper case only.
_PERSONAL_PRONOUN_RE = re.compile(r"\b(?:I|[Ww]e|[Mm]y|[Oo]urs|[Uu]s)\b")


def count_personal_pronouns(file, directory=None):
  """Count occurrences of the personal pronouns I, we, my, ours and us in a file.

  Parameters:
    file: name of the file inside `directory`.
    directory: folder containing the file; defaults to the notebook-level
      `text_dir`.

  Returns:
    The number of whole-word pronoun matches. Capitalised forms such as "We"
    or "My" are counted; the country name "US" is not.
  """
  if directory is None:
    directory = text_dir
  with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
    text = f.read()
  return len(_PERSONAL_PRONOUN_RE.findall(text))

# Personal-pronoun count for every entry of text_dir, in listing order.
pp_count = [count_personal_pronouns(file) for file in os.listdir(text_dir)]

## Loading and Modifying Output DataFrame

In [10]:
# Load the existing output DataFrame
output_template = pd.read_excel('/content/drive/MyDrive/Test Assignment/Output Data Structure.xlsx')

# URL_IDs that couldn't be accessed and therefore have no text file
# (adjust for your dataset).
inaccessible_url_ids = [44, 57, 144]

# Prefer selecting the rows by their URL_ID value: that does not depend on the
# first URL_ID being 37 or on the ids being contiguous.
id_mask = None
if 'URL_ID' in output_template.columns:
    id_mask = output_template['URL_ID'].isin(inaccessible_url_ids)

if id_mask is not None and id_mask.sum() == len(inaccessible_url_ids):
    output_df = output_template[~id_mask].copy()
else:
    # Fallback: the URL_ID column is missing or does not hold these values, so
    # drop by row label, assuming row 0 corresponds to URL_ID 37.
    first_url_id = 37
    output_df = output_template.drop(
        [url_id - first_url_id for url_id in inaccessible_url_ids], axis=0
    )

# Verify the number of rows in output_df
print(f"Number of rows after dropping: {len(output_df)}")


Number of rows after dropping: 144


## Saving Output Data to CSV:

In [11]:
# Assign calculated variables to the output DataFrame.
# The metric lists were filled while iterating os.listdir(text_dir), so entry i
# belongs to the i-th listed file, which is not necessarily the i-th row of
# output_df. Rows are therefore matched on URL_ID (the text files are named
# "<URL_ID>.txt") whenever that is possible.
metric_columns = {
    'AVG_SENTENCE_LENGTH': avg_sentence_length,
    'PERCENTAGE_OF_COMPLEX_WORDS': Percentage_of_Complex_words,
    'FOG_INDEX': Fog_Index,
    'COMPLEX_WORD_COUNT': complex_word_count,
    'WORD_COUNT': word_count,
    'AVG_SYLLABLES_PER_WORD': avg_syllable_word_count,
    'PERSONAL_PRONOUNS': pp_count,
    'AVG_WORD_LENGTH': average_word_length,
}

# NOTE(review): assumes os.listdir(text_dir) returns the same order here as it
# did in the metric cells (the directory must not have changed in between).
document_ids = [os.path.splitext(name)[0] for name in os.listdir(text_dir)]
lists_match_listing = all(len(values) == len(document_ids) for values in metric_columns.values())
ids_are_unique = len(set(document_ids)) == len(document_ids)

row_ids = output_df['URL_ID'].astype(str) if 'URL_ID' in output_df.columns else None
can_align_by_id = (
    row_ids is not None
    and lists_match_listing
    and ids_are_unique
    and row_ids.isin(document_ids).any()
)

if can_align_by_id:
    metrics_by_id = pd.DataFrame(metric_columns, index=document_ids)
    for column in metric_columns:
        output_df[column] = row_ids.map(metrics_by_id[column])
    missing_rows = int((~row_ids.isin(document_ids)).sum())
    if missing_rows:
        print(f"{missing_rows} row(s) have no matching text file; their metrics are left empty.")
else:
    # Fallback: positional assignment, truncated to the number of rows.
    for column, values in metric_columns.items():
        output_df[column] = values[:len(output_df)]

# NOTE(review): positive_score, negative_score, polarity_score and
# subjectivity_score are computed earlier in the notebook but are not written
# to the output here — confirm whether they should be.

# Save the DataFrame to a CSV file
output_df.to_csv('/content/drive/MyDrive/Test Assignment/Output_Data.csv', index=False)

print("Output data saved successfully.")

Output data saved successfully.
