# Blackcoffer Web Scraping Using Scrapy

In [None]:
import scrapy
import pandas as pd

class BlackcofferSpider(scrapy.Spider):
    name = "blackcoffer"

    # Read URLs and IDs from an Excel file
    def start_requests(self):
        excel_path = r"D:\Blackcoffer\Input.xlsx" # Replace with the actual path
        df = pd.read_excel(excel_path)
        for index, row in df.iterrows():
            url = row['URL']
            identifier = row['URL_ID']
            yield scrapy.Request(url=url, meta={'identifier': identifier}, callback=self.parse_page)

    # Parse individual pages
    def parse_page(self, response):
        identifier = response.meta['identifier']

        # Extract title
        title = response.xpath('//h1/text()').get()

        # Extract content from multiple paths
        content_paths = ['//div[@class="td-post-content tagdiv-type"]/p/text()','//div[@class="tdb-block-inner td-fix-index"]/p/text()']  # Customize the XPath expressions
        content_text = []

        for path in content_paths:
            content = response.xpath(path).getall()
            content_text.extend(content)

        # #Extract content (customize the XPath accordingly)
        # content = response.xpath('//div[@class="td-post-content tagdiv-type"]/p/text()').getall()
        # Join the content into a single string
        content_text = '\n'.join(content_text).strip()
        # Create the content to be saved in the text file
        file_content = f"Title: {title}\n\n{content_text}"

        # Save content in a text file
        if file_content:
            filename = f"{identifier}.txt"
            with open(filename, 'w',encoding='utf-8') as file:
                file.write(file_content)


# Blackcoffer Sentiment Analysis Project

## Import Library

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PUJA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df=pd.read_excel(r"Output Data Structure.XLSX")
df.head()

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,,,,,,,,,,,,,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,,,,,,,,,,,,,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,


# 1. Sentiment Analysis
## 1.1 Cleaning using Stop Words Lists

In [5]:
#Retriving path for StopWords
stop_word_path= r"D:\Blackcoffer\StopWords"
stop_word_files=os.listdir(stop_word_path)
stop_word_files

['StopWords_Auditor.txt',
 'StopWords_Currencies.txt',
 'StopWords_DatesandNumbers.txt',
 'StopWords_Generic.txt',
 'StopWords_GenericLong.txt',
 'StopWords_Geographic.txt',
 'StopWords_Names.txt']

In [6]:
#coverting all the stopword files into list
stop_words_list = []
for i in tqdm(stop_word_files):
    path=stop_word_path+'\\'+i
    #print(path)
    with open(path, 'r',encoding='latin-1') as file:
        for line in file:
            stop_words_list.append(line.strip())

  0%|          | 0/7 [00:00<?, ?it/s]

In [7]:
stop_words_list

['ERNST',
 'YOUNG',
 'DELOITTE',
 'TOUCHE',
 'KPMG',
 'PRICEWATERHOUSECOOPERS',
 'PRICEWATERHOUSE',
 'COOPERS',
 'AFGHANI  | Afghanistan',
 'ARIARY | Madagascar',
 'BAHT | Thailand',
 'BALBOA | Panama',
 'BIRR | Ethiopia',
 'BOLIVAR | Venezuela',
 'BOLIVIANO  | Bolivia',
 'CEDI | Ghana',
 'COLON  | Costa Rica',
 'CÓRDOBA  | Nicaragua',
 'DALASI | Gambia',
 'DENAR | Macedonia (Former Yug. Rep.)',
 'DINAR | Algeria',
 'DIRHAM  | Morocco',
 'DOBRA | São Tom and Príncipe',
 'DONG | Vietnam',
 'DRAM | Armenia',
 'ESCUDO  | Cape Verde',
 'EURO  | Belgium',
 'FLORIN | Aruba',
 'FORINT | Hungary',
 'GOURDE | Haiti',
 'GUARANI | Paraguay',
 'GULDEN | Netherlands Antilles',
 'HRYVNIA  | Ukraine',
 'KINA | Papua New Guinea',
 'KIP | Laos',
 'KONVERTIBILNA MARKA  | Bosnia-Herzegovina',
 'KORUNA  | Czech Republic',
 'KRONA | Sweden',
 'KRONE | Denmark',
 'KROON | Estonia',
 'KUNA | Croatia',
 'KWACHA | Zambia',
 'KWANZA | Angola',
 'KYAT | Myanmar',
 'LARI | Georgia',
 'LATS | Latvia',
 'LEK | Alba

## Cleaning Stopwords

In [8]:

stopword_own = []

for entry in stop_words_list:
    parts = entry.lower().split('|')
    if len(parts) == 2:  # Check if the split produced two parts
        stopword_own.append(parts[0].strip())  # Extract and append the currency name
    else:
        stopword_own.append(parts[0])
        

In [9]:
def clean_sent(text):
    words = word_tokenize(text.lower()) #tokenization and case conversion
    words = [i for i in words  if i.isalpha()]
    clean_text=[i for i in words if i not in stopword_own ]
    return clean_text

# 1.2 Creating a dictionary of Positive and Negative words

In [10]:
path=r"D:\Blackcoffer\MasterDictionary"
master_dic=os.listdir(path)

In [11]:
negative_path=path+'\\'+master_dic[0]
negative_words=[]
with open(negative_path, 'r',encoding = 'latin-1') as file:
    for line in file:
        negative_words.append(line.lower().strip())

In [12]:
positive_path=path+'\\'+master_dic[1]
positive_words=[]
with open(positive_path, 'r',encoding = 'latin-1') as file:
    for line in file:
        positive_words.append(line.lower().strip())

In [13]:
master_dictionary={'Negative':negative_words,'Positive':positive_words}

In [14]:
master_dictionary

{'Negative': ['2-faced',
  '2-faces',
  'abnormal',
  'abolish',
  'abominable',
  'abominably',
  'abominate',
  'abomination',
  'abort',
  'aborted',
  'aborts',
  'abrade',
  'abrasive',
  'abrupt',
  'abruptly',
  'abscond',
  'absence',
  'absent-minded',
  'absentee',
  'absurd',
  'absurdity',
  'absurdly',
  'absurdness',
  'abuse',
  'abused',
  'abuses',
  'abusive',
  'abysmal',
  'abysmally',
  'abyss',
  'accidental',
  'accost',
  'accursed',
  'accusation',
  'accusations',
  'accuse',
  'accuses',
  'accusing',
  'accusingly',
  'acerbate',
  'acerbic',
  'acerbically',
  'ache',
  'ached',
  'aches',
  'achey',
  'aching',
  'acrid',
  'acridly',
  'acridness',
  'acrimonious',
  'acrimoniously',
  'acrimony',
  'adamant',
  'adamantly',
  'addict',
  'addicted',
  'addicting',
  'addicts',
  'admonish',
  'admonisher',
  'admonishingly',
  'admonishment',
  'admonition',
  'adulterate',
  'adulterated',
  'adulteration',
  'adulterier',
  'adversarial',
  'adversary'

# 1.3 Extracting Derived variables

In [16]:
text_data_path=r"D:\Blackcoffer\Text"

In [17]:
for index, i in tqdm(enumerate(df['URL_ID'])):
    path_textfile = text_data_path + '\\' + str(i) + '.txt'
    try:
        with open(path_textfile, 'r',encoding='latin-1') as file:
            content = file.read()
            
            clean_content = clean_sent(content)
            
            positive_score = sum(1 for word in clean_content if word.lower() in master_dictionary['Positive'])
            negative_score = sum(1 for word in clean_content if word.lower() in master_dictionary['Negative'])
            
            # Update the DataFrame using .loc
            df.loc[index, 'POSITIVE SCORE'] = positive_score
            df.loc[index, 'NEGATIVE SCORE'] = negative_score
            df.loc[index, 'SUBJECTIVITY SCORE'] = (positive_score + negative_score) / (len(clean_content) + 0.000001)
            df.loc[index, 'POLARITY SCORE'] = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    except FileNotFoundError:
        print(f"URL_ID: {i} - 404 error")
    except Exception as e:
        print(f"URL_ID: {i} - Error: {e}")

0it [00:00, ?it/s]

URL_ID: 11668.0 - 404 error
URL_ID: 17671.4 - 404 error


## 2. Analysis of Readability
## 3.AVG NUMBER OF WORDS PER SENTENCE
## 4. COMPLEX WORD COUNT

In [18]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))  # or replace 'english' with your desired language

In [19]:
for index, i in tqdm(enumerate(df['URL_ID'])):
    path_textfile = text_data_path + '\\' + str(i) + '.txt'
    try:
        with open(path_textfile, 'r',encoding='latin-1') as file:
            content = file.read()
            
            # Find sentence length
            text = re.sub(r'[^\w\s.]','',content)
            sentences = text.split('.')
            num_sentences = len(sentences)
    
            #Find length of word
            words = [word  for word in text.split() if word.lower() not in stopwords ]
            words = [i for i in words  if i.isalpha()]
            num_words=len(words)
            
            #Average Sentence Length
            average_sentence_length = num_words / num_sentences
            
            # Calculate the number of complex words
            complex_words = []
            for word in words:
                vowels = 'aeiou'
                syllable_count_word = sum( 1 for letter in word if letter.lower() in vowels)
                if syllable_count_word > 2:
                    complex_words.append(word)
            percentage_complex_words = (len(complex_words) / len(words)) * 100
            
            
            #fog_index
            fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
            
             # Update the DataFrame using .loc
            df.loc[index, 'AVG SENTENCE LENGTH'] = average_sentence_length
            df.loc[index, 'PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_words
            df.loc[index, 'COMPLEX WORD COUNT'] = len(complex_words)
            df.loc[index, 'FOG INDEX'] = fog_index
            df.loc[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = average_sentence_length
            
    except FileNotFoundError:
        print(f"URL_ID: {i} - 404 web not working")
    except Exception as e:
        print(f"URL_ID: {i} - Error: {e}")      

0it [00:00, ?it/s]

URL_ID: 11668.0 - 404 web not working
URL_ID: 17671.4 - 404 web not working


# 5.Word Count

In [20]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PUJA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
from string import punctuation

for index, i in tqdm(enumerate(df['URL_ID'])):
    path_textfile = text_data_path + '\\' + str(i) + '.txt'
    try:
        with open(path_textfile, 'r',encoding='latin-1') as file:
            content = file.read()
            
            # Tokenize the text into words
            words = nltk.word_tokenize(content.lower())
    
            # Remove punctuation
            words = [word for word in words if word not in punctuation]
            
            # Remove stopwords
            stop_words = set(stopwords.words('english'))
            words = [word for word in words if word not in stop_words]
            
            # Count the cleaned words
            word_count = len(words)
            df.loc[index, 'WORD COUNT'] = word_count
            
    except FileNotFoundError:
        print(f"URL_ID: {i} - 404 web not working")
    except Exception as e:
        print(f"URL_ID: {i} - Error: {e}")    

0it [00:00, ?it/s]

URL_ID: 11668.0 - 404 web not working
URL_ID: 17671.4 - 404 web not working


# 7.Personal Pronouns

In [22]:

# Assuming you have your list of URL_IDs in 'output['URL_ID']'
for index, i in tqdm(enumerate(df['URL_ID'])):
    path_textfile = text_data_path + '\\' + str(i) + '.txt'
    try:
        with open(path_textfile, 'r', encoding='latin-1') as file:
            content = file.read()
            
            # Add the regular expression pattern
            pattern = r'\b(?:I|we|my|ours|us)\b'
            
            # Find and count personal pronouns in content
            matches = re.findall(pattern, content, re.IGNORECASE)
            count = len(matches)
            df.loc[index, 'PERSONAL PRONOUNS'] = count
            
            
    except FileNotFoundError:
        print(f"URL_ID: {i} - 404 web not working")
    except Exception as e:
        print(f"URL_ID: {i} - Error: {e}")

0it [00:00, ?it/s]

URL_ID: 11668.0 - 404 web not working
URL_ID: 17671.4 - 404 web not working


# 8. Average Word Length


In [23]:
for index, i in tqdm(enumerate(df['URL_ID'])):
    path_textfile = text_data_path + '\\' + str(i) + '.txt'
    try:
        with open(path_textfile, 'r',encoding='latin-1') as file:
            content = file.read()
            words=nltk.word_tokenize(content.lower())
            words = [i for i in words  if i.isalpha()]
            total_characters = sum(len(word) for word in words)
            total_words=len(words)
            average_length = total_characters / total_words
            df.loc[index, 'AVG WORD LENGTH'] = average_length
            
            
    except FileNotFoundError:
        print(f"URL_ID: {i} - 404 web not working")
    except Exception as e:
        print(f"URL_ID: {i} - Error: {e}")
            

0it [00:00, ?it/s]

URL_ID: 11668.0 - 404 web not working
URL_ID: 17671.4 - 404 web not working


# Syllable Count Per Word

In [24]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))  # or replace 'english' with your desired language

In [25]:
from tqdm import tqdm
import re

def count_syllables(words):
    syllable_count = 0
    syllable_words = []
    vowels = 'aeiou'

    for word in words:
        if word.endswith('es'):
            word = word[:-2]
        elif word.endswith('ed'):
            word = word[:-2]

        syllable_count_word = sum(1 for letter in word if letter.lower() in vowels)
        
        if syllable_count_word >= 1:
            syllable_words.append(word)
            syllable_count += syllable_count_word
            
    return syllable_count, syllable_words

# Assuming you have defined the 'stopwords' list and the 'output' DataFrame

for index, i in tqdm(enumerate(df['URL_ID'])):
    path_textfile = text_data_path + '\\' + str(i) + '.txt'
    try:
        with open(path_textfile, 'r', encoding='latin-1') as file:
            content = file.read()
        
            text = re.sub(r'[^\w\s.]', '', content)
            words = [word for word in text.split() if word.lower() not in stopwords]
            words = [i for i in words if i.isalpha()]
            syllable_count, syllable_words = count_syllables(words)
            
            if len(syllable_words) > 0:
                Syllable_Count_Per_Word = syllable_count / len(syllable_words)
                df.loc[index, 'SYLLABLE PER WORD'] = Syllable_Count_Per_Word
    except FileNotFoundError:
        print(f"URL_ID: {i} - 404 web not working")
    except Exception as e:
        print(f"URL_ID: {i} - Error: {e}")


55it [00:00, 260.09it/s]

URL_ID: 11668.0 - 404 web not working
URL_ID: 17671.4 - 404 web not working


114it [00:00, 229.95it/s]


In [26]:
df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,74.0,23.0,0.525773,0.122785,11.354430,59.754738,28.443667,11.354430,536.0,978.0,2.893973,2.0,5.506748
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.200000,12.200000,65.245902,30.978361,12.200000,199.0,334.0,2.996721,3.0,5.376054
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,18.0,27.0,-0.200000,0.098684,7.351351,53.860294,24.484658,7.351351,293.0,633.0,2.627778,3.0,5.180488
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,30.0,26.0,0.071429,0.107075,11.254545,53.634895,25.955776,11.254545,332.0,684.0,2.630470,7.0,5.434579
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,30.0,26.0,0.071429,0.107075,11.254545,53.634895,25.955776,11.254545,332.0,684.0,2.630470,7.0,5.434579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,7.0,-0.166667,0.064516,10.000000,50.833333,24.333333,10.000000,122.0,284.0,2.624473,1.0,5.146226
110,51382.8,https://insights.blackcoffer.com/coronavirus-i...,17.0,45.0,-0.451613,0.098569,16.333333,39.135654,22.187595,16.333333,326.0,899.0,2.473424,2.0,5.158351
111,51844.6,https://insights.blackcoffer.com/what-are-the-...,80.0,31.0,0.441441,0.143969,12.943662,47.442873,24.154614,12.943662,436.0,1001.0,2.505470,0.0,5.174214
112,52306.4,https://insights.blackcoffer.com/marketing-dri...,20.0,21.0,-0.024390,0.075926,11.516667,47.322721,23.535755,11.516667,327.0,769.0,2.502216,6.0,4.936867


In [27]:
# Assuming 'output' is your DataFrame
excel_filename = 'output_new.xlsx'  # Provide the desired filename

# Specify the full path if you want to save it in a specific location
full_path = r'D:\Blackcoffer' + excel_filename

# Or, just provide the filename and save it in the current working directory
df.to_excel(excel_filename, index=False)