# Import modules

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import random
import time
import threading
import queue
import IPython
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Navigating HTML Structures of One link

### Request and parse the HTML page using the requests and Beautiful Soup modules

In [None]:
# URL of the single Dot Esports article used to explore the page structure
dota_news = 'https://dotesports.com/dota-2/news/pure-primed-to-shed-ruined-reputation-to-prove-hes-the-best-dota-2-player-in-the-world' # you can follow the link if you want

# Fetch the article. The timeout keeps the cell from hanging forever
# when the server accepts the connection but never answers.
response = requests.get(dota_news, timeout=10)

# Anything other than 200 aborts the cell.
# Status code 403 means forbidden: you don't have permission to access this resource.
if response.status_code != 200:
    raise Exception('Unable to scrape. Status code: {}'.format(response.status_code))

# The request succeeded, so parse the HTML document
soup = BeautifulSoup(response.text, 'lxml')
print('Scraped')

Scraped


### Scraping DOTA 2 News Data

#### Parse the whole html file

In [None]:
# Keep a second reference to the parsed document and print its full markup
html_content = soup
print(html_content)

<!DOCTYPE html>
<html lang="en-US">
<head>
<script>
												window.gg_cfproxy_active_features = window.gg_cfproxy_active_features || [];
												window.gg_cfproxy_active_features = [{"name":"updated-homepage","value":false,"on":false,"off":true,"source":"defaultValue","ruleId":""},{"name":"latest-page","value":false,"on":false,"off":true,"source":"defaultValue","ruleId":""},{"name":"latest-page-with-updated-homepage","value":false,"on":false,"off":true,"source":"defaultValue","ruleId":""},{"name":"gamurs-network-account","value":true,"on":true,"off":false,"source":"force","ruleId":""},{"name":"commenting-experience","value":true,"on":true,"off":false,"source":"force","ruleId":""},{"name":"connatix-player","value":false,"on":false,"off":true,"source":"defaultValue","ruleId":""},{"name":"finite-article-experience","value":false,"on":false,"off":true,"source":"defaultValue","ruleId":""},{"name":"custom-article-tile-image-sizes","value":true,"on":true,"off":false,"source":"force","

#### Parse the news title using 'find'

In [None]:
# Locate the article headline: the <h1> that carries the header-title class.
# attrs is a dict (attribute name -> value). A set literal here would be read
# by Beautiful Soup as "match any of these class names", which also matches
# elements whose class is literally "class".
dota_news_title = soup.find('h1', {'class': 'wp-block-gamurs-article-header__content--title'})
print('Title: ', dota_news_title)

Title:  <h1 class="wp-block-gamurs-article-header__content--title">
Pure primed to shed ‘ruined’ reputation to prove he’s the best Dota 2 player in the world </h1>


In [None]:
# .text drops the surrounding <h1> tag and yields only the headline string
print('Title: ', dota_news_title.text)

Title:  
Pure primed to shed ‘ruined’ reputation to prove he’s the best Dota 2 player in the world 


#### Parse news content in the 'div' using 'find'

In [None]:
# Grab the <div> that wraps the whole article body.
# attrs is a dict (attribute name -> value); a set literal would be treated
# as a list of alternative class names rather than one attribute filter.
dota_news_content = soup.find('div', {'class': 'wp-block-gamurs-article-content'})

print(dota_news_content)

<div class="wp-block-gamurs-article-content">
<p data-characters="297" data-injectable="true" data-video="true">As a <em>Dota 2</em> player with a history intertwined with controversy, some expect Ivan “Pure” Moskalenko to be cocky or perhaps avoidant of their past relative to their future. However, as he approaches the final weekend of ESL One Birmingham, Pure is ready to shake off the old and welcome the new.</p><div class="primis-player-container"><div class="primis-player-title">Recommended Videos</div><div class="primis-player"></div></div> <p data-characters="587" data-current-count="587" data-injectable="true">A carry for Tundra Esports in Birmingham on loan from BetBoom, Pure has played an integral part in the team’s advances through the competition. Despite his talent, he has become ensnared in controversy and criticism since he began his professional career. April 2022 saw his <a href="https://dotesports.com/dota-2/news/virtus-pro-terminates-contract-with-pure-following-disqu

### Parse all the 'p' or paragraph using 'find_all'

In [None]:
# Collect every <p> tag nested inside the article-content div
paragraph_list = dota_news_content.find_all('p')
print(paragraph_list)

[<p data-characters="297" data-injectable="true" data-video="true">As a <em>Dota 2</em> player with a history intertwined with controversy, some expect Ivan “Pure” Moskalenko to be cocky or perhaps avoidant of their past relative to their future. However, as he approaches the final weekend of ESL One Birmingham, Pure is ready to shake off the old and welcome the new.</p>, <p data-characters="587" data-current-count="587" data-injectable="true">A carry for Tundra Esports in Birmingham on loan from BetBoom, Pure has played an integral part in the team’s advances through the competition. Despite his talent, he has become ensnared in controversy and criticism since he began his professional career. April 2022 saw his <a href="https://dotesports.com/dota-2/news/virtus-pro-terminates-contract-with-pure-following-disqualification-from-dota-pro-circuit-regional-playoffs" rel="noreferrer noopener" target="_blank">contract at Virtus.pro terminated</a> for the <a href="https://dotesports.com/dota

# Navigating HTML Structures of Multiple links

In [None]:
# Load the article URLs; the CSV must provide a 'dota links' column
links_to_scrape = pd.read_csv('dota2_news_links.csv')

paragraph_news_list = []  # one list of <p> tags per article
title_list = []           # headline text per article, in the same order

# Scrape every news link in turn
for link in links_to_scrape['dota links']:
    # The timeout stops a single unresponsive request from stalling the loop
    response = requests.get(link, timeout=10)

    # Any non-200 answer aborts the whole run
    if response.status_code != 200:
        raise Exception('Unable to scrape. Status code: {}'.format(response.status_code))

    soup = BeautifulSoup(response.text, 'lxml')

    # Article body and its paragraphs. attrs is a dict (attribute -> value);
    # a set literal would be read as a list of alternative class names.
    dota_news_content = soup.find('div', {'class': 'wp-block-gamurs-article-content'})
    paragraph_list = dota_news_content.find_all('p')

    # Headline element
    title = soup.find('h1', {'class': 'wp-block-gamurs-article-header__content--title'})

    # Store the raw paragraph tags and the headline text
    paragraph_news_list.append(paragraph_list)
    title_list.append(title.text)

    print('Scraped link: {}'.format(link))

Scraped link: https://dotesports.com/dota-2/news/after-talent-trees-innates-and-facets-dota-2-players-are-now-asking-whats-next
Scraped link: https://dotesports.com/dota-2/news/dota-2-players-praise-crownfall-as-one-of-the-greatest-events
Scraped link: https://dotesports.com/dota-2/news/valves-surprise-dota-2-fighting-game-is-showing-signs-of-staying-power
Scraped link: https://dotesports.com/dota-2/news/dota-2-sleet-fighter-how-to-play-best-heroes-tips-tricks
Scraped link: https://dotesports.com/dota-2/news/wait-for-dota-2-crownfall-act-is-finally-over-but-not-ringmaster
Scraped link: https://dotesports.com/dota-2/news/dota-2-devs-rush-out-patch-for-rampant-meepo-exploit-ravaging-ranked
Scraped link: https://dotesports.com/dota-2/news/dota-2s-next-crownfall-act-delayed-to-june-39-no-thats-not-a-typo
Scraped link: https://dotesports.com/dota-2/news/only-2-dota-2-heroes-went-unpicked-during-ti-2024-qualifiers
Scraped link: https://dotesports.com/dota-2/news/dota-2-patch-7-36c-notes-all-

In [None]:
# Number of scraped articles (one entry is appended per successfully fetched link)
len(paragraph_news_list)

43

In [None]:
# Display the raw result: a list holding, per article, its list of <p> tags
paragraph_news_list

[[<p data-characters="317" data-injectable="true" data-video="true"><em>Dota 2 </em>has come a long way since its release in 2013. From featuring nothing more than hero abilities and their respective stats, the game’s formula has evolved to include Talent trees and other features, alongside the recently-added Innates and Facets—and players are now wondering what’s coming next. </p>,
  <p data-characters="637" data-current-count="637" data-injectable="true">“What’s next in <em><a href="https://dotesports.com/dota-2" rel="noreferrer noopener" target="_blank">Dota</a></em>‘s evolution,” asked a player in a <a href="https://www.reddit.com/r/DotA2/comments/1e38161/talent_trees_neutral_items_aghs_upgrades_innates/" rel="noreferrer noopener" target="_blank">July 14 Reddit thread</a>, where they highlighted just how far we are from the game’s original design. Aghanim’s Scepter now works on essentially every hero in the game, Neutral items can give heroes game-changing abilities, and the Innate

# Throttling & Avoiding Detection

In [None]:
# Reuters article used in the requests below (first without, then with custom headers)
tate_news_link = 'https://www.reuters.com/world/europe/romanian-court-maintains-influencer-andrew-tates-travel-restrictions-2024-07-16/'

In [None]:
# Request the article without any custom headers
response = requests.get(tate_news_link, timeout=10)

# Check the status before parsing, whichever site the link points to.
# Status code 403 means forbidden: you don't have permission to access this resource.
# Status code 401 means the request lacks valid authentication credentials.
if response.status_code != 200:
    raise Exception('Unable to scrape. Status code: {}'.format(response.status_code))

soup = BeautifulSoup(response.text, 'lxml')
print('Scraped')

# Placeholders for site-specific parsing, chosen by the link's domain
if tate_news_link.startswith("https://www.reuters.com/"):
    pass  # add code for parsing
elif tate_news_link.startswith("https://www.abscbn.com/"):
    pass  # add code for parsing

Exception: Unable to scrape. Status code: 401

## Setting Header (User-Agent, Referer, Language)

In [None]:
# Request headers that make the scraper look like an ordinary browser visit
header = {
    # Browser identity: desktop Chrome on 64-bit Windows 10
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
    # Page the request claims to come from: a Reuters site search
    'Referer': 'https://www.reuters.com/site-search/?query=andrew+tate',
    # Preferred response languages: US English first, then any English
    'Accept-Language': 'en-US,en;q=0.9',
}

In [None]:
# Request the same article again, this time sending the browser-like headers.
# The timeout keeps the cell from hanging if the server never responds.
response = requests.get(tate_news_link, headers=header, timeout=10)

# Anything other than 200 aborts the cell.
# Status code 403 means forbidden: you don't have permission to access this resource.
# Status code 401 means the request lacks valid authentication credentials.
if response.status_code != 200:
    raise Exception('Unable to scrape. Status code: {}'.format(response.status_code))

# The request succeeded, so parse the HTML document
soup = BeautifulSoup(response.text, 'lxml')
print('Scraped')

Scraped


In [None]:
# Print the parsed document to confirm the article HTML was actually returned
print(soup)

<!DOCTYPE html>
<html data-layout="regular-article" lang="en"><head><title>Romanian court reinstates influencer Andrew Tate's travel ban | Reuters</title><meta content="2024-08-02T11:53:21.736Z" name="render_timestamp"/><meta content="width=device-width, initial-scale=1" name="viewport"/><meta content="app-id=602660809, app-argument=https://www.reuters.com/world/europe/romanian-court-maintains-influencer-andrew-tates-travel-restrictions-2024-07-16/?id=IT7BNCAP6NO25BWCF3BN32VFAU" name="apple-itunes-app"/><script>(function(){
      var current_location = window.location.href;

      if (current_location.indexOf('/info-pages/supported-browsers/') === -1) {
        var supportFetchApi = 'fetch' in window;
        var supportCSSGrid = window.CSS && CSS.supports('display', 'grid');

        if (!supportFetchApi && !supportCSSGrid) {
          window.location.href = '/info-pages/supported-browsers/';
        }
      }
    })()</script><script async="" data-config="{&quot;API_ORIGIN&quot;:&quo

## Delayed Requests

In [None]:
# Load the article URLs; the CSV must provide a 'tate_links' column
tate_links = pd.read_csv('tate_links.csv')

tate_all_news_contents = []  # parsed document for every fetched link

for link in tate_links['tate_links']:
    # Sleep for a random 1-2 seconds so requests do not arrive in a rapid, regular burst
    delay = random.uniform(1, 2)
    print(f'Waiting for {delay:.2f} seconds before the next request...')
    time.sleep(delay)

    # Fetch the article with the browser-like headers. The timeout stops one
    # unresponsive server from stalling the whole loop.
    response = requests.get(link, headers=header, timeout=10)

    # Any non-200 answer aborts the run.
    # Status code 403 means forbidden: you don't have permission to access this resource.
    # Status code 401 means the request lacks valid authentication credentials.
    if response.status_code != 200:
        raise Exception('Unable to scrape. Status code: {}'.format(response.status_code))

    soup = BeautifulSoup(response.text, 'lxml')
    tate_all_news_contents.append(soup)
    print('Succesfully scraped link: ',link)

Waiting for 1.70 seconds before the next request...
Succesfully scraped link:  https://news.abs-cbn.com/overseas/06/20/23/influencer-andrew-tate-indicted-for-human-trafficking-in-romania
Waiting for 1.33 seconds before the next request...
Succesfully scraped link:  https://news.abs-cbn.com/overseas/04/01/23/andrew-tate-from-kickboxer-to-misogynist-influencer
Waiting for 1.43 seconds before the next request...
Succesfully scraped link:  https://news.abs-cbn.com/overseas/02/22/23/romania-extends-detention-of-influencer-andrew-tate
Waiting for 1.22 seconds before the next request...
Succesfully scraped link:  https://news.abs-cbn.com/business/01/09/23/second-coming-of-conspiracy-theorists-after-twitter-amnesty
Waiting for 1.93 seconds before the next request...
Succesfully scraped link:  https://news.abs-cbn.com/overseas/12/30/22/ex-kickboxer-held-in-romania-for-rape-human-trafficking
Waiting for 1.85 seconds before the next request...
Succesfully scraped link:  https://www.nytimes.com/ar

In [None]:
# Number of pages fetched and parsed by the delayed-request loop
len(tate_all_news_contents)

17

## Using Proxies

#### A proxy acts as an intermediary for requests from clients seeking resources from the servers that provide those resources.

#### Servers may detect too many requests and may block the IP address to stop further scraping. To avoid blocking, proxies are used and scraping will continue working as the IP address is changed and won’t cause any issues. It also helps in hiding the machine’s IP address as it creates anonymity.

### Scraping free proxy website:

#### Scrape site containing proxies we can use

In [None]:
site_for_proxies = 'https://free-proxy-list.net/#'

# Fetch the listing page; the timeout avoids hanging on an unresponsive server
response = requests.get(site_for_proxies, headers=header, timeout=10)
if response.status_code != 200:
    raise Exception('Unable to scrape. Status code: {}'.format(response.status_code))

# Parse the HTML file
soup = BeautifulSoup(response.text, 'lxml')

# The first <tbody> on the page is taken to be the proxy table
proxy_content = soup.find('tbody')

# One <tr> per listed proxy
rows = proxy_content.find_all('tr')

# Columns collected from every row
address = []
google = []
https = []

for row in rows:
    # Text of each <td> cell. Iterating over the cells returned by find_all,
    # not the row's raw children, keeps whitespace and other non-cell nodes
    # from shifting the column positions.
    cells = [cell.text for cell in row.find_all('td')]

    # Skip rows that do not have all the columns read below
    if len(cells) < 7:
        continue

    # Columns 0 and 1 are joined as host:port; columns 5 and 6 are stored as
    # the google / https flags (assumes the site's current column order)
    address.append(cells[0] + ':' + cells[1])
    google.append(cells[5])
    https.append(cells[6])

# Assemble the columns into a pandas DataFrame
dict_proxies = {
    'address': address,
    'google': google,
    'https': https,
}

df = pd.DataFrame(dict_proxies)

#### Keep only the proxies that support both Google and HTTPS

In [None]:
# Keep only the rows whose 'https' and 'google' columns both read 'yes'
supports_https = df['https'] == 'yes'
supports_google = df['google'] == 'yes'
df = df[supports_https & supports_google]
df.head(50)

Unnamed: 0,address,google,https
46,103.7.135.70:8080,yes,yes
293,69.197.149.234:6343,yes,yes


In [None]:
# Optional: uncomment to drop a known-slow proxy before building the list
#df = df[df['address'] != '203.189.88.156:80']
# Plain Python list of 'host:port' strings taken from the filtered table
proxy_list = df['address'].to_list()
proxy_list

['103.7.135.70:8080', '69.197.149.234:6343']

### Copying from text:

In [None]:
# Read the whole file, then split on any whitespace so that every
# 'host:port' token becomes one list entry
with open('proxy.txt', 'r') as file:
    raw_text = file.read()
proxies = raw_text.split()
proxies

['103.7.135.70:8080',
 '23.95.216.78:34561',
 '190.242.37.62:8080',
 '160.86.242.23:8080',
 '189.240.60.168:9090',
 '164.52.206.180:80',
 '47.236.236.2:8899',
 '135.148.171.194:18080',
 '194.67.91.153:80',
 '198.199.86.11:8080',
 '202.61.204.51:80',
 '209.97.150.167:3128',
 '139.99.237.62:80',
 '66.29.154.105:3128',
 '139.59.1.14:8080',
 '161.35.70.249:3128',
 '139.162.78.109:8080',
 '167.99.236.14:80',
 '134.209.29.120:8080',
 '51.15.242.202:8888',
 '13.81.217.201:80',
 '138.91.159.185:80',
 '20.206.106.192:8123',
 '20.24.43.214:80',
 '20.210.113.32:8123',
 '47.74.152.29:8888',
 '219.65.73.81:80',
 '23.94.86.138:80',
 '132.145.134.230:80',
 '68.178.168.41:80',
 '72.10.160.92:5635',
 '192.73.244.36:80',
 '198.49.68.80:80',
 '81.200.149.178:80',
 '123.205.24.244:8193',
 '72.10.160.173:29439',
 '212.107.28.120:80',
 '72.10.160.94:18345',
 '41.207.242.62:80',
 '72.10.160.171:10095',
 '67.43.227.227:11023',
 '162.214.165.203:80',
 '177.87.144.122:8086',
 '190.158.210.102:8080',
 '162.223.9

### Validate proxy

In [None]:
def validate(prox_list, number_proxy, timeout=10):
    """Return the proxies from *prox_list* that can reach ipinfo.io.

    Each proxy is tried in order by requesting ``https://ipinfo.io/json``
    through it. A proxy counts as usable only when the request completes
    with status code 200.

    Args:
        prox_list: Iterable of proxy addresses (for example ``'host:port'``).
        number_proxy: Stop as soon as this many usable proxies are found.
        timeout: Seconds to wait for each proxy before giving up on it.

    Returns:
        List of usable proxies, in the order they were checked.
    """
    usable_proxies = []

    for proxy in prox_list:
        print('checking proxy: ', proxy)
        try:
            response = requests.get(
                'https://ipinfo.io/json',
                proxies={'http': proxy, 'https': proxy},
                # Without a timeout a dead proxy can block the check forever
                timeout=timeout,
            )
        except Exception:
            # Any failure marks the proxy as unusable. Catching Exception
            # instead of using a bare except lets Ctrl-C interrupt the scan.
            print('Failed proxy: ', proxy, '\n')
            continue

        # A reply that is not 200 also means the proxy is not usable
        if response.status_code != 200:
            print('Failed proxy: ', proxy, '\n')
            continue

        print('Successful proxy: ', proxy, '\n')
        usable_proxies.append(proxy)

        # Count usable proxies, not attempts, against the requested number
        if len(usable_proxies) == number_proxy:
            break

    return usable_proxies

In [None]:
# Validate the short, pre-filtered list. The longer 'proxies' list read from
# proxy.txt can be validated instead, but checking every entry takes much longer.
usable_proxies = validate(proxy_list, 5)
usable_proxies

checking proxy:  103.7.135.70:8080
Failed proxy:  103.7.135.70:8080 

checking proxy:  69.197.149.234:6343
Failed proxy:  69.197.149.234:6343 



[]

### Scraping with proxies

In [None]:
tate_links_using_proxies = pd.read_csv('tate_links_for_proxies.csv')

counter = 0  # index into proxy_list of the proxy used for the next request

tate_proxy_contents = []

for link in tate_links_using_proxies['tate_links']:
    try:
        # proxy_list holds the unvalidated candidates; switch to usable_proxies
        # when validate() has found working ones
        proxy = proxy_list[counter]

        # The timeout keeps a dead proxy from blocking the loop indefinitely
        res = requests.get(link, headers=header, proxies={'http': proxy, 'https': proxy}, timeout=10)
        if res.status_code == 200:
            print('Status code: {}. Scraped with proxy: {}'.format(res.status_code, proxy))
            soup = BeautifulSoup(res.text, 'lxml')
            tate_proxy_contents.append(soup)
        else:
            print("Failed to scrape with status error: {}".format(res.status_code))
    except Exception as e:
        # Best effort: report the failure and move on to the next link
        print('Failed to scrape. Reason: ', e)
    finally:
        # Rotate to the next proxy, wrapping to the first after the last;
        # stay at 0 when the list is empty
        counter = (counter + 1) % len(proxy_list) if proxy_list else 0

Failed to scrape. Reason:  HTTPSConnectionPool(host='www.reuters.com', port=443): Max retries exceeded with url: /world/europe/romanian-court-maintains-influencer-andrew-tates-travel-restrictions-2024-07-16/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1007)')))
Failed to scrape. Reason:  HTTPSConnectionPool(host='www.reuters.com', port=443): Max retries exceeded with url: /world/europe/romanian-court-lifts-influencer-andrew-tates-travel-restrictions-within-eu-2024-07-05/ (Caused by ProxyError('Unable to connect to proxy', NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7b8b3ad3e710>: Failed to establish a new connection: [Errno 111] Connection refused')))
Failed to scrape. Reason:  HTTPSConnectionPool(host='www.reuters.com', port=443): Max retries exceeded with url: /world/europe/romanian-court-says-internet-personality-tates-human-trafficking-trial-

#### If scraping through a proxy produces bad results such as 403 or 401, you can remove the proxy. However, note that you are then scraping with your own IP address instead of a proxy, and this may cause your IP to be banned temporarily or permanently.

In [None]:
tate_links_not_using_proxies = pd.read_csv('tate_links_for_proxies.csv')

# Start from an empty list so results of an earlier attempt are discarded
tate_proxy_contents = []

for link in tate_links_not_using_proxies['tate_links']:
    try:
        # Direct request from your own IP address. The timeout prevents a
        # stalled connection from blocking the loop.
        res = requests.get(link, headers=header, timeout=10)
        if res.status_code == 200:
            print('Successfully scraped: ',link)
            soup = BeautifulSoup(res.text, 'lxml')
            tate_proxy_contents.append(soup)
        else:
            print('Failed to scrape: ',res.status_code)
    except Exception as e:
        # Best effort: report the failure and continue with the next link
        print('Failed to scrape. Reason: ', e)

Successfully scraped:  https://www.reuters.com/world/europe/romanian-court-maintains-influencer-andrew-tates-travel-restrictions-2024-07-16/
Successfully scraped:  https://www.reuters.com/world/europe/romanian-court-lifts-influencer-andrew-tates-travel-restrictions-within-eu-2024-07-05/
Successfully scraped:  https://www.reuters.com/world/europe/romanian-court-says-internet-personality-tates-human-trafficking-trial-can-start-2024-04-26/
Successfully scraped:  https://www.reuters.com/world/europe/andrew-tates-seized-assets-will-remain-possession-romanian-law-enforcement-says-2024-03-22/
Successfully scraped:  https://www.reuters.com/world/europe/bucharest-appeals-court-overturns-decision-seize-assets-andrew-tate-2024-01-08/
Successfully scraped:  https://www.reuters.com/world/europe/romanian-court-rejects-andrew-tates-bid-leave-country-while-awaiting-trial-2023-12-22/
Successfully scraped:  https://www.reuters.com/world/europe/romanian-judge-loosens-restrictions-influencer-andrew-tate-2

# Parsing and Cleaning Data

### Removing HTML tags and stripping unwanted characters from DOTA 2 news links

In [None]:
# Strip the HTML tags: for every article keep only the trimmed text of each
# paragraph, preserving the list-of-lists shape (articles -> paragraphs)
all_news_dota_paragraph = [
    [paragraph.text.strip() for paragraph in paragraph_list]
    for paragraph_list in paragraph_news_list
]

dota_news = {
    # Headline text is surrounded by whitespace/newlines in the page markup,
    # so trim it as well
    'title': [title.strip() for title in title_list],
    'links': links_to_scrape['dota links'],
    'content': all_news_dota_paragraph,
}

dota_news_df = pd.DataFrame(dota_news)
dota_news_df.head()

Unnamed: 0,title,links,content
0,"\nAfter Talent trees, Innates, and Facets, Dot...",https://dotesports.com/dota-2/news/after-talen...,[Dota 2 has come a long way since its release ...
1,\nDota 2 players praise Crownfall as ‘one of t...,https://dotesports.com/dota-2/news/dota-2-play...,"[After a lengthy wait, Dota 2 players finally ..."
2,\nValve’s surprise Dota 2 fighting game is sho...,https://dotesports.com/dota-2/news/valves-surp...,"[The next chapter of Dota 2‘s 2024 event, Crow..."
3,\nDota 2 Sleet Fighter explained – How to play...,https://dotesports.com/dota-2/news/dota-2-slee...,[Deep in the hearts of Icewrack in Dota 2 lies...
4,\nThe wait for Dota 2’s next Crownfall act is ...,https://dotesports.com/dota-2/news/wait-for-do...,"[If you’re like me, you’ll have had June 39 (y..."


In [None]:
# Number of cleaned articles (one paragraph list per scraped article)
len(all_news_dota_paragraph)

43

### Removing HTML tags and stripping unwanted characters from Tate news links

In [None]:
titles = []
contents = []

# Class strings identifying the headline, the article body, and each body paragraph.
# NOTE(review): these look auto-generated; re-check them against the live
# page if find() starts returning None.
title_class = 'text__text__1FZLe text__dark-grey__3Ml43 text__medium__1kbOh text__heading_3__1kDhc heading__base__2T28j heading__heading_3__3aL54'
body_class = 'article-body__content__17Yit'
paragraph_class = 'text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__small__1kGq2 body__full_width__ekUdw body__small_body__2vQyf article-body__paragraph__2-BtD'

for html_content in tate_proxy_contents:
    # Headline element. attrs is a dict (attribute -> value); a set literal
    # would be read as a list of alternative class names.
    tate_title = html_content.find('h1', {'class': title_class})

    # Narrow down to the article body only
    useful_tate_content = html_content.find('div', {'class': body_class})

    # Every paragraph container inside the body
    tate_paragraph = useful_tate_content.find_all('div', {'class': paragraph_class})

    # Keep only the text of each paragraph (drops all HTML tags)
    cleaned_paragraph_list = [p.text for p in tate_paragraph]

    titles.append(tate_title.text)
    contents.append(cleaned_paragraph_list)

# Column data for the DataFrame.
# NOTE(review): assumes tate_proxy_contents holds exactly one parsed page per
# row of the links CSV, in the same order - confirm no link failed to scrape.
tate_dict = {
    'link': tate_links_using_proxies['tate_links'],
    'title': titles,
    'content': contents,
}

# Convert to pandas dataframe
tate_df = pd.DataFrame(tate_dict)
tate_df.head(10)

Unnamed: 0,link,title,content
0,https://www.reuters.com/world/europe/romanian-...,Romanian court reinstates influencer Andrew Ta...,"[BUCHAREST, July 16 (Reuters) - Internet perso..."
1,https://www.reuters.com/world/europe/romanian-...,Romanian court eases travel restrictions on in...,"[BUCHAREST, July 5 (Reuters) - A Romanian cour..."
2,https://www.reuters.com/world/europe/romanian-...,"Andrew Tate human trafficking trial can start,...","[BUCHAREST, April 26 (Reuters) - The trial of ..."
3,https://www.reuters.com/world/europe/andrew-ta...,Internet personality Tate's assets to remain w...,"[BUCHAREST, March 22 (Reuters) - Assets belong..."
4,https://www.reuters.com/world/europe/bucharest...,Romanian court will look again at seizure of i...,[Jan 8 (Reuters) - A Romanian court has accept...
5,https://www.reuters.com/world/europe/romanian-...,Romanian court rejects Andrew Tate's bid to le...,"[BUCHAREST, Dec 22 (Reuters) - A Bucharest cou..."
6,https://www.reuters.com/world/europe/romanian-...,Romanian judge loosens restrictions on influen...,[Nov 23 (Reuters) - Andrew Tate and his brothe...


In [None]:
# Break every article into sentences and pool them in one flat list
all_sentences = []
for content in tate_df['content']:
    # Each row is a list of paragraph strings: join them with single spaces,
    # then let NLTK split the combined text into sentences
    all_sentences.extend(sent_tokenize(' '.join(content)))

In [None]:
# Write one sentence per line. The encoding is set explicitly because the
# scraped text contains non-ASCII characters and the platform default
# encoding is not guaranteed to be able to represent them.
with open('output_sentences.txt', 'w', encoding='utf-8') as file:
    file.writelines(sentence + '\n' for sentence in all_sentences)

# End of Coding Session
### Congratulations!

In [None]:
# Build an audio widget for 'End.wav'; as the last expression of the cell it is rendered by the notebook
IPython.display.Audio('End.wav')

# Activity 5:

## In this activity feel free to view the link for detailed instructions:
##### https://docs.google.com/document/d/1W28MGJ2tH0K0lAjSGoLpKA1ZWJYL3nc5Z1UwDJ_HaZ8/edit?fbclid=IwZXh0bgNhZW0CMTAAAR2s4LA-RwCgOY0iCtC6E714n7QMIyt-6HdKAQqFdgvTLO9gRy3A19RQn2k_aem_sKNFJg1xc1YrH0C6KW5Jhw#heading=h.cow3s5m3e6am



In [None]:
# You can optionally follow this pseudocode:

# Create a list of links to scrape through web crawling or searching on Google.

# Loop through each link

    # Create a request using any or all of the detection-avoidance methods shown above (headers, delays, proxies)

    # Store data in a list

# Create a Dataframe and store all data