In [95]:
import re
import pandas as pd

In [96]:
df = pd.read_csv('CNN_DailyMail.csv')

In [97]:
df

Unnamed: 0,Title,Text
0,John and . Audrey Cook were discovered alongsi...,"By . Anthony Bond . PUBLISHED: . 07:03 EST, 2 ..."
1,NEW: Libya can serve as example of cooperation...,UNITED NATIONS (CNN) -- A rare meeting of U.N....
2,Very Reverend Robert Waddington sexually abuse...,Cover-up: Former Archbishop Lord Hope allowed ...
3,Monday night's episode showed Buddy Valastro t...,"By . Kristie Lau . PUBLISHED: . 10:48 EST, 14 ..."
4,People asked to turn out lights for hour betwe...,'The lamps are going out all over Europe. We s...
...,...,...
6995,Newcastle will demand a £12million fee for def...,France's Monaco will rival Arsenal in the race...
6996,Lisa Bloom: Americans are obsessed with the pe...,(CNN) -- Our country is obsessed with the peri...
6997,"Clinton de Menezes, 43, was attacked at friend...",By . Barbara Jones . Nicola de Menezes has des...
6998,Barcelona move up to second in Spain with 5-0 ...,(CNN) -- Barcelona and Real Madrid have roared...


In [98]:
def extract_text(text, pattern):
    if pd.isnull(text):
        return None
    matches = pattern.findall(text)
    matches = [re.sub(r'^[\'"(<\[\{]+|[\'">.,);\]\}]+$', '', m) for m in matches]
    return ', '.join(matches) if matches else None

In [99]:
# 1. Extract all email addresses present in the text.
email_pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
df['Emails'] = df['Text'].apply(lambda x: extract_text(x, email_pattern))

df[df["Emails"].notna()][['Emails']]

Unnamed: 0,Emails
699,sam.webb@mailonline.co.uk
918,news@dailymail.co.uk
1938,tomorrow@harmless.org.uk
2990,mailonlinepictures@dailymail.co.uk
4215,concierge@astonmartindriving.com
5001,factcheck@cnn.com
5376,tacklekeown@mailonline.co.uk
5462,asylumhotel@dailymail.co.uk
5958,"feedback@cheapoair.com, celliott@ngs.org"
5974,doha@anantara.com


In [100]:
# 2. Find and list all URLs embedded in the dataset.
url_pattern = re.compile(r'https?://[^\s\'",<>]+')

df['URLs'] = df['Text'].apply(lambda x: extract_text(x, url_pattern))

df[df["URLs"].notna()][['URLs']]

Unnamed: 0,URLs
371,http://www.gofundme.com/lf0ga0
420,http://www.gnomeexperiment.com/
434,http://www.stbrides.com/inspire/
990,http://www.dailymail.co.uk/news/article-206921...
1344,http://www.shortlist.com/entertainment/sport/r...
1634,http://video.foxnews.com
2052,http://triggertrap.com/lapselondon/
2068,http://www.findmypast.com.au/new-south-wales-w...
2111,http://bit.ly/1sFCW7x
2380,http://www.nordictrack.com/


In [101]:
# 3. Identify all phone numbers using regex patterns typical to various country formats.

phone_pattern = re.compile(
    r'(?:(?:\+|00)\d{1,3}[\s\-\.]?)?'
    r'(?:\(?\d{2,4}\)?[\s\-\.])'
    r'\d{3,4}[\s\-\.]?\d{3,4}'
)


df["Phone_numbers"] = df["Text"].apply(lambda x: extract_text(x, phone_pattern))

df[df["Phone_numbers"].notna()][['Phone_numbers']]


Unnamed: 0,Phone_numbers
92,1300 659 467
144,"1483 631239, 0800 555 111"
717,"+91 471 227 3093, +1 504 200 6523, +44 1628 82..."
883,"+27 21 421 6666, +1 305 947 3525, +44 1273 606..."
894,1800 333 000
918,0203 615 1154
944,704) 920-5580
972,3001 232 323
1009,"0161 856 5097, 0800 555 111"
1303,800-799-7233


In [102]:
# 4. Clean the text by removing special characters, punctuation, and extra spaces.
df["CleanText"] = df["Text"].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)).strip())
df["CleanText"] = df["CleanText"].apply(lambda x: re.sub(r'\s+', ' ', x))

df[["Text", "CleanText"]]

Unnamed: 0,Text,CleanText
0,"By . Anthony Bond . PUBLISHED: . 07:03 EST, 2 ...",By Anthony Bond PUBLISHED 0703 EST 2 March 201...
1,UNITED NATIONS (CNN) -- A rare meeting of U.N....,UNITED NATIONS CNN A rare meeting of UN Securi...
2,Cover-up: Former Archbishop Lord Hope allowed ...,Coverup Former Archbishop Lord Hope allowed a ...
3,"By . Kristie Lau . PUBLISHED: . 10:48 EST, 14 ...",By Kristie Lau PUBLISHED 1048 EST 14 June 2012...
4,'The lamps are going out all over Europe. We s...,The lamps are going out all over Europe We sha...
...,...,...
6995,France's Monaco will rival Arsenal in the race...,Frances Monaco will rival Arsenal in the race ...
6996,(CNN) -- Our country is obsessed with the peri...,CNN Our country is obsessed with the perils of...
6997,By . Barbara Jones . Nicola de Menezes has des...,By Barbara Jones Nicola de Menezes has describ...
6998,(CNN) -- Barcelona and Real Madrid have roared...,CNN Barcelona and Real Madrid have roared back...


In [103]:
# 5. Extract dates and times mentioned in the text (e.g., “January 5, 2023” or “5:00 PM”).
datetime_pattern = re.compile(
    r'\b(?:\d{1,2}:\d{2}\s?(?:AM|PM)?)\b|'
    r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b|'
    r'\b(?:\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*[,]?\s?\d{2,4})\b|'
    r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},?\s\d{4}\b',
    flags=re.IGNORECASE
)

df["Date_Time"] = df["Text"].apply(lambda x: extract_text(x, datetime_pattern))
df[df["Date_Time"].notna()][['Date_Time']]

Unnamed: 0,Date_Time
0,"07:03 , 2 March 2013, 08:07 , 2 March 2013"
3,"10:48 , 14 June 2012, 11:02 , 14 June 2012"
4,"August 3, 1914"
8,"09:26 , 14 June 2012, 13:19 , 14 June 2012"
13,01:30
...,...
6978,"12:40 , 30 October 2013, 04:27 , 31 October 2013"
6979,"February 10, 1806"
6984,"08:55 , 23 June 2013, 09:21 , 23 June 2013"
6988,"22:23 , 31 October 2013, 04:28 , 1 November 2013"


In [104]:
# 6. Count the occurrences of specific keywords or patterns, such as “AI”, “machine learning”, or hashtags.
AI_pattern = re.compile(r'\bAI\b', flags=re.IGNORECASE)
df["AI_Count"] = df["Text"].apply(lambda x: len(AI_pattern.findall(str(x))))

ML_pattern = re.compile(r'\bmachine learning\b', flags=re.IGNORECASE)
df["ML_Count"] = df["Text"].apply(lambda x: len(ML_pattern.findall(str(x))))

hashtag_pattern = re.compile(r'#\w+')
df["Hashtags"] = df["Text"].apply(lambda x: extract_text(x, hashtag_pattern))

df[df["AI_Count"] > 0][['AI_Count', 'Text']]

Unnamed: 0,AI_Count,Text
86,1,To call this XCOM would be a major disservice ...
720,1,The Chinese authorities have banned the phrase...
1315,8,Stephen Hawking recently said that the develop...
2277,3,A Chinese man has offered to 'rent' his girlfr...
5753,14,Beijing (CNN) -- Must art always serve politic...
6070,4,"By . Rachel Reilly . PUBLISHED: . 07:58 EST, 5..."
6092,1,"It's starred on Jeopardy, helped with cancer r..."


In [105]:
df[df["ML_Count"] > 0][['ML_Count', 'Text']]

Unnamed: 0,ML_Count,Text
1315,1,Stephen Hawking recently said that the develop...
2963,1,By . Sarah Griffiths . Drones that can choose ...


In [106]:
df[df["Hashtags"].notna()][['Hashtags', 'Text']]

Unnamed: 0,Hashtags,Text
22,#HotJesus,"Clearly Jesus was sexy. After all, He is the S..."
28,"#Playstation2013, #playstation2013, #playstati...","By . Damien Gayle . PUBLISHED: . 12:37 EST, 1 ..."
62,"#ShadesofRevlon, #ShadesOfRevlon",The accusations against the Revlon CEO are ugl...
347,#block,Ready for battle: Model Sara McKenna is fighti...
407,"#1, #1","By . Meghan Keneally . PUBLISHED: . 15:16 EST,..."
...,...,...
6838,#rapetruck,Police are investigating a 'kidnap' sticker pl...
6885,"#unfriend, #NUD, #unfriend",(CNN) -- Every holiday needs a song. And Natio...
6904,"#2018WorldCup, #Russia2018",Click here to see every World Cup's logo! The ...
6939,#thismorning,A BBC reporter slipped up today by referring t...


In [108]:
# 7. Split the text into sentences or words using regex delimiters.
df["sentences"] = df["Text"].apply(lambda x: re.split(r'(?<=[.!?])\s+', str(x)) if pd.notnull(x) else [])

df[["Text", "sentences"]]

Unnamed: 0,Text,sentences
0,"By . Anthony Bond . PUBLISHED: . 07:03 EST, 2 ...","[By ., Anthony Bond ., PUBLISHED: ., 07:03 EST..."
1,UNITED NATIONS (CNN) -- A rare meeting of U.N....,[UNITED NATIONS (CNN) -- A rare meeting of U.N...
2,Cover-up: Former Archbishop Lord Hope allowed ...,[Cover-up: Former Archbishop Lord Hope allowed...
3,"By . Kristie Lau . PUBLISHED: . 10:48 EST, 14 ...","[By ., Kristie Lau ., PUBLISHED: ., 10:48 EST,..."
4,'The lamps are going out all over Europe. We s...,"['The lamps are going out all over Europe., We..."
...,...,...
6995,France's Monaco will rival Arsenal in the race...,[France's Monaco will rival Arsenal in the rac...
6996,(CNN) -- Our country is obsessed with the peri...,[(CNN) -- Our country is obsessed with the per...
6997,By . Barbara Jones . Nicola de Menezes has des...,"[By ., Barbara Jones ., Nicola de Menezes has ..."
6998,(CNN) -- Barcelona and Real Madrid have roared...,[(CNN) -- Barcelona and Real Madrid have roare...


In [109]:
for i, sentence in enumerate(df.loc[0, 'sentences']):
    print(f"Sentence {i+1}: {sentence}")


Sentence 1: By .
Sentence 2: Anthony Bond .
Sentence 3: PUBLISHED: .
Sentence 4: 07:03 EST, 2 March 2013 .
Sentence 5: | .
Sentence 6: UPDATED: .
Sentence 7: 08:07 EST, 2 March 2013 .
Sentence 8: Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today.
Sentence 9: The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall.
Sentence 10: The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide poisoning from a cooker.
Sentence 11: Tragic: The inquests have opened into the deaths of three members of the same family who were found in their static caravan last weekend.
Sentence 12: John and Audrey Cook are pictured .
Sentence 13: Awful: The family died following carbon m