In [2]:
import pandas as pd
import numpy as np
import re
import string
import emoji
import nltk

In [3]:
def pre_processing(text):
    if isinstance(text, str):
        
        # Removing URLs
        def remove_url(text):
            pattern = re.compile(r'https?://\S+|www\.\S+')
            return pattern.sub(r'', text)

        text = remove_url(text)

        # Removing HTML tags
        def remove_html_tags(text):
            pattern = re.compile('<.*?>')
            return pattern.sub(r'', text)

        text = remove_html_tags(text)
        
        # Removing Emojis
        def remove_emojis(text):
            emoji_unicode_dict = emoji.get_emoji_unicode_dict(lang='en')
            emoji_pattern = re.compile("|".join(emoji_unicode_dict.keys()))
            return emoji_pattern.sub(r'', text)
    
        text = remove_emojis(text)

    return text

In [4]:
def create_passages(text, max_words_per_passage=100):
    if isinstance(text, str):
        sentences = nltk.sent_tokenize(text)
        passages = []
        passage = ""
        word_count = 0

        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            word_count += len(words)

            if word_count <= max_words_per_passage:
                passage += " " + sentence
            else:
                passages.append(passage.strip())
                passage = sentence
                word_count = len(words)

        if passage:
            passages.append(passage.strip())

        return passages

### Emails Data

In [17]:
emails_data = pd.read_csv('Email/emails_raw.csv')

#filtering 2022 emails for 10 months
emails_2022 = emails_data[emails_data['Year'] == 2022]
emails_2022 = emails_2022[emails_2022['Month'].isin(range(1,11))]
emails_2022.shape

(70765, 13)

In [19]:
emails_df = emails_2022[['Name', 'Subject', 'Date', 'Body']]
emails_df.head()

Unnamed: 0,Name,Subject,Date,Body
44038,Team Ronchetti,CAMPAIGN COUNTDOWN: EIGHT DAYS,2022-10-31T18:03:34Z,"This election is just 8 days away, and WE NEED..."
44042,John Fetterman,I’m not taking a single day off,2022-10-30T12:34:15Z,"Look, my team might not approve, but I’ll just..."
44127,Sara Jacobs,I’m asking one last time,2022-10-31T23:31:42Z,I know you’re probably getting a lot of emails...
44130,Adam Schiff,hey,2022-10-31T23:58:02Z,[ https://act.adamschiff.com/go/62?t=1001&akid...
44134,Brad Pfaff,This is the most important deadline of the race,2022-10-31T22:16:31Z,"Hi there, I just got word from my Finance Dire..."


In [23]:
# formatting date column
emails_df['Date'] = emails_df['Date'].apply(lambda x: pd.to_datetime(x).date())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails_df['Date'] = emails_df['Date'].apply(lambda x: pd.to_datetime(x).date())


In [24]:
# before pre-processing

emails_df['Body'][34875:34900]

  emails_df['Body'][34875:34900]


79523    # [#] Team, It is clear Zoe knows what it mean...
79524    Hi team. It’s nice to meet you! My name is Emm...
79525    Doctor Oz For Senate What can you do?         ...
79526    https://martinheinrich.com/ [https://martinhei...
79527    Friends, Greg’s far-right opponent Rep. Chabot...
79528    Liz Cheney for Wyoming                        ...
79529    Friend, Like you, I am horrified by the Suprem...
79530    Doctor Oz For Senate                          ...
79531    Hello, I'm endorsing Senator Maggie Hassan's r...
79532    I want you to compare our character, vision an...
79533    Peter, The last few days have been some of the...
79534    Statements by Donald J. Trump, 45th President ...
79535    Derek, The Supreme Court’s decision to overtur...
79536    |                                             ...
79537    That means we still have time to undo the dama...
79538    Schmitt for Congress We can’t give up.        ...
79539    CRUCIAL FEC DEADLINE: LESS THAN 24 HOURS LEFT .

In [25]:
# pre-processing on "Body" column

emails_df['Body'] = emails_df['Body'].apply(pre_processing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails_df['Body'] = emails_df['Body'].apply(pre_processing)


In [26]:
# after pre-processing

emails_df['Body'][34875:34900]

  emails_df['Body'][34875:34900]


79523    # [#] Team, It is clear Zoe knows what it mean...
79524    Hi team. It’s nice to meet you! My name is Emm...
79525    Doctor Oz For Senate What can you do?         ...
79526     [ Team, it’s official:Joe O’Dea is my Republi...
79527    Friends, Greg’s far-right opponent Rep. Chabot...
79528    Liz Cheney for Wyoming                        ...
79529    Friend, Like you, I am horrified by the Suprem...
79530    Doctor Oz For Senate                          ...
79531    Hello, I'm endorsing Senator Maggie Hassan's r...
79532    I want you to compare our character, vision an...
79533    Peter, The last few days have been some of the...
79534    Statements by Donald J. Trump, 45th President ...
79535    Derek, The Supreme Court’s decision to overtur...
79536    |                                             ...
79537    That means we still have time to undo the dama...
79538    Schmitt for Congress We can’t give up.        ...
79539    CRUCIAL FEC DEADLINE: LESS THAN 24 HOURS LEFT .

In [9]:
# j = 0
# for i in emails_df['Body']:
#     if j <=10:
#         print(i)
#         print("**********************************")
#         print()
#     j +=1

In [10]:
emails_df['Passages'] = emails_df['Body'].apply(create_passages)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails_df['Passages'] = emails_df['Body'].apply(create_passages)


In [11]:
emails_df

Unnamed: 0,Name,Subject,Body,Passages
34715,DUE @ 11:59 PM | Upset the Setup,re: President Joe Biden,☆ OFFICIAL UTS APPROVAL POLL | REGISTERED TO P...,[☆ OFFICIAL UTS APPROVAL POLL | REGISTERED TO ...
34863,Coach Tuberville,🎉 Happy New Year 🎉,"96 Peter, The countdown is on – 2022 is quickl...","[96 Peter, The countdown is on – 2022 is quick..."
34872,Derek Kilmer for Congress,One last update,"Peter, Before tonight's festivities begin, we ...","[Peter, Before tonight's festivities begin, we..."
34873,Sarah Longwell,A great year for democracy,We thought you might want to see this special ...,[We thought you might want to see this special...
34875,Donald J. Trump,2022 is almost over,--- | | | | | | | Untitled Document![Trump 202...,[--- | | | | | | | Untitled Document! [Trump 2...
...,...,...,...,...
186607,Justices Gorsuch and Kavanaugh LIED to the Ame...,5X match to hold Republicans accountable,"Team, Representatives Ted Lieu and Alexandria ...","[Team, Representatives Ted Lieu and Alexandria..."
186608,Dan Goldman,A new playbook in Congress,"Today marks 50 days until Election Day, Peter....","[Today marks 50 days until Election Day, Peter..."
186609,Team Beshear,Andy's opponents are threatening to undo our p...,FIRST: It was reported that former disgraced g...,[FIRST: It was reported that former disgraced ...
186611,Tom Malinowski,thank you for keeping up this fight,͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ...,"[, ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏..."


In [12]:
emails_df.to_csv('Email/emails_processed.csv')

### Podcast Data

In [27]:
podcast_data = pd.read_csv('Podcast/podcast_data_raw.csv')
podcast_data

Unnamed: 0,Podcast,Date,Part,Transcript
0,Conservative_Review_with_Daniel_Horowitz,2022-05-12,1,And welcome back fellow American patriots and...
1,Conservative_Review_with_Daniel_Horowitz,2022-11-04,1,And welcome back fellow American patriots and...
2,Conservative_Review_with_Daniel_Horowitz,2022-02-10,1,"Set five years after Jedi Fallen Order, follo..."
3,Conservative_Review_with_Daniel_Horowitz,2022-05-16,1,Politics Without the Soap Opera with unfilter...
4,Conservative_Review_with_Daniel_Horowitz,2022-09-13,1,And welcome back fellow American patriots and...
...,...,...,...,...
6944,Louder_with_Crowder,2022-11-10,1,"Alright, Mr. Jeremy Quartering. Does your aud..."
6945,Louder_with_Crowder,2022-09-06,14,"You're running for press, you socialist cheat..."
6946,Louder_with_Crowder,2022-09-06,1,"Yeah, uh, don't get it twisted This late nigh..."
6947,Louder_with_Crowder,2022-09-07,2,It's Parody Week! You can find today's track ...


In [33]:
podcast_data['Month'] = podcast_data['Date'].apply(lambda x: pd.to_datetime(x).month)

In [37]:
podcast_data = podcast_data[podcast_data['Month'].isin(range(1,11))]

In [44]:
podcast_data = podcast_data.drop(columns='Month')
podcast_data

Unnamed: 0,Podcast,Date,Part,Transcript
0,Conservative_Review_with_Daniel_Horowitz,2022-05-12,1,And welcome back fellow American patriots and...
2,Conservative_Review_with_Daniel_Horowitz,2022-02-10,1,"Set five years after Jedi Fallen Order, follo..."
3,Conservative_Review_with_Daniel_Horowitz,2022-05-16,1,Politics Without the Soap Opera with unfilter...
4,Conservative_Review_with_Daniel_Horowitz,2022-09-13,1,And welcome back fellow American patriots and...
7,Conservative_Review_with_Daniel_Horowitz,2022-03-14,1,And welcome back fellow lab rats to the one a...
...,...,...,...,...
6943,Louder_with_Crowder,2022-03-22,1,"Ahoy audio listeners, it's Dave and you're li..."
6945,Louder_with_Crowder,2022-09-06,14,"You're running for press, you socialist cheat..."
6946,Louder_with_Crowder,2022-09-06,1,"Yeah, uh, don't get it twisted This late nigh..."
6947,Louder_with_Crowder,2022-09-07,2,It's Parody Week! You can find today's track ...


In [45]:
podcast_data['Transcript'] = podcast_data['Transcript'].apply(pre_processing)
podcast_data

Unnamed: 0,Podcast,Date,Part,Transcript
0,Conservative_Review_with_Daniel_Horowitz,2022-05-12,1,And welcome back fellow American patriots and...
2,Conservative_Review_with_Daniel_Horowitz,2022-02-10,1,"Set five years after Jedi Fallen Order, follo..."
3,Conservative_Review_with_Daniel_Horowitz,2022-05-16,1,Politics Without the Soap Opera with unfilter...
4,Conservative_Review_with_Daniel_Horowitz,2022-09-13,1,And welcome back fellow American patriots and...
7,Conservative_Review_with_Daniel_Horowitz,2022-03-14,1,And welcome back fellow lab rats to the one a...
...,...,...,...,...
6943,Louder_with_Crowder,2022-03-22,1,"Ahoy audio listeners, it's Dave and you're li..."
6945,Louder_with_Crowder,2022-09-06,14,"You're running for press, you socialist cheat..."
6946,Louder_with_Crowder,2022-09-06,1,"Yeah, uh, don't get it twisted This late nigh..."
6947,Louder_with_Crowder,2022-09-07,2,It's Parody Week! You can find today's track ...


In [15]:
# j = 0
# for i in podcast_data['Transcript']:
#     if j <=5:
#         print(i)
#         print("**********************************")
#         print()
#     j +=1

In [16]:
podcast_data['Passages'] = podcast_data['Transcript'].apply(create_passages)
podcast_data['Passages']

0       [And welcome back fellow American patriots and...
1       [And welcome back fellow American patriots and...
2       [Set five years after Jedi Fallen Order, follo...
3       [Politics Without the Soap Opera with unfilter...
4       [And welcome back fellow American patriots and...
                              ...                        
6944    [Alright, Mr. Jeremy Quartering. Does your aud...
6945    [, You're running for press, you socialist che...
6946    [, Yeah, uh, don't get it twisted This late ni...
6947    [It's Parody Week! You can find today's track ...
6948    [Alright, audio listener, I will keep this bri...
Name: Passages, Length: 6949, dtype: object

In [17]:
podcast_data.to_csv('Podcast/podcast_data_processed.csv')

In [51]:
podcast_data['Passages'][4]

["And welcome back fellow American patriots and minimans standing at the ready to fight anew for our life, our liberty, and our property here at Sierra podcast on this fine Tuesday, September 13th. Daniel Horowitz back here in the house today. And folks, I was thinking has there ever been a revolution in global history where there was no opposition? It's like you're facing a revolution but you don't even know it. Where are the troops? Where's the cavalry? Where's everyone?",
 'Name me a single issue that is confronting us systemically, the transhumanism, the medical fascism, the war on food fuel, forcing us to eat bugs and drink sewage water, the border, crime, anything where there is a meaningful, coherent opposition that is likely to get into power and implement a plan or even is articulating a plan. I think back to 2010, the issues of the time were Obamacare and spending, and at least in word, there was a united front to fight them.',
 "Now they lied about it, but you can at least l

### TV Data

In [19]:
tv_data = pd.read_csv('TV/Tv_data_raw_2022.csv')
tv_data

Unnamed: 0,addeddate,contributor,description,program,tuner,text
0,2022-02-02 08:00:00,MSNBCW,"""Chris Hayes reports on some of the biggest ne...",All In With Chris Hayes,Virtual Ch. 787,"tonight on all in. >> they know it\'s true, th..."
1,2022-01-02 00:30:00,RT,"""Markets! Finance! Scandal! Keiser Report i...",Keiser Report,Channel IPTV,"""ah ah whoa, happy new year special kaiser rep..."
2,2022-01-06 13:30:00,RT,Former First Minister of Scotland Alex Salmond...,The Alex Salmond Show,Channel IPTV,"""ah, for june . welcome to the alexander. so i..."
3,2022-01-14 05:00:00,RT,"RT news, interviews and shows available as pod...",News,Channel IPTV,"""ah ah, his royal highness, no more. queen eli..."
4,2022-01-16 17:00:00,RT,"RT news, interviews and shows available as pod...",News,Channel IPTV,"""ah, with tennis superstar new york of h is de..."
...,...,...,...,...,...,...
43279,2022-05-13 03:00:00,FOXNEWSW,Greg Gutfeld examines the news of the day thro...,Gutfeld!,Virtual Ch. 760,who bought them? >> they sent them an email. >...
43280,2022-10-27 00:30:00,RT,"""RT's documentaries give a varied and unique v...",Documentary,Channel IPTV,"""have any quality for russians, all we gotta d..."
43281,2022-10-26 17:30:00,RT,"""RT's documentaries give a varied and unique v...",Documentary,Channel IPTV,"""ah ah a ah with. ringback ringback ah, becaus..."
43282,2022-10-19 05:00:00,FOXNEWSW,Powerful analysis and spirited debates with gu...,Tucker Carlson Tonight,Virtual Ch. 760,""">> they willw em. they fought for us .sn no't..."


In [20]:
tv_data['text'] = tv_data['text'].apply(pre_processing)
tv_data

Unnamed: 0,addeddate,contributor,description,program,tuner,text
0,2022-02-02 08:00:00,MSNBCW,"""Chris Hayes reports on some of the biggest ne...",All In With Chris Hayes,Virtual Ch. 787,"tonight on all in. >> they know it\'s true, th..."
1,2022-01-02 00:30:00,RT,"""Markets! Finance! Scandal! Keiser Report i...",Keiser Report,Channel IPTV,"""ah ah whoa, happy new year special kaiser rep..."
2,2022-01-06 13:30:00,RT,Former First Minister of Scotland Alex Salmond...,The Alex Salmond Show,Channel IPTV,"""ah, for june . welcome to the alexander. so i..."
3,2022-01-14 05:00:00,RT,"RT news, interviews and shows available as pod...",News,Channel IPTV,"""ah ah, his royal highness, no more. queen eli..."
4,2022-01-16 17:00:00,RT,"RT news, interviews and shows available as pod...",News,Channel IPTV,"""ah, with tennis superstar new york of h is de..."
...,...,...,...,...,...,...
43279,2022-05-13 03:00:00,FOXNEWSW,Greg Gutfeld examines the news of the day thro...,Gutfeld!,Virtual Ch. 760,who bought them? >> they sent them an email. >...
43280,2022-10-27 00:30:00,RT,"""RT's documentaries give a varied and unique v...",Documentary,Channel IPTV,"""have any quality for russians, all we gotta d..."
43281,2022-10-26 17:30:00,RT,"""RT's documentaries give a varied and unique v...",Documentary,Channel IPTV,"""ah ah a ah with. ringback ringback ah, becaus..."
43282,2022-10-19 05:00:00,FOXNEWSW,Powerful analysis and spirited debates with gu...,Tucker Carlson Tonight,Virtual Ch. 760,""">> they willw em. they fought for us .sn no't..."


In [61]:
list(tv_data['program'].value_counts().index)

['Documentary',
 'News',
 'BBC News',
 'CNN Newsroom Live',
 'CrossTalk',
 'New Day With John Berman and Brianna Keilar',
 'The Papers',
 'Don Lemon Tonight',
 'Newsday',
 'Worlds Apart',
 'Hannity',
 'HARDtalk',
 'Anderson Cooper 360',
 'FOX\\n&\\nFriends First',
 'Tucker Carlson Tonight',
 'The Ingraham Angle',
 'CNN Newsroom With Poppy Harlow and Jim Sciutto',
 'CNN Newsroom With Alisyn Camerota and Victor Blackwell',
 '"The Last Word With Lawrence O\'Donnell"',
 'The Lead With Jake Tapper',
 'Morning Joe',
 'All In With Chris Hayes',
 'Dateline',
 'The 11th Hour With Stephanie Ruhle',
 'Sportsday',
 'Breakfast',
 'The Rachel Maddow Show',
 'The Travel Show',
 'Fox News Live',
 'The Situation Room With Wolf Blitzer',
 'FOX and Friends',
 'CNN Tonight',
 'CNN Newsroom With Fredricka Whitfield',
 'The Five',
 'Gutfeld!',
 'Going Underground',
 'Click',
 'MSNBC Reports',
 'CNN Newsroom With Pamela Brown',
 'The Beat With Ari Melber',
 'Special Report With Bret Baier',
 'Deadline: White

In [62]:
j = 0
for i in tv_data['text']:
    if j <=5:
        print(i)
        print("**********************************")
        print()
    j +=1

tonight on all in. >> they know it\'s true, they know it\'s there, they know who won the election, but they refused to say your right. >> donald trump himself was in on the scheme to seize voting machines. tonight the most incriminating evidence of the trump coup plot yet. the back man he used to help him do it and with the investigation is discovering from torn trump documents. then, the republicans suing to disqualify madison cawthorn from congress for engaging in\n            \n            insurrection joins me live. plus, jelani cobb on the right wing crackdown on teaching black history as black history month begins and one year after a certain senator\'s great escape to cancun, or works says that texas has failed to prepare for another freeze. all in starts right now.good evening from new york, i\'m chris hayes. we have new revelations tonight about the ex presidents coup plotting that contain some of the most incriminating evidence yet, i\'ll explain. so in the weeks following th

In [22]:
tv_data['Passages'] = tv_data['text'].apply(create_passages)
tv_data['Passages']

0        [tonight on all in. >> they know it\'s true, t...
1        ["ah ah whoa, happy new year special kaiser re...
2        ["ah, for june . welcome to the alexander. so ...
3        ["ah ah, his royal highness, no more. queen el...
4        ["ah, with tennis superstar new york of h is d...
                               ...                        
43279    [who bought them? >> they sent them an email. ...
43280    ["have any quality for russians, all we gotta ...
43281    ["ah ah a ah with. ringback ringback ah, becau...
43282    [">> they willw em. they fought for us .sn no'...
43283    ["ah a. ringback ringback because she spent wi...
Name: Passages, Length: 43284, dtype: object

In [25]:
tv_data.to_csv('TV/Tv_data_processed.csv')

### Embeddings

We specifically embed passages using a version of MPNet
that we fine-tune on the semantic text similarity (STS)
task [37], [56] using unsupervised contrastive learning for
sentence embeddings as specified in Gao et al. [57] on a
random assortment of passages from January 2022 from
our websites. We perform this fine-tuning with the default
hyperparameters (learning rate 3 × 10−5, batch size=128,
and 1M examples) specified in Gao et al. and by freezing
all but the last two layers of a public version of MPNet.2
See Appendix A for details. This ensures that our model is
attuned to the language present on our set of websites. As
seen in Table 1, despite not being trained on the SemEval
STS Benchmark [51], a benchmark for measuring the quality
of text embeddings, our model outperforms the fine-tuned
publicly released version of MPNet. After fine-tuning our
model, from the 2.1M articles, we embed 27,850,016 passages
(11.00 hours on an Nvidia RTX A6000).

In [28]:
# pip install sentence-transformers transformers

In [27]:
from sentence_transformers import SentenceTransformer, util
import torch

In [29]:
# Define the model

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [52]:
def generate_embeddings(text):
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding

In [54]:
# podcast_data['Embeddings'] = podcast_data['Passages'].apply(generate_embeddings)