In [2]:
import pandas as pd
import numpy as np
import re
import string
import emoji
import nltk

In [3]:
def pre_processing(text):
    if isinstance(text, str):
        
        # Removing URLs
        def remove_url(text):
            pattern = re.compile(r'https?://\S+|www\.\S+')
            return pattern.sub(r'', text)

        text = remove_url(text)

        # Removing HTML tags
        def remove_html_tags(text):
            pattern = re.compile('<.*?>')
            return pattern.sub(r'', text)

        text = remove_html_tags(text)
        
        # Removing Emojis
        def remove_emojis(text):
            emoji_unicode_dict = emoji.get_emoji_unicode_dict(lang='en')
            emoji_pattern = re.compile("|".join(emoji_unicode_dict.keys()))
            return emoji_pattern.sub(r'', text)
    
        text = remove_emojis(text)

    return text

### Emails Data

In [17]:
emails_data = pd.read_csv('Email/emails_data_raw.csv')

#filtering 2022 emails for 10 months
emails_2022 = emails_data[emails_data['Year'] == 2022]
emails_2022 = emails_2022[emails_2022['Month'].isin(range(1,11))]
emails_2022.shape

(70765, 13)

In [19]:
emails_df = emails_2022[['Name', 'Subject', 'Date', 'Body']]
emails_df.head()

Unnamed: 0,Name,Subject,Date,Body
44038,Team Ronchetti,CAMPAIGN COUNTDOWN: EIGHT DAYS,2022-10-31T18:03:34Z,"This election is just 8 days away, and WE NEED..."
44042,John Fetterman,I’m not taking a single day off,2022-10-30T12:34:15Z,"Look, my team might not approve, but I’ll just..."
44127,Sara Jacobs,I’m asking one last time,2022-10-31T23:31:42Z,I know you’re probably getting a lot of emails...
44130,Adam Schiff,hey,2022-10-31T23:58:02Z,[ https://act.adamschiff.com/go/62?t=1001&akid...
44134,Brad Pfaff,This is the most important deadline of the race,2022-10-31T22:16:31Z,"Hi there, I just got word from my Finance Dire..."


In [23]:
# formatting date column
emails_df['Date'] = emails_df['Date'].apply(lambda x: pd.to_datetime(x).date())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails_df['Date'] = emails_df['Date'].apply(lambda x: pd.to_datetime(x).date())


In [25]:
# pre-processing on "Body" column

emails_df['Body'] = emails_df['Body'].apply(pre_processing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emails_df['Body'] = emails_df['Body'].apply(pre_processing)


In [48]:
emails_df.head()

Unnamed: 0,Name,Subject,Date,Body
44038,Team Ronchetti,CAMPAIGN COUNTDOWN: EIGHT DAYS,2022-10-31,"This election is just 8 days away, and WE NEED..."
44042,John Fetterman,I’m not taking a single day off,2022-10-30,"Look, my team might not approve, but I’ll just..."
44127,Sara Jacobs,I’m asking one last time,2022-10-31,I know you’re probably getting a lot of emails...
44130,Adam Schiff,hey,2022-10-31,[ ]AdamSchiff for Congress Hey Derek — I hope...
44134,Brad Pfaff,This is the most important deadline of the race,2022-10-31,"Hi there, I just got word from my Finance Dire..."


In [9]:
# j = 0
# for i in emails_df['Body']:
#     if j <=10:
#         print(i)
#         print("**********************************")
#         print()
#     j +=1

In [50]:
emails_df.to_csv('Email/emails_data_processed.csv')

### Podcast Data

In [27]:
podcast_data = pd.read_csv('Podcast/podcast_data_raw.csv')
podcast_data

Unnamed: 0,Podcast,Date,Part,Transcript
0,Conservative_Review_with_Daniel_Horowitz,2022-05-12,1,And welcome back fellow American patriots and...
1,Conservative_Review_with_Daniel_Horowitz,2022-11-04,1,And welcome back fellow American patriots and...
2,Conservative_Review_with_Daniel_Horowitz,2022-02-10,1,"Set five years after Jedi Fallen Order, follo..."
3,Conservative_Review_with_Daniel_Horowitz,2022-05-16,1,Politics Without the Soap Opera with unfilter...
4,Conservative_Review_with_Daniel_Horowitz,2022-09-13,1,And welcome back fellow American patriots and...
...,...,...,...,...
6944,Louder_with_Crowder,2022-11-10,1,"Alright, Mr. Jeremy Quartering. Does your aud..."
6945,Louder_with_Crowder,2022-09-06,14,"You're running for press, you socialist cheat..."
6946,Louder_with_Crowder,2022-09-06,1,"Yeah, uh, don't get it twisted This late nigh..."
6947,Louder_with_Crowder,2022-09-07,2,It's Parody Week! You can find today's track ...


In [33]:
podcast_data['Month'] = podcast_data['Date'].apply(lambda x: pd.to_datetime(x).month)

In [37]:
podcast_data = podcast_data[podcast_data['Month'].isin(range(1,11))]

In [44]:
podcast_data = podcast_data.drop(columns='Month')
podcast_data

Unnamed: 0,Podcast,Date,Part,Transcript
0,Conservative_Review_with_Daniel_Horowitz,2022-05-12,1,And welcome back fellow American patriots and...
2,Conservative_Review_with_Daniel_Horowitz,2022-02-10,1,"Set five years after Jedi Fallen Order, follo..."
3,Conservative_Review_with_Daniel_Horowitz,2022-05-16,1,Politics Without the Soap Opera with unfilter...
4,Conservative_Review_with_Daniel_Horowitz,2022-09-13,1,And welcome back fellow American patriots and...
7,Conservative_Review_with_Daniel_Horowitz,2022-03-14,1,And welcome back fellow lab rats to the one a...
...,...,...,...,...
6943,Louder_with_Crowder,2022-03-22,1,"Ahoy audio listeners, it's Dave and you're li..."
6945,Louder_with_Crowder,2022-09-06,14,"You're running for press, you socialist cheat..."
6946,Louder_with_Crowder,2022-09-06,1,"Yeah, uh, don't get it twisted This late nigh..."
6947,Louder_with_Crowder,2022-09-07,2,It's Parody Week! You can find today's track ...


In [49]:
podcast_data['Transcript'] = podcast_data['Transcript'].apply(pre_processing)
podcast_data

Unnamed: 0,Podcast,Date,Part,Transcript
0,Conservative_Review_with_Daniel_Horowitz,2022-05-12,1,And welcome back fellow American patriots and...
2,Conservative_Review_with_Daniel_Horowitz,2022-02-10,1,"Set five years after Jedi Fallen Order, follo..."
3,Conservative_Review_with_Daniel_Horowitz,2022-05-16,1,Politics Without the Soap Opera with unfilter...
4,Conservative_Review_with_Daniel_Horowitz,2022-09-13,1,And welcome back fellow American patriots and...
7,Conservative_Review_with_Daniel_Horowitz,2022-03-14,1,And welcome back fellow lab rats to the one a...
...,...,...,...,...
6943,Louder_with_Crowder,2022-03-22,1,"Ahoy audio listeners, it's Dave and you're li..."
6945,Louder_with_Crowder,2022-09-06,14,"You're running for press, you socialist cheat..."
6946,Louder_with_Crowder,2022-09-06,1,"Yeah, uh, don't get it twisted This late nigh..."
6947,Louder_with_Crowder,2022-09-07,2,It's Parody Week! You can find today's track ...


In [15]:
# j = 0
# for i in podcast_data['Transcript']:
#     if j <=5:
#         print(i)
#         print("**********************************")
#         print()
#     j +=1

In [51]:
podcast_data.to_csv('Podcast/podcast_data_processed.csv')

### TV Data

In [52]:
tv_data = pd.read_csv('TV/tv_data_raw_2022.csv')
tv_data

Unnamed: 0,addeddate,contributor,description,program,tuner,text
0,2022-02-02 08:00:00,MSNBCW,"""Chris Hayes reports on some of the biggest ne...",All In With Chris Hayes,Virtual Ch. 787,"tonight on all in. >> they know it\'s true, th..."
1,2022-01-02 00:30:00,RT,"""Markets! Finance! Scandal! Keiser Report i...",Keiser Report,Channel IPTV,"""ah ah whoa, happy new year special kaiser rep..."
2,2022-01-06 13:30:00,RT,Former First Minister of Scotland Alex Salmond...,The Alex Salmond Show,Channel IPTV,"""ah, for june . welcome to the alexander. so i..."
3,2022-01-14 05:00:00,RT,"RT news, interviews and shows available as pod...",News,Channel IPTV,"""ah ah, his royal highness, no more. queen eli..."
4,2022-01-16 17:00:00,RT,"RT news, interviews and shows available as pod...",News,Channel IPTV,"""ah, with tennis superstar new york of h is de..."
...,...,...,...,...,...,...
43279,2022-05-13 03:00:00,FOXNEWSW,Greg Gutfeld examines the news of the day thro...,Gutfeld!,Virtual Ch. 760,who bought them? >> they sent them an email. >...
43280,2022-10-27 00:30:00,RT,"""RT's documentaries give a varied and unique v...",Documentary,Channel IPTV,"""have any quality for russians, all we gotta d..."
43281,2022-10-26 17:30:00,RT,"""RT's documentaries give a varied and unique v...",Documentary,Channel IPTV,"""ah ah a ah with. ringback ringback ah, becaus..."
43282,2022-10-19 05:00:00,FOXNEWSW,Powerful analysis and spirited debates with gu...,Tucker Carlson Tonight,Virtual Ch. 760,""">> they willw em. they fought for us .sn no't..."


In [53]:
tv_data['text'] = tv_data['text'].apply(pre_processing)
tv_data

Unnamed: 0,addeddate,contributor,description,program,tuner,text
0,2022-02-02 08:00:00,MSNBCW,"""Chris Hayes reports on some of the biggest ne...",All In With Chris Hayes,Virtual Ch. 787,"tonight on all in. >> they know it\'s true, th..."
1,2022-01-02 00:30:00,RT,"""Markets! Finance! Scandal! Keiser Report i...",Keiser Report,Channel IPTV,"""ah ah whoa, happy new year special kaiser rep..."
2,2022-01-06 13:30:00,RT,Former First Minister of Scotland Alex Salmond...,The Alex Salmond Show,Channel IPTV,"""ah, for june . welcome to the alexander. so i..."
3,2022-01-14 05:00:00,RT,"RT news, interviews and shows available as pod...",News,Channel IPTV,"""ah ah, his royal highness, no more. queen eli..."
4,2022-01-16 17:00:00,RT,"RT news, interviews and shows available as pod...",News,Channel IPTV,"""ah, with tennis superstar new york of h is de..."
...,...,...,...,...,...,...
43279,2022-05-13 03:00:00,FOXNEWSW,Greg Gutfeld examines the news of the day thro...,Gutfeld!,Virtual Ch. 760,who bought them? >> they sent them an email. >...
43280,2022-10-27 00:30:00,RT,"""RT's documentaries give a varied and unique v...",Documentary,Channel IPTV,"""have any quality for russians, all we gotta d..."
43281,2022-10-26 17:30:00,RT,"""RT's documentaries give a varied and unique v...",Documentary,Channel IPTV,"""ah ah a ah with. ringback ringback ah, becaus..."
43282,2022-10-19 05:00:00,FOXNEWSW,Powerful analysis and spirited debates with gu...,Tucker Carlson Tonight,Virtual Ch. 760,""">> they willw em. they fought for us .sn no't..."


In [54]:
# j = 0
# for i in tv_data['text']:
#     if j <=5:
#         print(i)
#         print("**********************************")
#         print()
#     j +=1

In [57]:
tv_data.to_csv('TV/tv_data_processed.csv')