In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from itertools import chain
from collections import Counter
import string
import random
from itertools import islice
import json


import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from transformers import pipeline
import pickle

In [2]:
# Load the e-mail pickle (path is relative to the notebook's working directory).
# The next cell reads its 'unigrams' and 'bigrams' entries.
# NOTE(review): unpickling can execute arbitrary code — only load files from a
# trusted source.
email_data = pd.read_pickle('Email/emails_uni_bi_neg.pkl')

In [3]:
# Pull out the e-mail unigram and bigram tables. Later cells iterate both with
# .items() as (n-gram, frequency) pairs.
email_unigrams = email_data['unigrams']
email_bigrams = email_data['bigrams']

In [4]:
# Load the podcast pickle (path is relative to the notebook's working directory).
# The next cell reads its 'unigrams' and 'bigrams' entries.
# NOTE(review): unpickling can execute arbitrary code — only load files from a
# trusted source.
podcast_data = pd.read_pickle('Podcast/podcast_uni_bi_neg.pkl')

In [5]:
# Pull out the podcast unigram and bigram tables. Later cells iterate both with
# .items() as (n-gram, frequency) pairs.
podcast_unigrams = podcast_data['unigrams']
podcast_bigrams = podcast_data['bigrams']

In [6]:
# Load the TV pickle (path is relative to the notebook's working directory).
# The next cell reads its 'unigrams' and 'bigrams' entries.
# NOTE(review): this file name has no `_neg` suffix, unlike the e-mail and
# podcast pickles — confirm it was produced with the same preprocessing.
# NOTE(review): unpickling can execute arbitrary code — only load files from a
# trusted source.
tv_data = pd.read_pickle('TV/TV_uni_bigrams.pkl')

In [7]:
# Pull out the TV unigram and bigram tables. Later cells iterate both with
# .items() as (n-gram, frequency) pairs.
tv_unigrams = tv_data['unigrams']
tv_bigrams = tv_data['bigrams']

In [8]:
# Per-source "distinctive" unigrams.
#
# For every token we compute the share of its occurrences that come from each
# source (source count / count summed over e-mails, podcasts and TV) and keep
# the TOP_N_UNIGRAMS tokens with the highest share per source.
TOP_N_UNIGRAMS = 200

# Single place that names the sources; insertion order fixes the order of the
# result dictionaries (emails, podcasts, tv).
unigram_sources = {
    "emails": email_unigrams,
    "podcasts": podcast_unigrams,
    "tv": tv_unigrams,
}

# token -> occurrences summed over all sources
total_frequencies = {}
for source_unigrams in unigram_sources.values():
    for word, freq in source_unigrams.items():
        total_frequencies[word] = total_frequencies.get(word, 0) + freq

# source -> {token -> fraction of the token's total occurrences seen in source}
relative_frequencies = {
    source: {
        word: freq / total_frequencies[word]
        for word, freq in source_unigrams.items()
    }
    for source, source_unigrams in unigram_sources.items()
}

# source -> [(token, total count across all sources), ...], most distinctive first
unique_unigrams = {}
for source, source_relative_frequencies in relative_frequencies.items():
    # Every token that occurs in only one source has a share of exactly 1.0, so
    # the share alone leaves a very large tie. Break it explicitly by total
    # count (most frequent first) instead of depending on the iteration order
    # of the loaded pickle.
    keys = sorted(
        source_relative_frequencies,
        key=lambda word: (source_relative_frequencies[word], total_frequencies[word]),
        reverse=True,
    )[:TOP_N_UNIGRAMS]
    unique_unigrams[source] = [(key, total_frequencies[key]) for key in keys]

In [9]:
# E-mail-distinctive unigrams as (token, total count across all three sources),
# ordered by the share of the token's occurrences that come from e-mails.
# NOTE(review): the saved output contains tokens that look like a recipient
# e-mail address and postal codes — consider redacting before sharing. The
# seemingly empty tokens look like invisible padding characters (e.g. the
# '\u200b' entry); presumably they should be stripped upstream — confirm.
unique_unigrams['emails']

[('͏', 258721),
 ('dpwillis67gmailcom', 24331),
 ('→', 13661),
 ('͏͏', 7754),
 ('todpwillis67gmailcom', 5388),
 ('tounsubscribe', 4501),
 ('endofmonth', 4367),
 ('➞', 4352),
 ('22314', 4094),
 ('★', 3374),
 ('yoursubscription', 3233),
 ('endofquarter', 3084),
 ('20003', 2796),
 ('pleaseclick', 2513),
 ('actionnetworkorg', 2461),
 ('oraddress', 2461),
 ('noncontribution', 2181),
 ('candidatescommittee', 2139),
 ('taxpurposes', 2103),
 ('anymoreunsubscribe', 2040),
 ('majoritypac', 2009),
 ('grassrootssupporters', 1792),
 ('donationwill', 1791),
 ('paymentinformation', 1730),
 ('pleaseunsubscribe', 1693),
 ('22219', 1564),
 ('page625', 1489),
 ('88022', 1483),
 ('32853', 1362),
 ('box536926', 1255),
 ('preferencespage625', 1248),
 ('implyendorsement', 1242),
 ('unsubscribefrom', 1220),
 ('\u200b', 1199),
 ('98104', 1192),
 ('ouremails', 1185),
 ('receiveemail', 1184),
 ('caraveo', 1135),
 ('inerror', 1112),
 ('removeyourself', 1096),
 ('ruinour', 1080),
 ('❌', 1079),
 ('averageonline', 1

In [10]:
# Podcast-distinctive unigrams as (token, total count across all three sources),
# ordered by the share of the token's occurrences that come from podcasts.
# NOTE(review): contractions in the saved output use a prime character (it′s,
# don′t) rather than an apostrophe, which presumably is why they rank as
# podcast-only tokens — consider normalising the character upstream.
unique_unigrams['podcasts']

[('it′s', 3145),
 ('puretalkcom', 2370),
 ('don′t', 1938),
 ('that′s', 1894),
 ('877444gold', 1625),
 ('8773813811', 1592),
 ('moink', 1410),
 ('expressvpn', 1369),
 ('i′m', 1339),
 ('§§', 1338),
 ('65532', 1220),
 ('you′re', 1207),
 ('they′re', 1184),
 ('hometitlelockcom', 1159),
 ('chumba', 1131),
 ('betmgm', 1015),
 ('we′re', 1001),
 ('billoreillycom', 965),
 ('levinforhillsdalecom', 954),
 ('989898', 939),
 ('expressvpncom', 869),
 ('goldmines', 868),
 ('contingencymedicalcom', 856),
 ('poso', 785),
 ('800941sean', 740),
 ('there′s', 728),
 ('goodrancherscom', 710),
 ('timcastcom', 685),
 ('moinkboxcom', 682),
 ('levinpodcast', 668),
 ('goldcocom', 666),
 ('he′s', 651),
 ('patriotmobilecom', 614),
 ('8009062440', 592),
 ('mypillowcom', 582),
 ('warroomhealthcom', 560),
 ('americanfinancingnet', 554),
 ('goldline', 521),
 ('hillsdalecom', 513),
 ('can′t', 503),
 ('inventhelp', 497),
 ('relieffactorcom', 451),
 ('lifelockcom', 442),
 ('blindscom', 442),
 ('unpluggedcom', 394),
 ('nml

In [11]:
# TV-distinctive unigrams as (token, total count across all three sources),
# ordered by the share of the token's occurrences that come from TV.
# NOTE(review): tokens such as 'xe2x80x94', 'xe2x99xaa' and 'xefxbfxbd' in the
# saved output match the UTF-8 byte escapes \xe2\x80\x94 (em dash),
# \xe2\x99\xaa (music note) and \xef\xbf\xbd (replacement character) with the
# backslashes removed. This suggests the TV text was stringified from bytes
# instead of decoded before tokenisation — confirm and fix upstream.
unique_unigrams['tv']

[('xe2x80x94', 363567),
 ('xe2x99xaa', 355856),
 ('xe2x80x94xe2x80x94', 41798),
 ('xe2x99xaaxe2x99xaa', 24526),
 ('xefxbfxbd', 23754),
 ('borisjohnson', 22464),
 ('trelegy', 19258),
 ('rinvoq', 19033),
 ('safelite', 18377),
 ('ofxe2x80x94', 18220),
 ('ozempicxc2xae', 17069),
 ('rybelsusxc2xae', 14820),
 ('oncedaily', 10967),
 ('otezla', 9732),
 ('skyrizi', 9158),
 ('0ur', 8398),
 ('0k', 8246),
 ('boostxc2xae', 8131),
 ('toxe2x80x94', 8110),
 ('botoxxc2xae', 8052),
 ('limu', 7873),
 ('tremfyaxc2xae', 7714),
 ('southxe2x80x94east', 7663),
 ('isxe2x80x94', 7562),
 ('itxefxbfxbds', 7405),
 ('forxe2x80x94', 6810),
 ('cabenuva', 6782),
 ('ixe2x80x94', 6500),
 ('itxe2x80x94', 6422),
 ('astepro', 6333),
 ('dupixent', 6309),
 ('northxe2x80x94west', 6230),
 ('relaxium', 6140),
 ('neuriva', 5995),
 ('southxe2x80x94west', 5919),
 ('latuda', 5658),
 ('jardiance', 5630),
 ('psoriatic', 5595),
 ('thatxe2x80x94', 5505),
 ('prevagen', 5404),
 ('indeedcomhire', 5181),
 ('ahah', 5006),
 ('shingrix', 4964

In [12]:
# Per-source "distinctive" bigrams.
#
# Same procedure as the unigram cell above: for every bigram compute the share
# of its occurrences that come from each source and keep the TOP_N_BIGRAMS
# bigrams with the highest share per source.
# NOTE(review): this duplicates the unigram logic; consider factoring both
# cells into one helper that takes the per-source count tables.
TOP_N_BIGRAMS = 200

# Single place that names the sources; insertion order fixes the order of the
# result dictionaries (emails, podcasts, tv).
bigram_sources = {
    "emails": email_bigrams,
    "podcasts": podcast_bigrams,
    "tv": tv_bigrams,
}

# bigram -> occurrences summed over all sources
total_bigram_frequencies = {}
for source_bigrams in bigram_sources.values():
    for bigram, freq in source_bigrams.items():
        total_bigram_frequencies[bigram] = total_bigram_frequencies.get(bigram, 0) + freq

# source -> {bigram -> fraction of the bigram's total occurrences seen in source}
relative_bigram_frequencies = {
    source: {
        bigram: freq / total_bigram_frequencies[bigram]
        for bigram, freq in source_bigrams.items()
    }
    for source, source_bigrams in bigram_sources.items()
}

# source -> [(bigram, total count across all sources), ...], most distinctive first
unique_bigrams = {}
for source, source_relative_frequencies in relative_bigram_frequencies.items():
    # Bigrams seen in only one source all have a share of exactly 1.0, so the
    # share alone leaves a very large tie. Break it explicitly by total count
    # (most frequent first) instead of depending on the iteration order of the
    # loaded pickle.
    keys = sorted(
        source_relative_frequencies,
        key=lambda bigram: (source_relative_frequencies[bigram], total_bigram_frequencies[bigram]),
        reverse=True,
    )[:TOP_N_BIGRAMS]
    unique_bigrams[source] = [(key, total_bigram_frequencies[key]) for key in keys]

In [13]:
# E-mail-distinctive bigrams as ((first, second), total count across all three
# sources), ordered by the share of the bigram's occurrences that come from
# e-mails.
# NOTE(review): pairs such as ('can', '’') in the saved output suggest the
# typographic apostrophe is emitted as its own token, splitting contractions —
# confirm the tokenisation upstream. The output also contains tokens that look
# like a recipient e-mail address; consider redacting before sharing.
unique_bigrams['emails']

[(('͏', '͏'), 250200),
 (('to', 'dpwillis67gmailcom'), 16697),
 (('can', '’'), 15731),
 (('actblue', 'express'), 8909),
 (('please', 'unsubscribe'), 7884),
 (('͏', '͏͏'), 7747),
 (('͏͏', '͏'), 7712),
 (('fewer', 'emails'), 7457),
 (('receive', 'email'), 7168),
 (('now', '»'), 6451),
 (('receive', 'fewer'), 5334),
 (('sent', 'todpwillis67gmailcom'), 5289),
 (('unsubscribe', 'here'), 5236),
 (('alexandria', 'va'), 4123),
 (('va', '22314'), 4055),
 (('not', 'deductible'), 4028),
 (('contribution', 'account'), 4021),
 (('fundraising', 'deadline'), 3973),
 (('183', 'alexandria'), 3830),
 (('dpwillis67gmailcom', 'to'), 3829),
 (('street', '183'), 3714),
 (('»', 'donate'), 3613),
 (('isn', '’'), 3347),
 (('america', 'jfc'), 3305),
 (('other', 'amount'), 3235),
 (('change', 'yoursubscription'), 3231),
 (('doesn', '’'), 2897),
 (('dpwillis67gmailcom', 'should'), 2874),
 (('unsubscribe', 'for'), 2855),
 (('subscribed', 'or'), 2785),
 (('yoursubscription', 'information'), 2785),
 (('unsubscribe',

In [14]:
# Podcast-distinctive bigrams as ((first, second), total count across all three
# sources), ordered by the share of the bigram's occurrences that come from
# podcasts.
unique_bigrams['podcasts']

[(('beck', 'program'), 3286),
 (('mr', 'producer'), 1752),
 (('to', 'puretalkcom'), 1574),
 (('levin', 'podcast'), 1392),
 (('birch', 'gold'), 1384),
 (('enter', 'promo'), 1341),
 (('§§', '§§'), 1302),
 (('beautiful', 'anonymous'), 1282),
 (('oreilly', 'update'), 1268),
 (('to', '65532'), 1220),
 (('free', 'silver'), 1218),
 (('michael', 'savage'), 1176),
 (('the', 'oreilly'), 1101),
 (('877444gold', 'or'), 1092),
 (('text', 'gold'), 1042),
 (('rough', 'greens'), 1030),
 (('good', 'ranchers'), 1008),
 (('spin', 'news'), 994),
 (('code', 'sean'), 981),
 (('events', 'daily'), 928),
 (('slash', 'sean'), 925),
 (('knowles', 'show'), 922),
 (('to', '989898'), 916),
 (('slash', 'hannity'), 890),
 (('anonymous', 'every'), 847),
 (('bonus', 'bets'), 842),
 (('lock', 'dot'), 838),
 (('talk', 'text'), 817),
 (('contingency', 'medical'), 816),
 (('room', 'pandemic'), 786),
 (('i', 'don′t'), 777),
 (('thats', 'levin'), 775),
 (('slash', 'beck'), 768),
 (('slash', 'ben'), 763),
 (('tuttle', 'twins'

In [15]:
# TV-distinctive bigrams as ((first, second), total count across all three
# sources), ordered by the share of the bigram's occurrences that come from TV.
# NOTE(review): many entries in the saved output contain 'xe2x80x94' /
# 'xe2x99xaa', which match UTF-8 byte escapes with the backslashes removed —
# the TV text was presumably stringified from bytes rather than decoded;
# confirm and fix upstream.
unique_bigrams['tv']

[(('xe2x99xaa', 'xe2x99xaa'), 124106),
 (('xe2x80x94', 'the'), 18736),
 (('xe2x80x94', 'and'), 15905),
 (('comcast', 'business'), 15730),
 (('the', 'xe2x80x94'), 11553),
 (('sleep', 'number'), 9787),
 (('xe2x99xaa', 'ive'), 9763),
 (('weather', 'front'), 9658),
 (('xe2x99xaa', 'i'), 9217),
 (('clearer', 'skin'), 8069),
 (('man', 'xe2x99xaa'), 7761),
 (('smart', 'bed'), 7674),
 (('how', 'abbvie'), 7551),
 (('xe2x80x94', 'a'), 7284),
 (('number', '360'), 7046),
 (('the', 'programme'), 7000),
 (('boostxc2xae', 'high'), 6988),
 (('to', 'xe2x80x94'), 6987),
 (('and', 'xe2x80x94'), 6888),
 (('occur', 'tell'), 6883),
 (('need', 'xe2x99xaa'), 6776),
 (('xe2x80x94', 'that'), 6521),
 (('995', 'a'), 6298),
 (('a', 'xe2x80x94'), 6232),
 (('xe2x99xaa', 'liberty'), 6117),
 (('sunny', 'spells'), 6094),
 (('if', 'allergic'), 6084),
 (('xe2x80x94', 'but'), 6078),
 (('fisher', 'investments'), 6022),
 (('some', 'rain'), 5995),
 (('provider', 'about'), 5910),
 (('xe2x80x94', 'to'), 5794),
 (('xfinity', 'm