In [37]:
import pickle
import spacy
import pandas as pd
from collections import Counter

In [None]:
## Four Considerations!
'''
    1. Keeping only some article types, e.g. news and editorials/op-eds
    
    2. Removing sentences with bias
    
    3. Having balanced train/dev/test datasets
    
    4. Keeping only articles of a minimum sentence length!  --> 10-20?
    
'''

## Functions

In [39]:
def prepare_tsv(articles, labels, filename):
    """Write articles and their labels to a two-column TSV file.

    The file starts with the header ``article<TAB>label``. Each following row
    holds one article with every tab, newline and carriage return replaced by a
    single space (so the article cannot break the row/column structure),
    followed by its label.

    Args:
        articles: iterable of article strings.
        labels: iterable of labels, paired positionally with ``articles``
            (pairing stops at the shorter of the two, as with ``zip``).
        filename: path of the TSV file to create or overwrite.
    """
    # Explicit UTF-8: the articles contain non-ASCII characters (curly quotes,
    # dashes), and the platform default encoding is not UTF-8 everywhere.
    with open(filename, 'w', encoding='utf-8') as writer:
        writer.write('article\tlabel\n')
        for article, label in zip(articles, labels):
            flat_article = (article.replace('\t', ' ')
                                   .replace('\n', ' ')
                                   .replace('\r', ' '))
            writer.write('{}\t{}\n'.format(flat_article, label))

def sent_tsv_file(articles, filename):
    """Write every retained sentence of every article to a TSV file.

    The file starts with the header ``sentence<TAB>placeholder_label``. Each
    article is flattened (tabs, newlines and carriage returns become spaces),
    parsed with spaCy's ``en_core_web_sm`` pipeline, and passed through
    ``remove_biased_sent``; every sentence that comes back is written as
    ``<sentence><TAB>0``. After each article a marker row
    ``ARTICLE_SPLIT_LINE<TAB>0`` is written so articles can be told apart.

    Args:
        articles: iterable of article strings.
        filename: path of the TSV file to create or overwrite.
    """
    nlp = spacy.load('en_core_web_sm')

    # Explicit UTF-8: article text contains non-ASCII characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open(filename, 'w', encoding='utf-8') as writer:
        writer.write('sentence\tplaceholder_label\n')

        for i, article in enumerate(articles):
            flat_article = (article.replace('\t', ' ')
                                   .replace('\n', ' ')
                                   .replace('\r', ' '))

            doc = nlp(flat_article)
            # Relies on remove_biased_sent returning the sentences to keep.
            sentences = remove_biased_sent(i, doc.sents)
            for sent in sentences:
                writer.write('{}\t0\n'.format(sent))
            writer.write('ARTICLE_SPLIT_LINE\t0\n')

def remove_biased_sent(article_id, article_sents, bias_list=None):
    """Drop sentences that contain any of the bias marker strings.

    Args:
        article_id: identifier used only in the progress message.
        article_sents: iterable of sentences; either plain strings or objects
            exposing their text via a ``.text`` attribute.
        bias_list: optional list of substrings that mark a sentence as biased.
            Defaults to the placeholder markers ``bias_1``, ``bias_2`` and
            ``bias_3``.

    Returns:
        list: the sentences (original objects, original order) that contain
        none of the markers.
    """
    if bias_list is None:
        bias_list = ['bias_1', 'bias_2', 'bias_3']

    cleaned_sents, count = [], 0
    for sent in article_sents:
        # Search the sentence text. NOTE(review): for a spaCy Span, `in` on the
        # object itself would test against its tokens rather than do a
        # substring search - confirm against the spaCy version in use.
        sent_text = getattr(sent, 'text', sent)
        if any(bias in sent_text for bias in bias_list):
            count += 1
        else:
            cleaned_sents.append(sent)

    print('removed {} sentences from article {}'.format(count, article_id))
    # The caller iterates over the result, so it must be returned.
    return cleaned_sents

### Reading *Unclean* Data

In [40]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point these paths at files you created or otherwise trust.
# NOTE(review): the paths are absolute and machine-specific.
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training articles and their labels.
train = _load_pickle('/Users/tariq/Downloads/Bloomberg_Editorial_Classifier/data/train_test_v2/X_train.pkl')
train_labels = _load_pickle('/Users/tariq/Downloads/Bloomberg_Editorial_Classifier/data/train_test_v2/y_train.pkl')

# The X_test / y_test pickles are used as the dev split in this notebook.
dev = _load_pickle('/Users/tariq/Downloads/Bloomberg_Editorial_Classifier/data/train_test_v2/X_test.pkl')
dev_labels = _load_pickle('/Users/tariq/Downloads/Bloomberg_Editorial_Classifier/data/train_test_v2/y_test.pkl')

# Separately collected Metro Winnipeg data is used as the test split.
test = _load_pickle('/Users/tariq/Downloads/Bloomberg_Editorial_Classifier/data/collected/metro winnipeg - extra test/dev.pkl')

### Spacy Zone!

In [17]:
nlp = spacy.load('en_core_web_sm')


def _parse_articles(texts, pipeline):
    """Return one parsed document per text.

    Tabs, newlines and carriage returns are replaced by spaces before each
    text is passed to ``pipeline``.
    """
    parsed = []
    for text in texts:
        flat_text = text.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
        parsed.append(pipeline(flat_text))
    return parsed


train_articles = _parse_articles(train.article_text, nlp)
dev_articles = _parse_articles(dev.article_text, nlp)
# The test frame stores its article text in a `text` column.
test_articles = _parse_articles(test.text, nlp)

# Cache the parsed documents; `with` guarantees each file is flushed and
# closed before it is read back below.
with open('../pkl/spacy_train_sent.p', 'wb') as handle:
    pickle.dump(train_articles, handle)
with open('../pkl/spacy_dev_sent.p', 'wb') as handle:
    pickle.dump(dev_articles, handle)
with open('../pkl/spacy_test_sent.p', 'wb') as handle:
    pickle.dump(test_articles, handle)


# Reload from the cache (lets the parsing above be skipped on later runs).
with open('../pkl/spacy_train_sent.p', 'rb') as handle:
    train_articles = pickle.load(handle)
with open('../pkl/spacy_dev_sent.p', 'rb') as handle:
    dev_articles = pickle.load(handle)
with open('../pkl/spacy_test_sent.p', 'rb') as handle:
    test_articles = pickle.load(handle)

In [33]:
# Display the text of training article 254 for manual inspection.
train.article_text[254]

' A reminder for Mr. Trump who cancelled a trip to Denmark because of a "nasty" comment by its prime minister (who said selling Greenland to the United States is "an absurd discussion"), something we learned about age 5: sticks and stones may break my bones, but names will never hurt me.'

In [36]:
# Show every training article that spaCy splits into exactly ten sentences,
# with its position, followed by a blank separator line.
for idx, parsed in enumerate(train_articles):
    sentence_total = len(list(parsed.sents))
    if sentence_total != 10:
        continue
    print(idx, repr(parsed.text))
    print()

46 ' In an article Tuesday ("Coastal Commission considers limiting off-road access to Oceano Dunes," July 2), the term "right" was used to describe use of the dunes by off-roaders. One must be reminded that the word "right" is a very strong and important word. It is something you have as a part of being a citizen or human. It is not given to you. You always have it, as in civil rights. So, when describing the issue of off-roaders on public lands, a word that more accurately defines the issue is "permission," something that can be dissolved if facts change. A driverâ€™s license is a "permission" issued by the state. It can be denied for many reasons, such as reckless speeding. We must not use the words "right" and "permission" interchangeably. Each one has a purpose.'

47 ' While sitting in the patio enjoying a cup of coffee and reading The Californian on the morning of the Fourth of July, I came across a column written by Matt Munoz, a Californian contributing columnist for what’s happ

1789 'Editor: President Donald Trump stood in the rain to honor our great country and the freedom we all enjoy. This speech was not intended to be a book report on history or English literature. An article by Gilliam Brockell of The Washington Post on July 5 didn’t mention the heroics of Sgt. Alvin York, or the young men who took Omaha Beach; it didn’t mention the Korean conflict, when soldiers were forced to retreat with frozen bodies and feet; the Vietnam conflict when many young brave Americans lost their lives for the cause of freedom; or recent casualties of those who served in a voluntary status to protect our freedom, many deployed four or six times. Freedom isn’t free, dear readers. Our president, like former presidents has kept our great America free from all enemies, foreign and domestic. May the God of our fathers continue to protect us. “God Bless America,” the home of the brave and the land of the free. William O. “Bill” Jordan Charleston'

1909 'Editor, As the spouse of a

3836 " ENID, Okla. — Enid Police Department is seeking the public's help in locating a person seen in surveillance video burglarizing a local Tex-Mex restaurant earlier this month. About 6:30 a.m. Oct. 8, 2018, EPD officers were sent to Taco Mayo, 118 N. Van Buren, in reference to a burglary. Officers found the cash register drawer and the safe open. Officers were advised by Taco Mayo management an undisclosed amount of money was taken from the business. Anyone with information leading to the arrest or prosecution of this or any crime can report an anonymous tip with Crime Stoppers at (580) 233-6233, www.enid.org/departments/police or text to 274637, typing “Enid” and a message in the text box. Those submitting a tip could earn a reward up to $1,000 and will not be required to testify nor be identified. Click for the latest,full-access Enid News & Eagle headlines|Text Alerts|app downloads "

3897 ' COVINGTON, Okla. — A 24-year-old Perry man and his passenger were injured early Saturday

5737 "LOS ANGELES — One of the Los Angeles houses where followers of Charles Manson committed notorious murders in 1969 is for sale. The home in the hilly Los Feliz district is where Leno and Rosemary LaBianca were slain the night after actress Sharon Tate and four others were murdered by Manson followers in Benedict Canyon. Redfin listing agent Robert Giambalvo tells the Los Angeles Times the two-bedroom home is priced at $1.98 million. The house falls outside a requirement to inform buyers if a death occurred in a property in the previous three years, but Giambalvo says he noted on the multiple listing service that it's the LaBianca house and agents should do research before showing it. The house has changed hands several times since 1969 and last sold in 1998. ___ Information from: Los Angeles Times, http://www.latimes.com/"

5819 'The internet can\'t help but swoon over Stephen and Ayesha Curry\'s wedding anniversary. The Warriors superstar and the lifestyle guru recently took to I

# Archive

In [12]:
# Collect (position, tag, label, publisher) per training article and print
# each row as: position, *label*, publisher, tag (tab-separated).
train_info = []
train_rows = zip(train.tag, train_labels.label, train.source_name)
for idx, (tag, label, pub) in enumerate(train_rows):
    record = (idx, tag, label, pub)
    train_info.append(record)
    print('{0}\t*{2}*\t{3}\t{1}'.format(*record))

0	*1*	Californian	opinion
1	*1*	Californian	opinion, community-voices
2	*1*	Californian	opinion, our-view
3	*1*	Californian	community-voices
4	*1*	Californian	opinion, letters-to-editor
5	*1*	Californian	opinion
6	*1*	Californian	opinion, community-voices
7	*1*	Californian	opinion, community-voices
8	*1*	Californian	opinion
9	*1*	Californian	opinion
10	*1*	Californian	opinion, community-voices
11	*1*	Californian	opinion, community-voices
12	*1*	Californian	opinion
13	*1*	Californian	opinion
14	*1*	Californian	opinion, letters-to-editor
15	*1*	Californian	opinion, letters-to-editor
16	*1*	Californian	opinion, letters-to-editor
17	*1*	Californian	opinion, community-voices
18	*1*	Californian	opinion, community-voices
19	*1*	Californian	opinion, community-voices
20	*1*	Californian	opinion
21	*1*	Californian	opinion, community-voices
22	*1*	Californian	opinion
23	*1*	Californian	opinion, community-voices
24	*1*	Californian	opinion, community-voices
25	*1*	Californian	opinion
26	*1*	Californ

1357	*1*	Gazette-mail	columnists
1358	*1*	Gazette-mail	editorial
1359	*1*	Gazette-mail	readers_vent
1360	*1*	Gazette-mail	columnists
1361	*1*	Gazette-mail	editorial
1362	*1*	Gazette-mail	columnists
1363	*1*	Gazette-mail	readers_vent
1364	*1*	Gazette-mail	columnists
1365	*1*	Gazette-mail	op_ed_commentaries
1366	*1*	Gazette-mail	editorial
1367	*1*	Gazette-mail	columnists
1368	*1*	Gazette-mail	op_ed_commentaries
1369	*1*	Gazette-mail	readers_vent
1370	*1*	Gazette-mail	op_ed_commentaries
1371	*1*	Gazette-mail	op_ed_commentaries
1372	*1*	Gazette-mail	op_ed_commentaries
1373	*1*	Gazette-mail	columnists
1374	*1*	Gazette-mail	columnists
1375	*1*	Gazette-mail	editorial
1376	*1*	Gazette-mail	nan
1377	*1*	Gazette-mail	columnists
1378	*1*	Gazette-mail	editorial
1379	*1*	Gazette-mail	columnists
1380	*1*	Gazette-mail	nan
1381	*1*	Gazette-mail	columnists
1382	*1*	Gazette-mail	columnists
1383	*1*	Gazette-mail	editorial
1384	*1*	Gazette-mail	readers_vent
1385	*1*	Gazette-mail	columnists
1386	*1*	Gazett

2797	*1*	Press Democrat	nan
2798	*1*	Press Democrat	nan
2799	*1*	Press Democrat	nan
2800	*1*	Press Democrat	nan
2801	*1*	Press Democrat	nan
2802	*1*	Press Democrat	nan
2803	*1*	Press Democrat	nan
2804	*1*	Press Democrat	nan
2805	*1*	Press Democrat	nan
2806	*1*	Press Democrat	nan
2807	*1*	Press Democrat	nan
2808	*1*	Press Democrat	nan
2809	*1*	Press Democrat	nan
2810	*1*	Press Democrat	nan
2811	*1*	Press Democrat	nan
2812	*1*	Press Democrat	nan
2813	*1*	Press Democrat	nan
2814	*1*	Press Democrat	nan
2815	*1*	Press Democrat	nan
2816	*1*	Press Democrat	nan
2817	*1*	Press Democrat	nan
2818	*1*	Press Democrat	nan
2819	*1*	Press Democrat	nan
2820	*1*	Press Democrat	nan
2821	*1*	Press Democrat	nan
2822	*1*	Press Democrat	nan
2823	*1*	Press Democrat	nan
2824	*1*	Press Democrat	nan
2825	*1*	Press Democrat	nan
2826	*1*	Press Democrat	nan
2827	*1*	Press Democrat	nan
2828	*1*	Press Democrat	nan
2829	*1*	Press Democrat	nan
2830	*1*	Press Democrat	nan
2831	*1*	Press Democrat	nan
2832	*1*	Press Democ

4296	*0*	Enid News	tyson seng, team, david allen memorial ballpark, sport, baseball
4297	*0*	Enid News	luncheon, ywca enid, company, politics, economics, criminal law, ywca, cheri ezzell, fundraiser, auction, jennifer bauman, shanna parker
4298	*0*	Enid News	tyler christians, wade burleson, immigration, sociology, christianity, law, telephony, emmanuel enid, seminar, reservation
4299	*0*	Enid News	far from home, peter parker, film, cinema, mysterio, jake gyllenhaal, scene, kind, trailer
4300	*0*	Enid News	rayjohn ramsey, craig tirey, interception, sport, american football, omaha, touchdown, amarillo, award
4301	*0*	Enid News	lane, release, w. chestnut ave., highway, construction, estimate, detour, official
4302	*0*	Enid News	rape, crime, law, criminal law, economics, police, danielle tudor, suzanne breedlove, wage, fund, rate, sheri amore dickerson
4303	*0*	Enid News	full-longform, keith butricks, robin bench, brook arbeitman, police, crime, criminal law, social services, case, departm

5421	*0*	NW Florida Daily	news
5422	*0*	NW Florida Daily	news
5423	*0*	NW Florida Daily	news
5424	*0*	NW Florida Daily	news
5425	*0*	NW Florida Daily	news
5426	*0*	NW Florida Daily	news
5427	*0*	NW Florida Daily	sports
5428	*0*	NW Florida Daily	news
5429	*0*	NW Florida Daily	news
5430	*0*	NW Florida Daily	news
5431	*0*	NW Florida Daily	entertainmentlife
5432	*0*	NW Florida Daily	news
5433	*0*	NW Florida Daily	news
5434	*0*	NW Florida Daily	news
5435	*0*	NW Florida Daily	news
5436	*0*	NW Florida Daily	news
5437	*0*	NW Florida Daily	news
5438	*0*	NW Florida Daily	news
5439	*0*	NW Florida Daily	news
5440	*0*	NW Florida Daily	sports
5441	*0*	NW Florida Daily	news
5442	*0*	NW Florida Daily	news
5443	*0*	NW Florida Daily	news
5444	*0*	NW Florida Daily	news
5445	*0*	NW Florida Daily	news
5446	*0*	NW Florida Daily	sports
5447	*0*	NW Florida Daily	news
5448	*0*	NW Florida Daily	entertainmentlife
5449	*0*	NW Florida Daily	news
5450	*0*	NW Florida Daily	sports
5451	*0*	NW Florida Daily	news
5452	

In [69]:
# Percentage of documents in `articles` with fewer than 60 sentences.
# NOTE(review): `articles` is not defined in the visible cells, and 6386 is a
# hard-coded denominator (it matches the training-set size printed elsewhere
# in this notebook) - confirm both before re-running.
short_total = 0
for idx in range(len(articles)):
    if len(list(articles[idx].sents)) < 60:
        short_total += 1
short_total/6386*100.00

90.13466958972754

In [38]:
# Flatten every training tag string into individual tags.
# Float entries (missing tags) are skipped; comma-separated strings are split,
# and when the string starts with '[' the outer brackets plus one wrapping
# character on each side of every piece are stripped as well.
cleaned_tags = []
for raw_tag in train.tag:
    if type(raw_tag) is float:
        continue
    if ',' not in raw_tag:
        cleaned_tags.append(raw_tag)
        continue
    if raw_tag[0] == '[':
        pieces = [piece.strip()[1:-1] for piece in raw_tag[1:-1].split(',')]
    else:
        pieces = [piece.strip() for piece in raw_tag.split(',')]
    cleaned_tags.extend(pieces)

len(cleaned_tags), Counter(cleaned_tags).most_common()

(18552,
 [('news', 609),
  ('opinion', 388),
  ('politics', 384),
  ('enid', 237),
  ('economics', 224),
  ('sport', 215),
  ('op_ed_commentaries', 214),
  ('columnists', 213),
  ('editorial', 188),
  ('letters-to-editor', 172),
  ('law', 166),
  ('commerce', 151),
  ('education', 146),
  ('Not available', 129),
  ('oklahoma', 119),
  ('work', 119),
  ('', 114),
  ('criminal law', 98),
  ('school', 95),
  ('sports', 94),
  ('christianity', 89),
  ('institutes', 89),
  ('community-voices', 83),
  ('crime', 81),
  ('military', 79),
  ('medicine', 79),
  ('food', 73),
  ('worship', 72),
  ('readers_vent', 71),
  ('american football', 70),
  ('baseball', 69),
  ('business', 65),
  ('finance', 63),
  ('wvu', 60),
  ('company', 58),
  ('kanawha_county', 51),
  ('police', 49),
  ('student', 47),
  ('transports', 47),
  ('building industry', 44),
  ('football', 42),
  ('legislation', 41),
  ('music', 41),
  ('marshall_university', 41),
  ('entertainment', 39),
  ('team', 38),
  ('cops_and_cour

In [37]:
# Build one list of tags per training article (same order as train.tag).
# Float entries (missing tags) become ['nan'] so positions stay aligned.
cleaned_tags_list = []
for raw_tag in train.tag:
    if type(raw_tag) is float:
        article_tags = ['nan']
    elif ',' not in raw_tag:
        article_tags = [raw_tag]
    elif raw_tag[0] == '[':
        # Bracketed list: drop the brackets, then one wrapping character
        # on each side of every piece.
        article_tags = [piece.strip()[1:-1] for piece in raw_tag[1:-1].split(',')]
    else:
        article_tags = [piece.strip() for piece in raw_tag.split(',')]
    cleaned_tags_list.append(article_tags)

len(cleaned_tags_list)

6386

In [80]:
# Positions of articles carrying at least one opinion-style tag.
op_ids = [
    idx
    for idx, tags in enumerate(cleaned_tags_list)
    if any(name in tags for name in ('opinion', 'editorial', 'op_ed_commentaries'))
]
# Positions of 'news'-tagged articles beyond position 3192.
# NOTE(review): 3192 is a hard-coded cut-off whose rationale is not shown here.
news_ids = [
    idx
    for idx, tags in enumerate(cleaned_tags_list)
    if 'news' in tags and idx > 3192
]

In [81]:
# Sizes of the opinion-style and news id lists.
len(op_ids),len(news_ids)

(754, 440)

In [83]:
# Publisher distribution of the selected news articles.
# A set makes each membership test constant-time instead of scanning the
# whole id list for every article; the resulting counts are unchanged.
news_id_set = set(news_ids)
Counter([pub for i, pub in enumerate(train.source_name) if i in news_id_set]).most_common()

[('Californian', 239),
 ('NW Florida Daily', 131),
 ('Gazette-mail', 68),
 ('Washington Observer Report', 2)]

In [85]:
# Publisher distribution of the selected opinion-style articles.
# A set makes each membership test constant-time instead of scanning the
# whole id list for every article; the resulting counts are unchanged.
op_id_set = set(op_ids)
Counter([pub for i, pub in enumerate(train.source_name) if i in op_id_set]).most_common()

[('Gazette-mail', 387), ('Californian', 361), ('Enid News', 6)]

## Dev Set

In [47]:
# Collect (position, tag, label, publisher) per dev article and print each
# row as: position, *label*, publisher, tag (tab-separated).
dev_info = []
dev_rows = zip(dev.tag, dev_labels.label, dev.source_name)
for idx, (tag, label, pub) in enumerate(dev_rows):
    record = (idx, tag, label, pub)
    dev_info.append(record)
    print('{0}\t*{2}*\t{3}\t{1}'.format(*record))

0	*0*	Californian	food
1	*0*	Californian	news
2	*0*	Californian	kern-county-fair, news
3	*1*	Californian	opinion, community-voices
4	*0*	Californian	new products and services, products and services, corporate news, business, oral health, health, genomics, biology, science
5	*0*	Californian	infanticide, homicide, violent crime, crime, general news
6	*0*	Californian	arts-theater
7	*0*	Californian	criminal investigations, crime, general news, law and order, arrests, legal proceedings, missing persons
8	*0*	Californian	military and defense, government and politics, school administration, education, social affairs, undergraduate education, higher education
9	*0*	Californian	news
10	*0*	Californian	news, education
11	*0*	Californian	news
12	*0*	Californian	entertainment
13	*0*	Californian	news, education
14	*0*	Californian	news
15	*0*	Californian	national
16	*0*	Californian	birds, animals
17	*0*	Californian	news
18	*0*	Californian	news
19	*0*	Californian	news
20	*0*	Californian	news
21	*0*	C

1322	*0*	Gazette-mail	arts_and_entertainment
1323	*0*	Gazette-mail	nan
1324	*0*	Gazette-mail	nan
1325	*0*	Gazette-mail	nan
1326	*0*	Gazette-mail	nan
1327	*0*	Gazette-mail	nan
1328	*0*	Gazette-mail	nan
1329	*1*	Gazette-mail	nan
1330	*0*	Gazette-mail	one_month_at_a_time
1331	*0*	Gazette-mail	kanawha_valley, education
1332	*0*	Gazette-mail	all_eers_podcast, wvu
1333	*0*	Gazette-mail	education, kanawha_valley
1334	*0*	Gazette-mail	energy_and_environment
1335	*0*	Gazette-mail	news
1336	*0*	Gazette-mail	legal_affairs
1337	*0*	Gazette-mail	health, kanawha_valley
1338	*0*	Gazette-mail	football
1339	*0*	Gazette-mail	football
1340	*0*	Gazette-mail	derek_redd, wvu, wvamupdate
1341	*0*	Gazette-mail	wvu
1342	*0*	Gazette-mail	nan
1343	*0*	Gazette-mail	boys_soccer, girls_soccer
1344	*0*	Gazette-mail	daily_mail_features
1345	*0*	Gazette-mail	nan
1346	*0*	Gazette-mail	nan
1347	*0*	Gazette-mail	sports
1348	*0*	Gazette-mail	boys_soccer, girls_soccer
1349	*0*	Gazette-mail	education, kanawha_county
1350	*1

2529	*0*	Press Democrat	nan
2530	*0*	Press Democrat	nan
2531	*0*	Press Democrat	nan
2532	*0*	Press Democrat	nan
2533	*0*	Press Democrat	nan
2534	*0*	Press Democrat	nan
2535	*0*	Press Democrat	nan
2536	*0*	Press Democrat	nan
2537	*0*	Press Democrat	nan
2538	*0*	Press Democrat	nan
2539	*0*	Press Democrat	nan
2540	*0*	Press Democrat	nan
2541	*0*	Press Democrat	nan
2542	*0*	Press Democrat	nan
2543	*0*	Press Democrat	nan
2544	*0*	Press Democrat	nan
2545	*0*	Press Democrat	nan
2546	*0*	Press Democrat	nan
2547	*0*	Press Democrat	nan
2548	*0*	Press Democrat	nan
2549	*0*	Press Democrat	nan
2550	*0*	Press Democrat	nan
2551	*0*	Press Democrat	nan
2552	*0*	Press Democrat	nan
2553	*0*	Press Democrat	nan
2554	*0*	Press Democrat	nan
2555	*0*	Press Democrat	nan
2556	*1*	Press Democrat	nan
2557	*1*	Press Democrat	nan
2558	*1*	Press Democrat	nan
2559	*0*	Press Democrat	nan
2560	*0*	Press Democrat	nan
2561	*0*	Press Democrat	nan
2562	*0*	Press Democrat	nan
2563	*0*	Press Democrat	nan
2564	*0*	Press Democ

In [51]:
# Display the dev article text at position 440 (positional index into the
# underlying values array) for manual inspection.
dev.article_text.values[440]

' What a wonderful water year it has been. We have had water in the Kern River through Bakersfield since February and it is now October. What a beautiful difference this has made in the community as a whole. It has been an aesthetic and recreational paradise and has continuously recharged our groundwater for future use. My daily commute is along the Kern River running through our beautiful city so I have been able to observe and enjoy many of the sites and sounds made possible by this phenomenon. I have heard the whitewater rushing underneath me as I cross the pedestrian bicycle bridge at Riverwalk Park. On that same bridge, I have seen hundreds of families and couples taking a walk and soaking in the natural wonder of the surroundings. In the early mornings and evenings, I have shared with others the joys of the beautiful sunrise and sunsets across the water. In the midday summer sun, I have had the pleasure of watching families playing together in the water at Beach Park, Yokuts Park

In [54]:
# Display the dev article text at position 438 (positional index into the
# underlying values array) for manual inspection.
dev.article_text.values[438]

' I’ve been reading posts online about “those” homeless people and myths about them. They’re not bused here, they’re from here, and they’re our family. All homeless people aren’t drug addicts, criminals, mentally ill or lazy. They each have their own reason for being homeless and most want to work. We need affordable housing and people certified to do community health outreach work so they can truly connect with our homeless. I’m concerned about a plan to cite people with nowhere to go simply because we don’t want to see them and we’re more concerned with NIMBYisms, which include not in my neighborhood, business, city/county. We want citations/tickets/fines (which they cannot afford), court dates (which they cannot get to), drummed up reasons for tickets, arrests and a big push into a system of bail, attorney fees, understanding of court language and plea bargains? This is not a solution. Is there a problem? Yes. Something needs to be done, but this “plan” is not the way. Don’t crimina

In [45]:
# Flatten the tags of dev articles whose label is 1 into individual tags.
# Float entries (missing tags) are skipped; comma-separated strings are split,
# and when the string starts with '[' the outer brackets plus one wrapping
# character on each side of every piece are stripped as well.
cleaned_tags = []

for raw_tag, label in zip(dev.tag, dev_labels.label):
    if not (label == 1):
        continue
    if type(raw_tag) is float:
        continue
    if ',' not in raw_tag:
        cleaned_tags.append(raw_tag)
        continue
    if raw_tag[0] == '[':
        pieces = [piece.strip()[1:-1] for piece in raw_tag[1:-1].split(',')]
    else:
        pieces = [piece.strip() for piece in raw_tag.split(',')]
    cleaned_tags.extend(pieces)

len(cleaned_tags), Counter(cleaned_tags).most_common()

(693,
 [('opinion', 67),
  ('letters-to-editor', 27),
  ('op_ed_commentaries', 26),
  ('columnists', 20),
  ('editorial', 15),
  ('community-voices', 14),
  ('politics', 13),
  ('letters', 8),
  ('christianity', 7),
  ('gastronomy', 6),
  ('food', 6),
  ('crime', 6),
  ('economics', 6),
  ('worship', 5),
  ('police', 5),
  ('publishing', 4),
  ('military', 4),
  ('jesus', 4),
  ('cross', 4),
  ('neal', 4),
  ('god', 4),
  ('commerce', 4),
  ('op-eds', 4),
  ('our-view', 3),
  ('Afghan war', 3),
  ('Impeachment', 3),
  ('law', 3),
  ('american', 3),
  ('finance', 3),
  ('greta thunberg', 3),
  ('readers_vent', 3),
  ('news', 3),
  ('Afghan taliban', 2),
  ('vaping deaths', 2),
  ('corruption in Iraq', 2),
  ('air force', 2),
  ('aeronautics', 2),
  ('peter', 2),
  ('triumph', 2),
  ('holy cross', 2),
  ('meaning', 2),
  ('america', 2),
  ('criminal law', 2),
  ('bill maher', 2),
  ('dietetics', 2),
  ('talk show', 2),
  ('columnist', 2),
  ('comedian', 2),
  ('encouragement', 2),
  ('sh

In [42]:
# Build one list of tags per dev article (same order as dev.tag).
# Float entries (missing tags) become ['nan'] so positions stay aligned.
cleaned_tags_list = []
for raw_tag in dev.tag:
    if type(raw_tag) is float:
        article_tags = ['nan']
    elif ',' not in raw_tag:
        article_tags = [raw_tag]
    elif raw_tag[0] == '[':
        # Bracketed list: drop the brackets, then one wrapping character
        # on each side of every piece.
        article_tags = [piece.strip()[1:-1] for piece in raw_tag[1:-1].split(',')]
    else:
        article_tags = [piece.strip() for piece in raw_tag.split(',')]
    cleaned_tags_list.append(article_tags)

len(cleaned_tags_list)

3436

In [None]:
# Notes cell (never executed): tag counts copied from the tally of dev
# articles with label 1, grouped into the two tag families used below.
# Group 1 - letter-style tags.
('letters-to-editor', 27),  # 50 total
('community-voices', 14)
('letters', 8),
('letter_to_editor', 1),

# Group 2 - editorial / column-style tags.
('op_ed_commentaries', 26), # 70 total
('columnists', 20),
('editorial', 15),  
('op-eds', 4),
('our-view', 3),
('columnist', 2),

In [129]:
# Split the dev set into letter-style articles and candidate news articles
# from the same publishers, recording the publisher of each pick.
letterspub = ['Californian', 'Washington Observer Report', 'Gazette-mail']
letter_ids, letter_news_ids = [], []
news_pub_counter, let_pub_counter = [], []

letter_tags = ('letters-to-editor', 'community-voices', 'letters', 'letter_to_editor')
dev_rows = zip(cleaned_tags_list, dev_labels.label, dev.source_name)

for idx, (tags, label, pub) in enumerate(dev_rows):
    # An article counts once as a letter if any of its tags is a letter tag.
    if any(tag in letter_tags for tag in tags):
        letter_ids.append(idx)
        let_pub_counter.append(pub)

    # News counterpart: 'news'-tagged, label 0, from a letters publisher.
    if 'news' in tags and pub in letterspub and label == 0:
        letter_news_ids.append(idx)
        news_pub_counter.append(pub)

len(letter_ids), len(letter_news_ids), Counter(news_pub_counter), Counter(let_pub_counter)

(50,
 271,
 Counter({'Californian': 225,
          'Gazette-mail': 34,
          'Washington Observer Report': 12}),
 Counter({'Californian': 41,
          'Gazette-mail': 1,
          'Washington Observer Report': 8}))

In [138]:
# Split the dev set into editorial-style articles (label 1 only) and candidate
# news articles from the same publishers, recording the publisher of each pick.
edispub = ['Californian', 'Washington Observer Report', 'Enid News', 'Gazette-mail']
editorial_ids, editorial_news_ids = [], []
edi_news_pub_counter, edi_pub_counter = [], []

editorial_tags = ('op_ed_commentaries', 'columnists', 'editorial', 'op-eds', 'our-view', 'columnist')
dev_rows = zip(cleaned_tags_list, dev_labels.label, dev.source_name)

for idx, (tags, label, pub) in enumerate(dev_rows):
    # Counted once when the label is 1 and any tag is an editorial tag.
    if label == 1 and any(tag in editorial_tags for tag in tags):
        editorial_ids.append(idx)
        edi_pub_counter.append(pub)

    # News counterpart: 'news'-tagged, label 0, from an editorial publisher.
    if 'news' in tags and pub in edispub and label == 0:
        editorial_news_ids.append(idx)
        edi_news_pub_counter.append(pub)

len(editorial_ids), len(editorial_news_ids), Counter(edi_news_pub_counter), Counter(edi_pub_counter)

(70,
 272,
 Counter({'Californian': 225,
          'Enid News': 1,
          'Gazette-mail': 34,
          'Washington Observer Report': 12}),
 Counter({'Californian': 3,
          'Enid News': 2,
          'Gazette-mail': 61,
          'Washington Observer Report': 4}))

In [139]:
# Bundle the selected dev-set positions and persist them.
# NOTE(review): the slice sizes are hard-coded. 70 and 41 + 9 = 50 appear to
# mirror the editorial and letter set sizes printed in this notebook -
# confirm before reusing on different data.
dev_set_ids = {'editorial_ids' : editorial_ids,
               'editorial_news_ids' : editorial_news_ids[-70:],
               'letter_ids' : letter_ids,
               'letter_news_ids' : letter_news_ids[:41]+letter_news_ids[-9:]}
# `with` guarantees the pickle file is flushed and closed.
with open('../pkl/dev_set_ids_edi_let.p', 'wb') as handle:
    pickle.dump(dev_set_ids, handle)

## Test

In [75]:
# Load metro_winnipeg.csv (from the extra test set folder) into a DataFrame.
# NOTE(review): absolute, machine-specific path.
winnipeg = pd.read_csv('/Users/tariq/Downloads/Bloomberg_Editorial_Classifier/data/collected/metro winnipeg - extra test/metro_winnipeg.csv')

In [109]:
# Two opinion groups: (1) editorial / guest / oped, (2) other.
# Tag tally of the Winnipeg data, kept for reference:
'''Counter({'regular': 1387,
         'editorial': 148,
         'other': 144,
         'guest': 124,
         'oped': 2})'''

# Caps on how many label-0 articles are paired with each opinion group.
other_limit = 144
editorial_limit = 274
other_count = 0
editorial_count = 0

editorial_ids, news_editorial_ids = [], []
other_ids, news_other_ids = [], []

for i, (tag, label) in enumerate(zip(winnipeg.Editorial, test.label)):

    # Opinion groups; both are expected to carry label 1.
    if tag in ('editorial', 'guest', 'oped'):
        assert label == 1
        editorial_ids.append(i)
    elif tag == 'other':
        assert label == 1
        other_ids.append(i)

    if not (label == 0):
        continue

    # Label-0 articles fill both news lists independently, in file order,
    # until each list reaches its cap.
    if editorial_count < editorial_limit:
        news_editorial_ids.append(i)
        editorial_count += 1

    if other_count < other_limit:
        news_other_ids.append(i)
        other_count += 1

len(editorial_ids), len(news_editorial_ids), len(other_ids), len(news_other_ids)

(274, 274, 144, 144)

In [110]:
# Bundle the selected test-set positions and persist them.
test_set_ids = {'editorial_ids' : editorial_ids,
               'editorial_news_ids' : news_editorial_ids,
               'other_ids' : other_ids,
               'other_news_ids' : news_other_ids}
# `with` guarantees the pickle file is flushed and closed.
with open('../pkl/test_set_ids_edi_let.p', 'wb') as handle:
    pickle.dump(test_set_ids, handle)

In [98]:
# Sanity check: editorial (148) + guest (124) + oped (2) from the tag tally
# adds up to the 274 used as editorial_limit.
148+124+2

274