In [4]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize

In [5]:
# Read the article dump and discard the column literally named 'Unnamed: 0'
# (presumably a row index written out by an earlier to_csv — TODO confirm).
laWash = (
    pd.read_csv("LaTimesAndWashingtonPostArticles.csv")
    .drop(columns=['Unnamed: 0'])
)
laWash

Unnamed: 0,text,num_sent
0,Now that there are new nutrition labels o...,32
1,"YOUNTVILLE, Calif. &MD Dawnine Dyer's f...",50
2,"YOUNTVILLE, Calif. &MD One of the few w...",17
3,This lime-flavored shrimp and vegetable s...,21
4,"The premiere issue of Saveur, the America...",13
...,...,...
95737,WASHINGTON &MD Asserting that domestic ...,24
95738,"PLYMOUTH, Montserrat &MD Dunes of fine ...",39
95739,"PLYMOUTH, Montserrat &MD In Montserrat'...",55
95740,WASHINGTON &MD A steady rise in births ...,25


In [6]:
# Split every article's text into a list of sentences; the result is an
# array holding one sentence list per article.
laWashArticles = (
    laWash['text']
    .astype('string')
    .apply(sent_tokenize)
    .to_numpy()
)

In [7]:
# Keep only articles that tokenized into 11 or 12 sentences, then cap the
# selection at the first 2000 of them.
shorter_reports = [
    article for article in laWashArticles if len(article) in (11, 12)
][:2000]
len(shorter_reports)

2000

In [8]:
import random

# Seed the module-level RNG so the shuffles below are reproducible.
random.seed(487)
num_permutations_per_report = 20
# Hard cap on shuffle attempts per report, so a report with too few distinct
# orderings (very short, or built from repeated sentences) cannot spin forever.
max_shuffle_attempts = 50 * num_permutations_per_report

# Each entry is (shuffled sentence list, index of the source report in
# shorter_reports).
permutations = []
for i, report in enumerate(shorter_reports):
    # Orderings already taken for this report. It starts with the original
    # order so a shuffled sample is never identical to the unshuffled report.
    perms_of_curr_report = [report]
    attempts = 0
    # Retry when a shuffle collides with an earlier one instead of silently
    # ending up with fewer than num_permutations_per_report samples. When no
    # collision happens this consumes the RNG exactly like a fixed-count loop.
    while (len(perms_of_curr_report) <= num_permutations_per_report
           and attempts < max_shuffle_attempts):
        attempts += 1
        shuffled = report.copy()
        random.shuffle(shuffled)
        if shuffled not in perms_of_curr_report:
            permutations.append((shuffled, i))
            perms_of_curr_report.append(shuffled)
    if len(perms_of_curr_report) <= num_permutations_per_report:
        # Surface the shortfall rather than leaving an unbalanced dataset unnoticed.
        print(f"report {i}: only {len(perms_of_curr_report) - 1} distinct "
              f"permutations found after {attempts} attempts")

print(permutations[0])
print(shorter_reports[0])

(["Norman's model is more complex, computing how (in one theory) cosmic gas would be heated as it was pulled into the gravitational ``wells'' created by dark matter at almost 190 miles per second, creating shock waves that superheat intergalactic gases.", 'The Connection Machine-5 supercomputer took 30 hours to build this universe in a box.', 'The new model agrees so well with actual observations, he said, that it may guide astronomers toward new discoveries.', 'An astrophysics team at the University of Illinois used a massively parallel supercomputer to visualize the universe as it would be seen in X-rays, which are emitted by superhot gases or in violent events.', 'In the last decade, many astronomers came to believe that at least 98 percent of the universe consists of dark matter: invisible material of an unknown nature, detected only through the effects of its gravity.', 'Eventually, a New Mexico State University model showed that a mix of cold (low-energy) and hot could explain th

In [9]:
# Sanity check: the total number of shuffles should equal
# num_permutations_per_report for every report. Uses the constant (not a
# literal) and an integer product, so the check follows the configured value.
len(permutations) == len(shorter_reports) * num_permutations_per_report

True

In [14]:
# Collapse each original report's sentence list into one space-separated
# string; its id is simply its position in shorter_reports.
joined_reports = [' '.join(sentences) for sentences in shorter_reports]
joined_reports_ids = list(range(len(shorter_reports)))

# Same for the shuffled versions; each one carries the index of the report
# it was derived from.
joined_perms = [' '.join(sentences) for sentences, source_idx in permutations]
joined_perms_ids = [source_idx for sentences, source_idx in permutations]

print(joined_reports[0])
print(joined_perms[0])

     A new supercomputer model of the universe  &MD  the most complex such simulation ever created  &MD  lends credence to the theory that the original recipe for the real cosmos probably included both cold and hot versions of the mysterious ingredient known as dark matter. An astrophysics team at the University of Illinois used a massively parallel supercomputer to visualize the universe as it would be seen in X-rays, which are emitted by superhot gases or in violent events. ``Our simulation is the first that is sufficiently comprehensive to make theoretical predictions that can be compared with observations,'' said team leader Michael Norman, who presented the work at a meeting of the American Astronomical Society last week in Minneapolis. The new model agrees so well with actual observations, he said, that it may guide astronomers toward new discoveries. In the last decade, many astronomers came to believe that at least 98 percent of the universe consists of dark matter: invisible m

In [19]:
# Original reports come first (label 1), followed by the shuffled versions
# (label 0); 'id' is the index of the source report for every row.
all_paragraphs = joined_reports + joined_perms
coherence_labels = [1] * len(joined_reports) + [0] * len(joined_perms)
source_report_ids = joined_reports_ids + joined_perms_ids

pars_with_label = pd.DataFrame({
    'paragraph': all_paragraphs,
    'is_coherent': coherence_labels,
    'id': source_report_ids,
})
pars_with_label

Unnamed: 0,paragraph,is_coherent,id
0,A new supercomputer model of the universe...,1,0
1,1. ``Mrs. Doubtfire'' 2. ``A Perfect ...,1,1
2,Five key U.S. maritime unions formally ag...,1,2
3,OUR HOURLY BREAD: Bread doesn't demand mu...,1,3
4,"LYON, France &MD Much of the criticism ...",1,4
...,...,...,...
41995,"Before he can leave the United States, Abu Mar...",0,1999
41996,State Department spokesman Nicholas Burns said...,0,1999
41997,``The decision was taken on the basis of overa...,0,1999
41998,An INS spokesman said the agency's case to exc...,0,1999


In [20]:
# Re-tokenize each joined paragraph and record how many sentences it yields.
pars_with_label['num_sent'] = (
    pars_with_label['paragraph']
    .apply(sent_tokenize)
    .apply(len)
)
pars_with_label

Unnamed: 0,paragraph,is_coherent,id,num_sent
0,A new supercomputer model of the universe...,1,0,12
1,1. ``Mrs. Doubtfire'' 2. ``A Perfect ...,1,1,11
2,Five key U.S. maritime unions formally ag...,1,2,12
3,OUR HOURLY BREAD: Bread doesn't demand mu...,1,3,12
4,"LYON, France &MD Much of the criticism ...",1,4,12
...,...,...,...,...
41995,"Before he can leave the United States, Abu Mar...",0,1999,11
41996,State Department spokesman Nicholas Burns said...,0,1999,11
41997,``The decision was taken on the basis of overa...,0,1999,11
41998,An INS spokesman said the agency's case to exc...,0,1999,11


In [21]:
# Write the labelled paragraphs to disk, leaving the row index out of the file.
pars_with_label.to_csv(path_or_buf='LATimesWashPostPerms.csv', index=False)