# Getting and Cleaning Data

--------

In [7]:
import glob
import os
import zipfile
from os.path import isfile, isdir, join
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
from tqdm import tqdm

%matplotlib inline

### Download the data set

In [2]:
# Name of the corpus; the extracted archive is listed under this sub-directory.
dataset_name = 'SentenceCorpus'
# Local file name under which the downloaded zip archive is stored.
dataset_filename = 'SentenceCorpus.zip'
# Directory into which the archive is extracted.
dataset_folder_path = 'data'

class DLProgress(tqdm):
    """tqdm progress bar whose ``hook`` method can serve as ``urlretrieve``'s report hook.

    ``urlretrieve`` reports the cumulative number of blocks transferred, while
    ``tqdm.update`` expects an increment, so the bar remembers the previous
    block count and advances by the difference.
    """

    # Block count seen on the previous ``hook`` call (0 before the first call).
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """Advance the bar to reflect ``block_num`` blocks of ``block_size`` each.

        Args:
            block_num: Cumulative number of blocks transferred so far.
            block_size: Size of one block, in the bar's unit.
            total_size: Total size of the transfer. ``None`` or a negative
                value (``urlretrieve`` passes -1 when the server does not
                report a size) means "unknown"; the bar's current total is
                then left untouched instead of being overwritten.
        """
        if total_size is not None and total_size >= 0:
            self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

# Download the archive only when it is not already on disk. The transfer goes
# to a temporary ".part" file that is renamed once complete, so an interrupted
# download never leaves a truncated zip that later runs would mistake for a
# finished one.
if not isfile(dataset_filename):
    partial_filename = dataset_filename + '.part'
    with DLProgress(unit='B', unit_scale=True, miniters=1) as pbar:
        urlretrieve(
            'https://archive.ics.uci.edu/ml/machine-learning-databases/00311/SentenceCorpus.zip',
            partial_filename,
            pbar.hook)
    os.replace(partial_filename, dataset_filename)

# Test for the corpus directory itself rather than its parent: a pre-existing
# but unrelated (or empty) data folder must not cause extraction to be skipped.
extracted_path = join(dataset_folder_path, dataset_name)
if not isdir(extracted_path):
    with zipfile.ZipFile(dataset_filename) as zip_ref:
        zip_ref.extractall(dataset_folder_path)

# Display the top-level contents of the extracted corpus.
os.listdir(extracted_path)

['.DS_Store',
 'Instructions_for_SentenceAnnotation.pdf',
 'labeled_articles',
 'README',
 'unlabeled_articles',
 'word_lists']

### Reading Data

In [3]:
#os.listdir('data/SentenceCorpus/labeled_articles/')

In [4]:
# Collect the labeled article files. glob returns matches in arbitrary,
# filesystem-dependent order, and the row order of the table built later
# follows this list, so sort it to make the result reproducible across runs.
file_list = sorted(glob.glob("data/SentenceCorpus/labeled_articles/*.txt"))
len(file_list)

90

#### Prepare lists of labels and texts

In [5]:
def _parse_labeled_line(line):
    """Split one line of a labeled article into ``(label, sentence)``.

    The label is taken from the first four characters and the sentence from
    the sixth character onward (one separator character sits in between).

    Args:
        line: A raw line as read from the file, possibly ending in a newline.

    Returns:
        A ``(label, sentence)`` tuple with the trailing newline removed from
        the sentence, or ``None`` for lines that carry no sentence: blank
        lines and lines starting with ``#``.
    """
    content = line.rstrip('\n')
    # Blank lines would otherwise yield a bogus label; '#' lines are skipped
    # exactly as before.
    if not content.strip() or content[0] == '#':
        return None
    return content[0:4], content[5:]


# Parallel lists: label[i] is the label of the sentence text[i].
label = []
text = []
for path in file_list:
    with open(path) as f:
        for line in f:
            parsed = _parse_labeled_line(line)
            if parsed is not None:
                label.append(parsed[0])
                text.append(parsed[1])

#### Prepare DataFrame

In [8]:
# Build the table directly from the two parallel lists. Going through
# np.array([label, text]) would first materialise a fixed-width unicode array
# in which every cell is padded to the longest sentence, only to transpose it.
Doc = pd.DataFrame({'Label': label, 'Text': text}, columns=['Label', 'Text'])
Doc.shape

(3117, 2)

In [9]:
# Preview the first rows of the table as a quick sanity check of the parsed
# labels and sentences.
Doc.head()

Unnamed: 0,Label,Text
0,MISC,The Minimum Description Length principle for o...
1,MISC,"If the underlying model class is discrete, the..."
2,MISC,"For MDL, in general one can only have loss bou..."
3,AIMX,We show that this is even the case if the mode...
4,OWNX,We derive a new upper bound on the prediction ...


#### Save Data

In [19]:
# Persist the table to data.csv in the current working directory. No `index`
# argument is passed, so pandas' default applies and the row index is written
# as the first, unnamed column; readers of this file should use `index_col=0`
# or drop that column.
Doc.to_csv("data.csv")