# Pre-process the 20 Newsgroups dataset

##### Download the Original 20 Newsgroups data set from : http://qwone.com/~jason/20Newsgroups/


In [3]:
import os
import numpy as np
import pandas as pd

### Get the paths of all files in the 20 Newsgroups dataset

In [12]:
files = []
dataset_directory = os.path.join (os.getcwd (), "dataset_20news")
def get_files (directory = None) :
    """Collect the path of every file below the dataset directory into `files`.

    The module-level `files` list is emptied and refilled in place, so calling
    this function more than once (for example by re-running the cell that
    calls it) never leaves duplicate paths behind.

    Parameters
    ----------
    directory : str, optional
        Root directory to walk. Defaults to `dataset_directory`.

    Returns
    -------
    list of str
        The module-level `files` list, holding `root` joined with each file
        name found by `os.walk`, excluding anything whose directory path
        contains ".ipynb_checkpoints".
    """
    if directory is None :
        directory = dataset_directory

    files.clear () # Start from scratch so repeated calls do not accumulate paths
    for (root, dirs, names) in os.walk (directory) :
        # Sorting `dirs` in place fixes the order in which os.walk descends, and
        # sorting the names fixes the order inside a directory, so the position
        # of a file in `files` does not depend on how the OS lists entries.
        dirs.sort ()
        if ".ipynb_checkpoints" in root : # Skip Jupyter checkpoint folders
            continue
        for name in sorted (names) :
            files.append (os.path.join (root, name))
    return files

In [13]:
# Fill `files`, then show three example paths and the overall count
get_files ()
sample_paths = files[2000:2003]
print (sample_paths)
print ("Total no. of files : ", len (files))

['D:\\CS Projects\\Plagiarism-Detection\\dataset_20news\\20_newsgroups\\comp.os.ms-windows.misc\\10000', 'D:\\CS Projects\\Plagiarism-Detection\\dataset_20news\\20_newsgroups\\comp.os.ms-windows.misc\\10001', 'D:\\CS Projects\\Plagiarism-Detection\\dataset_20news\\20_newsgroups\\comp.os.ms-windows.misc\\10002']
Total no. of files :  19997


### Get the raw text in all the files

In [14]:
files_text = []
for file in files :
    # Every entry in `files` is already a complete path (it is built from
    # `dataset_directory`, which starts at os.getcwd ()), so it is opened
    # directly instead of being joined onto another directory name.
    # NOTE(review): cp437 is presumably used because it can decode any byte
    # sequence without raising - confirm it is the intended encoding.
    with open (file, 'r', encoding = "cp437") as handle : # `with` closes the handle after each read
        files_text.append (handle.read ())

In [15]:
# Sanity check : one text was appended per entry in `files`, so the counts should match
print (len (files_text))

19997


### Perform Pre-processing on the data
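```
1. Remove Stopwords
2. Remove any special characters
3. Convert the sentence to Lowercase
4. Tokenize the sentence
5. Perform Stemming
6. Remove words that contain only digits
7. Drop sentences that are left with fewer than 4 words
```
Note: the code applies these steps in the order 2 → 3 → 4 → (1 and 6) → 5 → 7.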

In [18]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer

# Resources needed by word_tokenize and the stopword list.
# Recent NLTK releases load the tokenizer model from 'punkt_tab' rather than
# 'punkt'; both are requested so the notebook runs on old and new versions.
nltk.download ('punkt')
nltk.download ('punkt_tab')
nltk.download ('stopwords')
lancaster = LancasterStemmer() # Shared stemmer instance used by pre_process

_SPECIAL_CHARACTERS = re.compile (r'[^\w\s]') # Anything that is neither a word character nor whitespace
_en_stops_cache = None

def _english_stopwords () :
    """Return the NLTK English stopwords as a set, loading them on first use only."""
    global _en_stops_cache
    if _en_stops_cache is None :
        _en_stops_cache = frozenset (stopwords.words ('english'))
    return _en_stops_cache

def pre_process (_sentences, min_tokens = 4) :
    """Clean and tokenize raw sentences, keeping only the sufficiently long ones.

    For every sentence : special characters are removed, the text is
    lowercased and tokenized, stopwords and digit-only tokens are dropped,
    and the remaining tokens are stemmed with the Lancaster stemmer.

    Parameters
    ----------
    _sentences : iterable of str
        Raw sentences to process.
    min_tokens : int, optional
        Minimum number of tokens a processed sentence must contain to be
        kept. The default of 4 matches the original `len (...) > 3` rule.

    Returns
    -------
    list of list of str
        One list of stemmed tokens per retained sentence, in input order.
    """
    en_stops = _english_stopwords () # Remove Stopwords (loaded once, reused across calls)
    processed_sentences = []

    for sentence in _sentences :
        sentence = _SPECIAL_CHARACTERS.sub ('', sentence) # Remove special characters
        lower_sen = sentence.lower () # Convert the sentence to Lowercase
        token_sen = word_tokenize (lower_sen) # Tokenize the sentence

        # NOTE(review): punctuation is stripped before the stopword check, so a
        # contraction such as "don't" reaches this point as "dont" and is
        # presumably not matched by the stopword list - confirm this is intended.
        new_sentence = [
            lancaster.stem (word) # Perform Stemming
            for word in token_sen
            if word not in en_stops and not word.isnumeric () # Remove stopwords and digit-only words
        ]

        if len (new_sentence) >= min_tokens : # Discard sentences that became too short
            processed_sentences.append (new_sentence)
    return processed_sentences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shraeyas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shraeyas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# One entry per file : the raw text is cut at every "." and the resulting
# pieces are handed to pre_process as that file's sentences
documents = [pre_process (file_text.split (".")) for file_text in files_text]

### Create a final Dataframe for the pre-processed documents
##### The dataframe structure will be of the format :
```
document_id | sentences
---------------------------------------------------
 0          | sentence as space-separated stemmed words

```

In [21]:
# Flatten the documents into [document index, sentence text] rows; the tokens
# of each sentence are joined back together with single spaces
document_df = [
    [doc_id, " ".join (tokens)]
    for doc_id, doc_sentences in enumerate (documents)
    for tokens in doc_sentences
]

In [22]:
# Turn the list of rows into a DataFrame; note that this rebinds `document_df` from a list to a DataFrame
document_df = pd.DataFrame (document_df, columns = ["document_id", "sentences"])

In [23]:
# Preview the first rows of the processed dataset
document_df.head ()

Unnamed: 0,document_id,sentences
0,0,ath faq ath resourc sum book address mus anyth...
1,0,uk dat mon mar gmt expir thu apr gmt followupt...
2,0,ath distribut world org mant consult cambridg
3,0,uk lin archivenam atheismresourc altatheismarc...
4,0,ath resourc address ath org us freedom relig f...


### Save the final Processed Dataset to disk as a csv

In [24]:
# to_csv does not create missing folders, so make sure the output directory
# exists first (exist_ok keeps re-runs from failing when it is already there)
output_directory = "dataset_processed"
os.makedirs (output_directory, exist_ok = True)
document_df.to_csv (os.path.join (output_directory, "documents_processed.csv"), index = False)