### Process Overview:
- Input: sampled_100.json 

- Output: preprocessed_100.json

##### Text Preprocessing Steps: 
- Lowercasing: Convert all text to lowercase to ensure consistency 

- Tokenization: Split the text into individual words or tokens.  

- Removing Punctuation: Eliminate punctuation marks  

- Removing Stopwords: Remove common words like "the", "is"   

- Stemming or Lemmatization: Reduce words to their root form  

- Removing Numbers and Special Characters  

- Removing Extra Whitespaces 

##### Tokenization Using BertTokenizer (for BERT Vectorization)

In [1]:
import json
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import BertTokenizer

def preprocess_corpus(input_file_path, output_file_path):
    """Clean every story of a JSON corpus using BertTokenizer tokenization.

    The input file is read as JSON and iterated; each item must provide a
    'story_text' entry. The output file receives a JSON list of strings,
    one cleaned text per story, in input order.

    Args:
        input_file_path: Path of the UTF-8 JSON file to read.
        output_file_path: Path of the UTF-8 JSON file to write.

    Raises:
        KeyError: If a story has no 'story_text' key.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # NLTK's English stopwords plus a few extra words to discard.
    stop_words = set(stopwords.words('english'))
    stop_words.update({'said', 'could', 'might', 'like', 'also', 'would',
                       'abstract', 'copyright'})

    # Use a real set: `token in string.punctuation` would be a substring test
    # against a str. Multi-character symbol tokens that the substring test
    # used to catch are still discarded by the isalpha() filter below.
    punctuation = set(string.punctuation)

    lemmatizer = WordNetLemmatizer()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def preprocess_text(text):
        """Return `text` lowercased, tokenized, filtered and lemmatized."""
        tokens = tokenizer.tokenize(text.lower())

        # Drop punctuation tokens and the '##'-prefixed pieces emitted by
        # the BERT tokenizer.
        # NOTE(review): discarding '##' pieces keeps only the leading piece
        # of any word the tokenizer split -- confirm this is intended.
        tokens = [tok for tok in tokens
                  if tok not in punctuation and not tok.startswith('##')]

        # Drop stopwords and single-character tokens.
        tokens = [tok for tok in tokens
                  if tok not in stop_words and len(tok) > 1]

        tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

        # Keep purely alphabetic tokens only (removes numbers and symbols).
        tokens = [tok for tok in tokens if tok.isalpha()]

        # Joining with single spaces also normalizes whitespace.
        return ' '.join(tokens)

    # Only the cleaned text is retained; other story fields are discarded.
    preprocessed_data = [preprocess_text(story['story_text'])
                         for story in data]

    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(preprocessed_data, f, ensure_ascii=False, indent=4)

# Run the BertTokenizer-based preprocessing: sampled_500.json -> tokenizedBERT_500.json
input_file_path, output_file_path = 'sampled_500.json', 'tokenizedBERT_500.json'
preprocess_corpus(input_file_path, output_file_path)


  from .autonotebook import tqdm as notebook_tqdm


##### Adding Story_ID for Documents

In [2]:
import json

def strip_metainfo(input_file_path, output_file_path):
    """Strip the leading meta info from each story and attach an ID.

    Reads a JSON list of story strings, keeps only the text that follows
    the first standalone word 'body' in each story, and writes a JSON list
    of {"ID": ..., "Text": ...} objects.

    IDs are the 1-based position of the story in the input list. Stories
    that do not contain the marker are skipped, so IDs may have gaps.

    Args:
        input_file_path: Path of the UTF-8 JSON file to read.
        output_file_path: Path of the UTF-8 JSON file to write.
    """
    import re  # local import so the function does not rely on other cells

    # Match 'body' only as a whole word; a plain substring search would also
    # hit words such as 'somebody' and cut the story at the wrong place.
    marker = re.compile(r'\bbody\b')

    # Explicit UTF-8: the input is produced with ensure_ascii=False, so the
    # platform default encoding may not be able to decode it.
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        stories = json.load(infile)

    stripped_stories = []
    for idx, story in enumerate(stories, start=1):
        match = marker.search(story)
        if match is None:
            # No marker found: the story is left out of the output.
            continue
        # Keep everything after the marker, without surrounding whitespace.
        stripped_stories.append(
            {"ID": idx, "Text": story[match.end():].strip()}
        )

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(stripped_stories, outfile, indent=2)

# Strip meta info and add IDs: tokenizedBERT_500.json -> tokenizedBERT_ID_500.json
input_file_path, output_file_path = 'tokenizedBERT_500.json', 'tokenizedBERT_ID_500.json'
strip_metainfo(input_file_path, output_file_path)

##### Tokenization Using Ordinary Tokenizer (for LDA Topic Modeling)

In [3]:
import json
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

def preprocess_corpus(input_file_path, output_file_path):
    """Clean every story of a JSON corpus using NLTK word tokenization.

    The input file is read as JSON and iterated; each item must provide a
    'story_text' entry. The output file receives a JSON list of strings,
    one cleaned text per story, in input order.

    Args:
        input_file_path: Path of the UTF-8 JSON file to read.
        output_file_path: Path of the UTF-8 JSON file to write.

    Raises:
        KeyError: If a story has no 'story_text' key.
    """
    with open(input_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # NLTK's English stopwords plus a few extra words to discard.
    stop_words = set(stopwords.words('english'))
    stop_words.update({'said', 'could', 'might', 'like', 'also', 'would'})

    # Use a real set: `token in string.punctuation` would be a substring test
    # against a str. Multi-character symbol tokens that the substring test
    # used to catch are still discarded by the isalpha() filter below.
    punctuation = set(string.punctuation)

    lemmatizer = WordNetLemmatizer()

    def preprocess_text(text):
        """Return `text` lowercased, tokenized, filtered and lemmatized."""
        tokens = word_tokenize(text.lower())

        # Drop single punctuation characters.
        tokens = [tok for tok in tokens if tok not in punctuation]

        # Drop stopwords and single-character tokens.
        tokens = [tok for tok in tokens
                  if tok not in stop_words and len(tok) > 1]

        tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

        # Keep purely alphabetic tokens only (removes numbers and symbols).
        tokens = [tok for tok in tokens if tok.isalpha()]

        # Joining with single spaces also normalizes whitespace.
        return ' '.join(tokens)

    # Only the cleaned text is retained; other story fields are discarded.
    preprocessed_data = [preprocess_text(story['story_text'])
                         for story in data]

    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(preprocessed_data, f, ensure_ascii=False, indent=4)

# Run the NLTK-based preprocessing: sampled_500.json -> tokenizedRawORD.json
input_file_path, output_file_path = 'sampled_500.json', 'tokenizedRawORD.json'
preprocess_corpus(input_file_path, output_file_path)


##### Clean Tokenizing by Removing MetaInfo

In [10]:
import json

def strip_metainfo(input_file_path, output_file_path):
    """Strip the leading meta info from each story.

    Reads a JSON list of story strings, keeps only the text that follows
    the first standalone word 'body' in each story, and writes the results
    as a JSON list of strings. Stories that do not contain the marker are
    left out of the output.

    Args:
        input_file_path: Path of the UTF-8 JSON file to read.
        output_file_path: Path of the UTF-8 JSON file to write.
    """
    import re  # local import so the function does not rely on other cells

    # Match 'body' only as a whole word; a plain substring search would also
    # hit words such as 'somebody' and cut the story at the wrong place.
    marker = re.compile(r'\bbody\b')

    # Explicit UTF-8: the preprocessing step writes its JSON with
    # ensure_ascii=False, so the platform default encoding may not decode it.
    with open(input_file_path, 'r', encoding='utf-8') as infile:
        stories = json.load(infile)

    stripped_stories = []
    for story in stories:
        match = marker.search(story)
        if match is None:
            # No marker found: the story is left out of the output.
            continue
        # Keep the content AFTER the marker, without surrounding whitespace.
        stripped_stories.append(story[match.end():].strip())

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(stripped_stories, outfile, indent=2)

# Strip meta info: XXXRaw.json -> YYYFine.json
input_file_path, output_file_path = 'XXXRaw.json', 'YYYFine.json'
strip_metainfo(input_file_path, output_file_path)
