In [3]:
import re
import json

def preprocess_text(text):
    """Normalise raw text into a lowercase, single-space-separated token string.

    Steps, in order: lowercase, strip URLs, delete every character that is
    not an ASCII letter, digit or whitespace, then discard tokens that are a
    single character long or made up entirely of digits.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    # Lowercase the text
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove non-alphanumeric characters except spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Drop one-character words and all-digit tokens in the same pass.
    # Filtering numbers at token level (instead of substituting them out of
    # the already re-joined string) avoids leaving double, leading or
    # trailing spaces where a number used to be.
    tokens = [word for word in text.split() if len(word) > 1 and not word.isdigit()]
    return ' '.join(tokens)

def read_jsonl(file_path):
    """Read a JSON Lines file and return the preprocessed 'content' of each record.

    Args:
        file_path: Path to a file holding one JSON object per line; every
            object must provide a 'content' key.

    Returns:
        A list with one preprocess_text() result per non-blank line, in
        file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no 'content' key.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail on an empty string.
            if not line:
                continue
            # Load each line as a JSON object
            json_object = json.loads(line)
            data.append(preprocess_text(json_object['content']))
    return data

# Location of the JSON Lines corpus, given relative to the working directory.
file_path = 'signal-news1/signal-news1.jsonl'
# Load the corpus: one preprocessed 'content' string per record.
jsonl_data = read_jsonl(file_path)

# Show the first ten preprocessed documents as a quick sanity check.
jsonl_data[:10]

['tim ernst grew up in williamsville playing video games like mega man and the legend of zelda on his console and ultima and warcraft read tim ernst grew up in williamsville playing video games like mega man and the legend of zelda on his console and ultima and warcraft on his computer loved them they were something just saw so much potential in ernst said it was always heartbreaking to play game that was terrible and if you played one that was amazing it was just an unbelievable experience today he gets to play and design and develop video games for living ernst  is general manager of kabam games where he oversees about  employees in the companys san francisco development studio hes playing leading role in the companys latest mobile game star wars uprising which came out thursday and is tied to the eagerly awaited next installment in the star wars movie franchise set for release in december kabam has found niche in making interactive roleplaying games for smartphones and tablets the g

In [11]:
# Persist the preprocessed corpus, one document per line.
with open('preprocessed.txt', 'w') as file:
    file.writelines(line + '\n' for line in jsonl_data)

In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages this notebook relies on.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

def get_wordnet_pos(treebank_tag):
    """Map a treebank-style POS tag to a one-letter WordNet POS code.

    The tag is classified by its leading letter: 'J' -> 'a' (adjective),
    'V' -> 'v' (verb), 'N' -> 'n' (noun), 'R' -> 'r' (adverb). Any other
    tag falls back to 'n' (noun).

    Args:
        treebank_tag: POS tag string, e.g. 'JJ' or 'VBD'.

    Returns:
        One of 'a', 'v', 'n' or 'r'.
    """
    prefix_to_pos = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    for prefix, wordnet_pos in prefix_to_pos.items():
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Default to noun when the tag matches none of the prefixes.
    return 'n'

def lemmatize_text_with_pos(text):
    """Lemmatize every token of `text`, using its POS tag to pick the lemma.

    The text is tokenized with word_tokenize and tagged with pos_tag; each
    token is then passed to WordNetLemmatizer.lemmatize with the POS code
    returned by get_wordnet_pos for its tag.

    Args:
        text: Input string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in pos_tag(word_tokenize(text)):
        lemmas.append(lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag)))
    return ' '.join(lemmas)

# Lemmatize every preprocessed document, keeping the original order.
lemmatized_data_with_pos = list(map(lemmatize_text_with_pos, jsonl_data))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sleepyard\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sleepyard\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sleepyard\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Show the first ten lemmatized documents as a quick sanity check.
lemmatized_data_with_pos[:10]

['tim ernst grow up in williamsville play video game like mega man and the legend of zelda on his console and ultima and warcraft read tim ernst grow up in williamsville play video game like mega man and the legend of zelda on his console and ultima and warcraft on his computer love them they be something just saw so much potential in ernst say it be always heartbreaking to play game that be terrible and if you play one that be amaze it be just an unbelievable experience today he get to play and design and develop video game for living ernst be general manager of kabam game where he oversee about employee in the companys san francisco development studio he play lead role in the company late mobile game star war uprise which come out thursday and be tie to the eagerly await next installment in the star war movie franchise set for release in december kabam have find niche in make interactive roleplaying game for smartphones and tablets the game be free to download and play but offer user

In [10]:
# Persist the lemmatized corpus, one document per line.
with open('lemmatized.txt', 'w') as file:
    file.writelines(line + '\n' for line in lemmatized_data_with_pos)