In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Download required NLTK resources.
# Recent NLTK releases make word_tokenize load the 'punkt_tab' tables instead of
# the pickled 'punkt' model, so both are fetched to keep the notebook runnable
# on a fresh kernel regardless of the installed NLTK version. Requesting a
# resource that the installed version does not know is reported by
# nltk.download but does not stop the cell.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Define file path and chunk size
file_path = "995,000_rows.csv"  # raw input CSV, read relative to the working directory
chunksize = 25000               # rows per chunk handed to pd.read_csv

# Define your cleaning function
def clean_text(data):
    """Normalise a raw text cell and replace volatile values with placeholder tags.

    The text is lower-cased, runs of whitespace are collapsed to a single
    space, and URLs, e-mail addresses, dates and numbers are replaced by
    ``<URL>``, ``<EMAIL>``, ``<DATE>`` and ``<NUM>`` respectively.

    Parameters
    ----------
    data : Any
        Cell value. Anything that is not a ``str`` (NaN, numbers, None)
        is treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(data, str):  # Handle NaN values safely
        return ""

    month_names = (
        r'jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|'
        r'aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?'
    )

    data = data.lower()
    data = re.sub(r'\s+', " ", data)

    # URLs and e-mail addresses go first: they may contain digit groups that
    # the date/number rules would otherwise rewrite and break apart.
    data = re.sub(r'http[s]?://[^\s]+', "<URL>", data)
    data = re.sub(r'[\w._%+-]+@[\w.-]+\.[a-zA-Z]{2,}', "<EMAIL>", data)

    # ISO dates must be handled before the generic d/m/y rule; otherwise
    # "2020-01-15" is matched as "20-01-15" and leaves a stray "20" behind.
    data = re.sub(r'\d{4}-\d{2}-\d{2}', "<DATE>", data)
    data = re.sub(r'\d{1,2}[./-]\d{1,2}[./-]\d{2,4}', "<DATE>", data)

    # Month-name dates: abbreviated or full month, optional literal dot.
    # \b keeps words that merely end in a month abbreviation ("dismay") out.
    data = re.sub(r'\b(?:' + month_names + r')\.? \d{1,2},? \d{4}', "<DATE>", data)

    # Whatever digits remain are plain numbers (integers or decimals).
    data = re.sub(r'\d+(\.\d+)?', "<NUM>", data)
    return data

# Shared NLP resources: the English stopword set and one Porter stemmer instance
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Tokenize a text, drop non-alphabetic tokens and stopwords, then stem the rest
def tokenize_and_stem(text):
    """Return the stemmed, stopword-free alphabetic tokens of ``text``.

    The text is split with ``word_tokenize``; a token is kept only if it is
    purely alphabetic and not in ``stop_words``, and each kept token is
    reduced with the Porter stemmer ``ps``. Token order is preserved.
    """
    stems = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue  # punctuation, digits and mixed tokens are discarded
        if token in stop_words:
            continue
        stems.append(ps.stem(token))
    return stems

# List of columns to process
# NOTE(review): clean_text returns "" for any non-string cell, so a column that
# pandas parses as numeric (presumably "id") would be blanked by this pass, and
# identifier-like columns ("url", timestamps) are rewritten to placeholder
# tags. Confirm that every column listed here is really meant to be cleaned.
columns_to_clean = [
    "id", "domain", "type", "url", "content", "scraped_at", "inserted_at",
    "updated_at", "title", "authors", "keywords", "meta_keywords",
    "meta_description", "tags", "summary"
]

# Option 1: Process every chunk and combine them into a single DataFrame.
# Chunks are collected in a list and concatenated once at the end; calling
# pd.concat inside the loop would re-copy all previously processed rows on
# every iteration.
processed_chunks = []

for chunk in pd.read_csv(file_path, chunksize=chunksize, low_memory=False):
    for col in columns_to_clean:
        if col not in chunk.columns:
            continue
        # Clean the column in place
        chunk[col] = chunk[col].apply(clean_text)
        # Create new column with tokenized and stemmed tokens
        token_col = col + "_tokens"
        chunk[token_col] = chunk[col].apply(tokenize_and_stem)
    processed_chunks.append(chunk)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced no chunks.
if processed_chunks:
    preprocessed_data = pd.concat(processed_chunks, ignore_index=True)
else:
    preprocessed_data = pd.DataFrame()

# Save the final DataFrame to a CSV file (token lists are written as their
# string representation).
preprocessed_data.to_csv("preprocessed_data_option1.csv", index=False)


In [5]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

file_path = "995,000_rows.csv"
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of per internal block, which avoids mixed-dtype columns (and the
# accompanying warning) and matches the chunked read_csv calls elsewhere in
# this notebook.
file = pd.read_csv(file_path, low_memory=False)

Final_file_path = "preprocessed_data_option1.csv"
data = pd.read_csv(Final_file_path, low_memory=False)

# Check unique values: number of distinct `content` entries before and after
# preprocessing. dropna=False counts missing content as one distinct value,
# the same figure len(series.unique()) reports.
print(file['content'].nunique(dropna=False))
print(data['content'].nunique(dropna=False))

  file = pd.read_csv(file_path)
  data = pd.read_csv(Final_file_path)


812913
805448


In [6]:
chunksize = 10000

# Metadata columns whose missing-value counts are collected
metadata_cols = ['authors', 'meta_keywords', 'meta_description', 'tags', 'summary']

# Running totals, updated once per chunk
total_rows = 0
missing_counts_acc = None   # becomes a per-column Series after the first chunk
domain_counts_acc = {}      # domain -> number of rows
error_count_acc = 0
content_lengths = []        # declared here but never filled in this cell

# Read the preprocessed CSV chunk by chunk
for chunk in pd.read_csv(Final_file_path, chunksize=chunksize, low_memory=False):
    total_rows += len(chunk)

    # Observation 1: missing values per metadata column
    nulls_per_column = chunk[metadata_cols].isnull().sum()
    if missing_counts_acc is not None:
        missing_counts_acc += nulls_per_column
    else:
        missing_counts_acc = nulls_per_column

    # Observation 2: rows per domain
    for domain, count in chunk['domain'].value_counts().items():
        domain_counts_acc[domain] = domain_counts_acc.get(domain, 0) + count

    # Observation 3: content artifacts and anomalies
    # Cast to str first so missing content becomes text and .str works on every row
    chunk['content'] = chunk['content'].astype(str)
    # Rows whose content contains the standalone word "error" (any case)
    mentions_error = chunk['content'].str.contains(r"\berror\b", case=False, regex=True, na=False)
    error_count_acc += mentions_error.sum()


# Report the accumulated results
print("Total rows processed:", total_rows)

print("\nMissing values in metadata columns (count):", missing_counts_acc, sep="\n")
print(
    "\nMissing values in metadata columns (percentage):",
    (missing_counts_acc / total_rows * 100).round(2),
    sep="\n",
)

print("\nDomain distribution (top 10):")
domain_series = pd.Series(domain_counts_acc).sort_values(ascending=False)
print(domain_series.head(10))

print("\nTotal articles with explicity 'error' in content:", error_count_acc)

Total rows processed: 995000

Missing values in metadata columns (count):
authors             442757
meta_keywords        38790
meta_description    525106
tags                764081
summary             995000
dtype: int64

Missing values in metadata columns (percentage):
authors              44.50
meta_keywords         3.90
meta_description     52.77
tags                 76.79
summary             100.00
dtype: float64

Domain distribution (top 10):
nytimes.com           176144
beforeitsnews.com      91468
dailykos.com           77640
express.co.uk          55983
nationalreview.com     37377
sputniknews.com        37229
abovetopsecret.com     27947
wikileaks.org          23699
www.newsmax.com        12688
www.ammoland.com       11129
dtype: int64

Total articles with explicity 'error' in content: 78554


In [7]:
# Split the dataset into 80% train, 10% validation and 10% test.
# NOTE(review): the unique-content check earlier in this notebook reports fewer
# distinct `content` values than rows, so identical articles can end up in
# different splits; confirm whether de-duplication is wanted before splitting.
from sklearn.model_selection import train_test_split

# Draw the training rows at random (seeded), keep everything else aside
training_data = data.sample(frac=0.8, random_state=42)
in_training = data.index.isin(training_data.index)
remaining_data = data.loc[~in_training]

# Halve the held-out rows: one half for validation, the other for testing
validation_data = remaining_data.sample(frac=0.5, random_state=42)
in_validation = remaining_data.index.isin(validation_data.index)
testing_data = remaining_data.loc[~in_validation]

# Report the number of rows in each split
print("Training data shape:", len(training_data))
print("Validation data shape:", len(validation_data))
print("Testing data shape:", len(testing_data))

Training data shape: 796000
Validation data shape: 99500
Testing data shape: 99500
