## Load and Explore the Dataset Structure

In [11]:
# Import Libraries
import pandas as pd
import glob
import os
import html
import re
from tqdm import tqdm
tqdm.pandas()

# Text normalization
import textacy
import textacy.preprocessing as tprep

## Data Loading and Column Standardization

In [5]:
def load_news_data(directory='.'):
    """
    Load and combine multiple CSV files containing news data from a directory.

    Every ``*.csv`` file directly inside ``directory`` is read with
    ``pd.read_csv``. Files that cannot be read are reported on stdout and
    skipped. The loaded frames are concatenated, reduced to the five columns
    of interest and renamed to the notebook's standard column names.

    Parameters
    ----------
    directory : str, default '.'
        Folder searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``date``, ``text``, ``title``, ``description``.
        ``date`` is converted with ``pd.to_datetime``; the three text columns
        have missing values replaced by ``''`` and surrounding whitespace
        stripped.

    Raises
    ------
    ValueError
        If no CSV file could be loaded.
    KeyError
        If any expected source column is absent from the combined data.
    """
    # Source column name -> standardized column name
    column_mapping = {
        'source_name': 'source',
        'publishedAt': 'date',
        'content': 'text',
        'title': 'title',
        'description': 'description'
    }

    # glob returns paths in arbitrary order; sort so the row order of the
    # combined frame is reproducible across machines and runs.
    csv_files = sorted(glob.glob(os.path.join(directory, '*.csv')))

    # Read each CSV file, skipping (but reporting) the ones that fail
    dfs = []
    for file in csv_files:
        try:
            dfs.append(pd.read_csv(file))
        except Exception as e:
            print(f"Error reading file {file}: {str(e)}")

    if not dfs:
        raise ValueError("No CSV files were successfully loaded")

    combined_df = pd.concat(dfs, ignore_index=True)

    # Select and rename columns according to mapping (explicit list rather
    # than a dict_keys view, which is not a documented pandas indexer)
    final_df = combined_df[list(column_mapping)].rename(columns=column_mapping)

    # Convert date column to datetime
    final_df['date'] = pd.to_datetime(final_df['date'])

    # Basic cleaning: NaN -> '', then trim. The cast to str keeps .str usable
    # even when pandas inferred a non-string dtype for a text column.
    for text_col in ['text', 'title', 'description']:
        final_df[text_col] = final_df[text_col].fillna('').astype(str).str.strip()

    # Report only the files that actually loaded, not every file found
    print(f"Loaded {len(final_df)} rows from {len(dfs)} files")
    return final_df

# Load the data
try:
    news_df = load_news_data()
except Exception as e:
    # Report the failure, then re-raise: swallowing it would let later cells
    # run against a missing (NameError) or stale `news_df` from an older run.
    print(f"Error loading data: {str(e)}")
    raise

# Display basic information about the dataset
print("\nDataset Info:")
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that emits a stray "None" line).
news_df.info()

# Display first few rows
print("\nFirst few rows:")
print(news_df.head())

Loaded 755 rows from 5 files

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 755 entries, 0 to 754
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   source       570 non-null    object             
 1   date         570 non-null    datetime64[ns, UTC]
 2   text         755 non-null    object             
 3   title        755 non-null    object             
 4   description  755 non-null    object             
dtypes: datetime64[ns, UTC](1), object(4)
memory usage: 29.6+ KB
None

First few rows:
             source                      date  \
0       Gizmodo.com 2025-02-04 15:00:56+00:00   
1  Business Insider 2025-02-04 18:25:21+00:00   
2      Substack.com 2025-02-04 21:28:04+00:00   
3  Business Insider 2025-02-04 21:26:32+00:00   
4      heise online 2025-02-04 14:00:00+00:00   

                                                text  \
0  Usually when large language model

## Text Cleaning 

In [6]:
def clean_news_text(text):
    """
    Clean one raw news snippet.

    Steps, in order: decode HTML entities, drop the ``[+N chars]`` truncation
    marker, strip http/https URLs, repair known mojibake sequences, collapse
    whitespace runs to a single space, and trim both ends.

    Parameters
    ----------
    text : str or None
        Raw text. ``None``, NaN, ``pd.NA`` and other falsy values yield ``""``.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Check for missing values before truthiness: `not pd.NA` raises TypeError
    if pd.isna(text) or not text:
        return ""

    # Convert HTML escapes (e.g. "&amp;" -> "&")
    text = html.unescape(text)

    # Remove the "[+XXX chars]" truncation marker found in the data
    text = re.sub(r'\[\+\d+ chars\]', '', text)

    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)

    # Repair UTF-8 punctuation that was mis-decoded as Windows-1252.
    # Patterns are written with escapes so an editor cannot "smart-quote" them:
    #   \u00e2\u20ac\u2122           -> apostrophe (was U+2019)
    #   \u00e2\u20ac\u201c / \u201d  -> dash (was en dash U+2013 / em dash U+2014)
    # The ASCII '"' variant is kept for compatibility with the earlier pattern.
    text = text.replace('\u00e2\u20ac\u2122', "'")
    text = re.sub('\u00e2\u20ac[\u201c\u201d"]', '-', text)

    # Collapse runs of whitespace (including newlines) to a single space
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Run the cleaner over every raw article body (tqdm shows progress) and keep
# the result in a new column so the original `text` stays available.
news_df['clean_text'] = (
    news_df['text']
    .progress_apply(clean_news_text)
)

100%|████████████████████████████████████████████████████████████████████████████████████████| 755/755 [00:00<?, ?it/s]


## Text Normalization

In [13]:
def normalize_text(text):
    """
    Normalize a cleaned text string with textacy's preprocessing helpers.

    Applies, in this order: ``normalize.unicode``,
    ``normalize.quotation_marks`` and ``remove.accents``, each called with
    its default arguments, and returns the final string.
    """
    pipeline = (
        tprep.normalize.unicode,
        tprep.normalize.quotation_marks,
        tprep.remove.accents,
    )
    for step in pipeline:
        text = step(text)
    return text

# Normalize the cleaned text row by row (tqdm shows progress); the result is
# stored in its own column so `clean_text` is left untouched.
news_df['normalized_text'] = (
    news_df['clean_text']
    .progress_apply(normalize_text)
)

100%|█████████████████████████████████████████████████████████████████████████████| 755/755 [00:00<00:00, 24388.12it/s]
