In [17]:
import pandas as pd

# Split the raw news dataset into one CSV per category, capping each
# category at SAMPLES_PER_CATEGORY rows.

# Maximum number of rows written per category. Categories with fewer rows
# than this are written out in full (unsampled, original row order).
SAMPLES_PER_CATEGORY = 5000

# Fixed seed so the sampled subsets are reproducible across runs.
RANDOM_STATE = 42

# Load the dataset from CSV
df = pd.read_csv('News_Dataset.csv')

# Strip leading/trailing whitespace from the category labels so the exact
# (case-sensitive) equality test below is not defeated by stray spaces.
df['category'] = df['category'].str.strip()

# Categories to export. The match against df['category'] is exact, so the
# names are upper-cased right after the list is built.
categories = [
    'politics', 'entertainment', 'World news', 'sports', 'crime', 'environment',
    'food & drink', 'style & beauty', 'parenting', 'comedy', 'culture & arts',
    'business', 'science', 'tech', 'education', 'travel',
    'religion', 'college'
]
categories = [cat.upper() for cat in categories]

# For each category: filter, report the row count, and write the (possibly
# down-sampled) rows to '<CATEGORY>_news.csv'.
for category in categories:
    # Exact, case-sensitive match on the stripped label.
    category_df = df[df['category'] == category]

    # Report how many rows matched so empty or small categories are visible.
    print(f"Category '{category}' - Rows: {len(category_df)}")

    if len(category_df) > 0:
        # Down-sample only when the category has at least the cap; smaller
        # categories are kept whole rather than raising in sample().
        if len(category_df) >= SAMPLES_PER_CATEGORY:
            category_sample = category_df.sample(
                n=SAMPLES_PER_CATEGORY, random_state=RANDOM_STATE
            )
        else:
            category_sample = category_df

        # File name embeds the upper-cased category name verbatim
        # (including spaces and '&').
        category_sample.to_csv(f'{category}_news.csv', index=False)
    else:
        print(f"No data found for category: {category}")


Category 'POLITICS' - Rows: 35602
Category 'ENTERTAINMENT' - Rows: 17362
Category 'WORLD NEWS' - Rows: 3299
Category 'SPORTS' - Rows: 5077
Category 'CRIME' - Rows: 3562
Category 'ENVIRONMENT' - Rows: 1444
Category 'FOOD & DRINK' - Rows: 6340
Category 'STYLE & BEAUTY' - Rows: 9814
Category 'PARENTING' - Rows: 8791
Category 'COMEDY' - Rows: 5400
Category 'CULTURE & ARTS' - Rows: 1074
Category 'BUSINESS' - Rows: 5992
Category 'SCIENCE' - Rows: 2206
Category 'TECH' - Rows: 2104
Category 'EDUCATION' - Rows: 1014
Category 'TRAVEL' - Rows: 9900
Category 'RELIGION' - Rows: 2577
Category 'COLLEGE' - Rows: 1144


In [18]:
import pandas as pd
import glob

# Combine the per-category '<CATEGORY>_news.csv' files into a single dataset
# and write it to 'News_Data_Final.csv'.

# Categories whose CSV files should be combined. File names use the
# upper-cased category name, so the list is upper-cased after it is built.
categories = [
    'politics', 'entertainment', 'World news', 'sports', 'crime', 'environment',
    'food & drink', 'style & beauty', 'parenting', 'comedy', 'culture & arts',
    'business', 'science', 'tech', 'education', 'travel',
    'religion', 'college'
]
categories = [cat.upper() for cat in categories]

# One dataframe per category file that was successfully read.
dataframes = []

for category in categories:
    try:
        # A dedicated name is used here so the notebook-level `df` (the full
        # raw dataset) is not overwritten by a single category's rows.
        category_frame = pd.read_csv(f'{category}_news.csv')
    except FileNotFoundError:
        # A missing file is reported and skipped; the remaining categories
        # are still combined.
        print(f"CSV file for category '{category}' not found!")
    else:
        dataframes.append(category_frame)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with an actionable message instead when no file could be read.
if not dataframes:
    raise FileNotFoundError(
        "None of the '<CATEGORY>_news.csv' files were found; nothing to combine."
    )

# Stack all category frames; ignore_index gives the result a fresh 0..n-1 index.
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined dataframe to a new CSV file
combined_df.to_csv('News_Data_Final.csv', index=False)

print(f"Combined dataset has {len(combined_df)} rows.")


Combined dataset has 63424 rows.
