# Analizam si curatam datasetul

In [1]:
import pandas as pd
import glob
import os

In [None]:
input_dir = "../../datasets/Article_category" 

csv_files = glob.glob(os.path.join(input_dir, "*.csv"))
print(f"Număr fișiere CSV găsite: {len(csv_files)}")

Număr fișiere CSV găsite: 210


In [4]:
df_list = []
for file in csv_files:
    try:
        df = pd.read_csv(file, on_bad_lines='skip', engine='python')
        df_list.append(df)
    except Exception as e:
        print(f"Eroare la citirea fișierului {file}: {e}")

df_all = pd.concat(df_list, ignore_index=True)
print(f"Număr total de articole combinate: {len(df_all)}")

Număr total de articole combinate: 22154527


In [5]:
print("Numarul de valori lipsa in df_all:", df_all.isnull().sum())

Numarul de valori lipsa in df_all: category           0
content     17512337
dtype: int64


In [6]:
df_all.sample(5)

Unnamed: 0,category,content
11035540,POLITICS,Former U.S. Ambassador to the United Nations J...
6048256,ENTERTAINMENT,
21414552,POLITICS,LOADING ERROR LOADING\n\nPresident Donald Trum...
21152908,POLITICS,
19112368,HEALTHY LIVING,


In [None]:
df_filtered = df_all.dropna(subset=["content"])

output_file = os.path.join(input_dir, "news_categories_filtered.csv")
df_filtered.to_csv(output_file, index=False)

print(f"Număr total de articole după filtrare: {len(df_filtered)}")
print(f"Fișier salvat: {output_file}")

Număr total de articole după filtrare: 4642190
Fișier salvat: ../../datasets/Article_category/news_categories_filtered.csv


In [8]:
df_all.value_counts("category")

category
POLITICS          5533579
ENTERTAINMENT     2401269
HEALTHY LIVING     843469
QUEER VOICES       780558
WELLNESS           737241
COMEDY             673453
WORLD NEWS         638092
SPORTS             630015
BLACK VOICES       600922
BUSINESS           587022
TRAVEL             568384
THE WORLDPOST      558879
PARENTS            532931
WOMEN              498710
CRIME              443558
MEDIA              421680
WEIRD NEWS         398589
PARENTING          380270
STYLE & BEAUTY     353212
IMPACT             351663
GREEN              332289
RELIGION           315650
FOOD & DRINK       297330
STYLE              295600
U.S. NEWS          283349
TASTE              274423
WORLDPOST          264849
TECH               208241
ARTS & CULTURE     208126
SCIENCE            205477
HOME & LIVING      186096
GOOD NEWS          172861
LATINO VOICES      165007
ARTS               150520
FIFTY              145212
WEDDINGS           138992
DIVORCE            136440
COLLEGE            129291
EDU

## Try to repair it

In [5]:
categories_list = [
    "POLITICS", "ENTERTAINMENT", "HEALTHY LIVING", "QUEER VOICES",
    "WELLNESS", "COMEDY", "WORLD NEWS", "SPORTS", "BLACK VOICES",
    "BUSINESS", "TRAVEL", "THE WORLDPOST", "PARENTS", "WOMEN", "CRIME",
    "MEDIA", "WEIRD NEWS", "PARENTING", "STYLE & BEAUTY", "IMPACT",
    "GREEN", "RELIGION", "FOOD & DRINK", "STYLE", "U.S. NEWS", "TASTE",
    "WORLDPOST", "TECH", "ARTS & CULTURE", "SCIENCE", "HOME & LIVING",
    "GOOD NEWS", "LATINO VOICES", "ARTS", "FIFTY", "WEDDINGS", "DIVORCE",
    "COLLEGE", "EDUCATION", "ENVIRONMENT", "MONEY", "CULTURE & ARTS"
]
categories_set = set(categories_list)

In [None]:
def detect_category_at_start(line, categories):
    
    line_stripped = line.strip()
    
    for cat in categories:
        if line_stripped.upper().startswith(cat.upper()):
            cat_len = len(cat)
            leftover = line_stripped[cat_len:]
            leftover = leftover.lstrip(' ,.:!?-—"\'')
            
            return cat.upper(), leftover
    return None, line  


In [None]:
def parse_file_by_start_category(file_in, file_out):
    
    with open(file_in, "r", encoding="utf-8") as fin, \
         open(file_out, "w", encoding="utf-8") as fout:
        
        fout.write("category,content\n")
        
        current_category = None
        content_lines = []
        
        for line in fin:
            line_stripped = line.rstrip("\r\n")
            
            cat_found, leftover = detect_category_at_start(line_stripped, categories_list)
            
            if cat_found is not None:
                if current_category is not None:
                    content_text = " ".join(content_lines)
                    content_text = content_text.replace('"','""')
                    fout.write(f'"{current_category}","{content_text}"\n')
                
                current_category = cat_found
                content_lines = []
                
                if leftover.strip():
                    content_lines.append(leftover.strip())
            
            else:
                if line_stripped.strip():
                    content_lines.append(line_stripped.strip())
        
        if current_category is not None:
            content_text = " ".join(content_lines)
            content_text = content_text.replace('"','""')
            fout.write(f'"{current_category}","{content_text}"\n')


In [None]:
files_with_ext = glob.glob("../../datasets/Article_category/*.csv")

files = [os.path.splitext(os.path.basename(f))[0] for f in files_with_ext]

for file in files:
    file_in = os.path.join("../../datasets/Article_category", file + ".csv")
    file_out = os.path.join("../../datasets/Article_category", file + "_fixed.csv")
    parse_file_by_start_category(file_in, file_out)
    print(f"Processed: {file_in} -> {file_out}")

Processed: ../../datasets/Article_category/news_articles_content_part_23.csv -> ../../datasets/Article_category/news_articles_content_part_23_fixed.csv
Processed: ../../datasets/Article_category/news_articles_content_part_63.csv -> ../../datasets/Article_category/news_articles_content_part_63_fixed.csv
Processed: ../../datasets/Article_category/news_articles_content_part_113.csv -> ../../datasets/Article_category/news_articles_content_part_113_fixed.csv
Processed: ../../datasets/Article_category/news_articles_content_part_88.csv -> ../../datasets/Article_category/news_articles_content_part_88_fixed.csv
Processed: ../../datasets/Article_category/news_articles_content_part_58.csv -> ../../datasets/Article_category/news_articles_content_part_58_fixed.csv
Processed: ../../datasets/Article_category/news_articles_content_part_80.csv -> ../../datasets/Article_category/news_articles_content_part_80_fixed.csv
Processed: ../../datasets/Article_category/news_articles_content_part_76.csv -> ../../

In [13]:
df = pd.read_csv(file_out)

In [14]:
df.value_counts("category")

category
POLITICS          273
ENTERTAINMENT     185
U.S. NEWS         174
WORLD NEWS        138
SPORTS             36
CRIME              26
ENVIRONMENT        25
CULTURE & ARTS     23
WELLNESS           16
PARENTING          15
FOOD & DRINK       14
WEIRD NEWS         14
COMEDY             11
STYLE & BEAUTY      9
BUSINESS            9
TECH                9
GREEN               8
SCIENCE             6
WOMEN               5
MEDIA               4
EDUCATION           3
PARENTS             3
HOME & LIVING       3
BLACK VOICES        3
STYLE               2
QUEER VOICES        2
TRAVEL              2
FIFTY               1
COLLEGE             1
Name: count, dtype: int64

In [None]:
input_dir = "../../datasets/Article_category"  

csv_files = glob.glob(os.path.join(input_dir, "*fixed.csv"))
print(f"Număr fișiere CSV găsite: {len(csv_files)}")

df_list = []
for file in csv_files:
    try:
        df = pd.read_csv(file, on_bad_lines='skip', engine='python')
        df_list.append(df)
    except Exception as e:
        print(f"Eroare la citirea fișierului {file}: {e}")

df_all = pd.concat(df_list, ignore_index=True)
print(f"Număr total de articole combinate: {len(df_all)}")

Număr fișiere CSV găsite: 210
Număr total de articole combinate: 22288851


In [17]:
print("Numarul de valori lipsa in df_all:", df_all.isnull().sum())


Numarul de valori lipsa in df_all: category           0
content     17512558
dtype: int64


In [None]:
from collections import Counter

df = pd.read_csv("news_dataset.csv")  

line_counter = Counter()

for content in df["content"].dropna():
    lines = content.splitlines()
    line_counter.update(lines)

most_common_lines = line_counter.most_common(50)
for line, freq in most_common_lines:
    print(f"[{freq} apariții] {repr(line)}")