In [117]:
from difflib import SequenceMatcher

import matplotlib.pyplot as plt
import pandas as pd
from pymongo import MongoClient, ReplaceOne

# MongoDB connection settings, kept in one place so they are easy to spot and change.
MONGO_URI = "mongodb://localhost:27017/"
DB_NAME = "financial_news"

# Handles used by the rest of the notebook: `collection` is the raw source that
# gets loaded into the DataFrame, `clean_collection` receives the cleaned rows.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection, clean_collection = db["articles"], db["articles_clean"]

In [118]:
# Materialise every document of the raw collection (find() is called without a
# filter) and tabulate the result: one row per document, one column per field.
docs = [doc for doc in collection.find()]
df = pd.DataFrame(data=docs)

# Schema / null-count overview first; the trailing expression renders a preview
# of the first five rows as the cell output.
print("Initial dataset info:")
df.info()
df.head(5)

Initial dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   _id           150 non-null    object        
 1   id            150 non-null    object        
 2   title         150 non-null    object        
 3   description   138 non-null    object        
 4   content       150 non-null    object        
 5   url           150 non-null    object        
 6   source        150 non-null    object        
 7   published_at  150 non-null    datetime64[ns]
 8   collected_at  150 non-null    datetime64[ns]
 9   extra         150 non-null    object        
dtypes: datetime64[ns](2), object(8)
memory usage: 11.8+ KB


Unnamed: 0,_id,id,title,description,content,url,source,published_at,collected_at,extra
0,68b25a179f46408f5018cdc8,b1d8a2b23538d1136045063c70c0e4bf,Bitcoin Flash Crash Roils Crypto Market,Did a single whale disrupt the crypto ocean?,Crypto prices dipped Monday following a so-cal...,https://gizmodo.com/bitcoin-price-flash-crash-...,Gizmodo.com,2025-08-25 17:50:49,2025-08-29 22:55:35.344,{'author': 'Matt Novak'}
1,68b25a179f46408f5018cdc9,3bdc3edfc897bc95f271a46835cf1403,Trump Turns 401(k)s Into Crypto Machines,A new executive order could funnel billions fr...,"President Donald Trump, who has rebranded hims...",https://gizmodo.com/trump-turns-401ks-into-cry...,Gizmodo.com,2025-08-07 23:22:10,2025-08-29 22:55:35.344,{'author': 'Luc Olinga'}
2,68b25a179f46408f5018cdca,2fb8199562f60a5c2f2e727bcbd79d96,Donald Trump Orders Crackdown on Politically-M...,"In a new executive order, the US president has...",Carter termed this alleged discrimination camp...,https://www.wired.com/story/donald-trump-order...,Wired,2025-08-08 07:02:49,2025-08-29 22:55:35.344,{'author': 'Joel Khalili'}
3,68b25a179f46408f5018cdcb,8b052c32e6c92d817202f538c62481e2,El Salvador’s Crypto President Could Stay in P...,Trump's buddy in Central America may become a ...,El Salvador’s Legislative Assembly approved ch...,https://gizmodo.com/el-salvadors-crypto-presid...,Gizmodo.com,2025-08-01 18:50:12,2025-08-29 22:55:35.344,{'author': 'Matt Novak'}
4,68b25a179f46408f5018cdcc,0d965d457d882086abe7a93a0afea96b,“We Miss All the Trains to Get Rich”: The Real...,I went to a journalism convention expecting to...,Ive been at the National Association of Black ...,https://gizmodo.com/we-miss-all-the-trains-to-...,Gizmodo.com,2025-08-07 15:23:51,2025-08-29 22:55:35.344,{'author': 'Luc Olinga'}


# Limpieza

In [119]:
# Character count of each article body; a missing body is treated as an empty
# string and therefore gets length 0.
content_text = df["content"].fillna("")
df["content_length"] = content_text.str.len()

# Summary statistics of the new column, shown as the cell output.
df["content_length"].describe()

count     150.000000
mean     1172.373333
std      1963.196847
min        57.000000
25%       214.000000
50%       214.000000
75%       215.000000
max      8239.000000
Name: content_length, dtype: float64

In [120]:
# Keep only articles whose body is strictly longer than this many characters.
# NOTE(review): the rationale for 150 is not stated in the notebook; presumably
# it is meant to drop stub articles — confirm.
MIN_CONTENT_LENGTH = 150

is_long_enough = df["content_length"] > MIN_CONTENT_LENGTH
df = df[is_long_enough]

In [121]:
# Row count before de-duplication, used only for the report line below.
initial_count = len(df)

# "Exact duplicate" here means a repeated `id` value; the first occurrence of
# each `id` is the one that is kept.
df = df.drop_duplicates(subset=["id"], keep="first")

print(f"Removed {initial_count - len(df)} exact duplicates.")

Removed 2 exact duplicates.


In [122]:
# Parse the publication timestamps once; values that cannot be parsed become
# NaT instead of raising (errors="coerce").
parsed_published_at = pd.to_datetime(df["published_at"], errors="coerce")
df["published_at"] = parsed_published_at

# Drop every row whose timestamp is missing or could not be parsed.
has_valid_date = parsed_published_at.notna()
df = df[has_valid_date]

In [123]:
# Number of remaining articles per source, most frequent first.
source_counts = df["source"].value_counts()

print("\nSource counts:")
print(source_counts)


Source counts:
source
Forbes                    36
Yahoo Entertainment       19
Gizmodo.com               14
CoinOtag                  11
Cryptopolitan              8
Bitcoin World              7
BitcoinSistemi             4
NewsBTC                    4
TheStreet                  4
Dlnews.com                 4
Bitcoinist                 3
Business Insider           3
TimesTabloid               3
The Coin Rise              3
CoinDesk                   3
Biztoc.com                 2
Barchart.com               2
TechRadar                  1
Slate Magazine             1
R-bloggers.com             1
Wired                      1
USA Today                  1
Substack.com               1
TMZ                        1
Coinspeaker                1
AppleInsider               1
Kotaku                     1
Calculatedriskblog.com     1
Bitcoinke.io               1
Bitcoin.com                1
Name: count, dtype: int64


In [129]:
# Earliest and latest publication timestamps among the remaining articles.
earliest_published, latest_published = df["published_at"].min(), df["published_at"].max()
print("\nPublished at range:", earliest_published, "-", latest_published)


Published at range: 2025-07-29 01:01:03 - 2025-08-29 22:46:55


In [130]:
# Persist the cleaned articles into `articles_clean`.
#
# Every record still carries the `_id` it was loaded with from the raw
# collection, so a plain insert_many() raises a duplicate-key BulkWriteError as
# soon as this cell runs against a collection that already holds these
# documents (e.g. on a second run or after Restart & Run All). Upserting by
# `_id` keeps the cell safe to re-run.
records = df.to_dict('records')

if records:
    # One ReplaceOne per article: inserted when its `_id` is new, overwritten
    # with the freshly cleaned version otherwise.
    write_ops = [
        ReplaceOne({"_id": record["_id"]}, record, upsert=True)
        for record in records
    ]
    # The operations are independent of each other, so ordering is not required.
    write_result = clean_collection.bulk_write(write_ops, ordered=False)

    print(f"Inserted {write_result.upserted_count} clean articles into MongoDB collection 'articles_clean'")
    if write_result.matched_count:
        # Only reported on re-runs, when some documents were already stored.
        print(f"Refreshed {write_result.matched_count} articles already present in 'articles_clean'")
else:
    # Both insert_many() and bulk_write() reject an empty batch, so skip the
    # write entirely when the cleaning steps removed every row.
    print("No clean articles to insert into MongoDB collection 'articles_clean'")

Inserted 143 clean articles into MongoDB collection 'articles_clean'
