In [3]:
import pandas as pd
import string
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter

nltk.download("stopwords")
nltk.download("punkt")


In [5]:

# List all your CSV files
files = ["Deepseek_Day_One.csv", "Deepseek_Day_Two.csv", "Deepseek_Day_Three.csv", "Deepseek_Day_Four.csv", "Deepseek_Day_Five.csv"]

# Read and concatenate them
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Save to a new CSV file
df.to_csv("deepseek_all.csv", index=False)

print("Files merged successfully into 'deepseek_all.csv'")


Files merged successfully into 'deepseek_all.csv'


In [7]:
print(len(df))  


1050


In [9]:
print(df.shape[0]) 

1050


In [11]:
import pandas as pd

files = ["Deepseek_Day_One.csv", "Deepseek_Day_Two.csv", "Deepseek_Day_Three.csv", "Deepseek_Day_Four.csv", "Deepseek_Day_Five.csv"]

for file in files:
    df_temp = pd.read_csv(file)
    print(f"{file}: {df_temp.columns.tolist()}")


Deepseek_Day_One.csv: ['Unnamed: 0', 'source_id', 'source_name', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content']
Deepseek_Day_Two.csv: ['source_id', 'source_name', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content']
Deepseek_Day_Three.csv: ['source_id', 'source_name', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content']
Deepseek_Day_Four.csv: ['source_id', 'source_name', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content']
Deepseek_Day_Five.csv: ['source_id', 'source_name', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content']


In [13]:

files = ["Deepseek_Day_One.csv", "Deepseek_Day_Two.csv", "Deepseek_Day_Three.csv", "Deepseek_Day_Four.csv", "Deepseek_Day_Five.csv"]

df = pd.concat([pd.read_csv(f, index_col=0) if "Unnamed: 0" in pd.read_csv(f, nrows=1).columns else pd.read_csv(f) for f in files], ignore_index=True)

df.to_csv("deepseek_all.csv", index=False)

print("Files merged successfully into 'deepseek_all.csv'")


Files merged successfully into 'deepseek_all.csv'


In [15]:
print(len(df))  
print(df.shape[0]) 

1050
1050


In [17]:
print(df.columns.tolist())  


['source_id', 'source_name', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content']


In [19]:
duplicates = df[df.duplicated(subset=['title', 'url'])]
print(f"Number of duplicate articles: {len(duplicates)}")


Number of duplicate articles: 100


In [21]:
# Remove duplicates
df = df.drop_duplicates()

# Check the new shape
print(f"New shape after removing duplicates: {df.shape}")


New shape after removing duplicates: (950, 9)


Data Cleaning and Tokenization

In [24]:

# Load dataset
df = pd.read_csv("deepseek_all.csv")

# Select text columns
text_columns = ["title", "description", "content"]
df[text_columns] = df[text_columns].fillna("")  # Replace NaN with empty strings

# Define stopwords and punctuation
sw = set(stopwords.words("english"))
punctuation = set(string.punctuation)

# Function to clean and tokenize text
def clean_and_tokenize(text):
    text = "".join([ch for ch in text if ch not in punctuation])  # Remove punctuation
    tokens = [token.lower().strip() for token in word_tokenize(text)]  # Tokenize and lowercase
    return [token for token in tokens if token not in sw]  # Remove stopwords

# Dictionary to store cleaned tokens
cleaned_news_data = defaultdict(list)

# Iterate through each article
for idx, row in df.iterrows():
    article_id = f"article_{idx}"  # Unique key for each article
    combined_text = " ".join([row[col] for col in text_columns])  # Merge title, description, content
    cleaned_tokens = clean_and_tokenize(combined_text)  # Apply cleaning function
    cleaned_news_data[article_id] = cleaned_tokens  # Store cleaned tokens

# Convert cleaned data to a DataFrame
cleaned_df = pd.DataFrame({"article_id": list(cleaned_news_data.keys()), "tokens": list(cleaned_news_data.values())})

# Save cleaned tokens to CSV
cleaned_df.to_csv("deepseek_cleaned.csv", index=False)

print("Data cleaning and tokenization complete! Saved as 'deepseek_cleaned.csv'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/parisakamizi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/parisakamizi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data cleaning and tokenization complete! Saved as 'deepseek_cleaned.csv'.


In [26]:
def descriptive_stats(tokens, top_n=5, verbose=True):
    """
    Given a list of tokens, print and return key statistics:
    - Number of tokens
    - Number of unique tokens
    - Lexical diversity
    - Number of characters
    - Most common tokens
    """

    num_tokens = len(tokens)
    num_unique_tokens = len(set(tokens))
    lexical_diversity = num_unique_tokens / num_tokens if num_tokens > 0 else 0
    num_characters = sum(len(token) for token in tokens)
    
    # Find most common words
    common_tokens = Counter(tokens).most_common(top_n)

    if verbose:
        print(f"Total Tokens: {num_tokens}")
        print(f"Unique Tokens: {num_unique_tokens}")
        print(f"Total Characters: {num_characters}")
        print(f"Lexical Diversity: {lexical_diversity:.3f}")
        print(f"Top {top_n} Most Common Tokens: {common_tokens}")
    
    return {
        "num_tokens": num_tokens,
        "num_unique_tokens": num_unique_tokens,
        "lexical_diversity": lexical_diversity,
        "num_characters": num_characters,
        "common_tokens": common_tokens
    }

# Load cleaned tokenized dataset
cleaned_df = pd.read_csv("deepseek_cleaned.csv")

# Convert token strings back to lists
cleaned_df["tokens"] = cleaned_df["tokens"].apply(lambda x: eval(x) if isinstance(x, str) else x)

# Combine all tokens into one list
all_tokens = [token for tokens in cleaned_df["tokens"] for token in tokens]

# Perform EDA
eda_results = descriptive_stats(all_tokens)

# Convert EDA results into DataFrame for easy export
eda_df = pd.DataFrame([eda_results])

# Save EDA results
eda_df.to_csv("deepseek_eda.csv", index=False)

print("EDA complete! Results saved as 'deepseek_eda.csv'.")


Total Tokens: 33142
Unique Tokens: 9094
Total Characters: 198474
Lexical Diversity: 0.274
Top 5 Most Common Tokens: [('deepseek', 1274), ('de', 670), ('ai', 658), ('chars', 539), ('la', 325)]
EDA complete! Results saved as 'deepseek_eda.csv'.
