In [1]:
# Display the path of the Python interpreter that backs this kernel.
# NOTE(review): presumably a check that the project's virtual environment is
# the one in use — confirm. The displayed value is an absolute local path, so
# consider clearing this cell's output before sharing the notebook.
import sys
sys.executable

'/Users/silvia/University/Dyploma/Analysis_of_biased_news/.venv/bin/python'

In [2]:
import polars as pl
import numpy as np
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import warnings
# Install a global "ignore" filter: no warning of any category is shown for the
# rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used in later
# cells — confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')

# Download required NLTK data
# (quiet=True suppresses the downloader's own console output; the call on the
# last line is the cell's final expression, so its return value is displayed).
nltk.download('punkt', quiet=True)  # tokenizer models used by word_tokenize
nltk.download('punkt_tab', quiet=True)  # tokenizer tables required by newer NLTK releases
nltk.download('stopwords', quiet=True)  # backs stopwords.words('english')
nltk.download('wordnet', quiet=True)  # backs WordNetLemmatizer
# NOTE(review): no POS tagging is visible in this part of the notebook —
# confirm the tagger resource below is still needed.
nltk.download('averaged_perceptron_tagger', quiet=True)

True

In [3]:
# Read the labelled sentences from the Excel workbook and, in the same step,
# give the column that arrives as "__UNNAMED__0" the name "id".
# NOTE(review): presumably that column is the spreadsheet's unlabeled index
# column — confirm against the workbook.
data = pl.read_excel("../data/labeled_dataset.xlsx").rename({"__UNNAMED__0": "id"})

## Basic Analysis

In [4]:
# Corpus size: every row of `data` is counted as one document.
num_documents = len(data)
print(f"Total number of documents: {num_documents:,}")

# Per-document length features.
# `word_count` is the number of pieces produced by splitting on a single space
# character, i.e. a whitespace-based count rather than a tokenizer count.
data = data.with_columns(
    char_length=pl.col("sentence").str.len_chars(),
    word_count=pl.col("sentence").str.split(" ").list.len(),
)


def _print_length_stats(header, lengths):
    """Print mean, median, min, max and standard deviation of one length column.

    header  -- title line printed before the statistics
    lengths -- polars Series holding one length value per document
    """
    print(header)
    print(f"  Mean: {lengths.mean():.2f}")
    print(f"  Median: {lengths.median():.2f}")
    print(f"  Min: {lengths.min()}")
    print(f"  Max: {lengths.max()}")
    print(f"  Std Dev: {lengths.std():.2f}")


_print_length_stats("\nDocument Length Statistics (characters):", data["char_length"])
_print_length_stats("\nDocument Length Statistics (words):", data["word_count"])

# Corpus-level vocabulary: lower-case the concatenated sentences and take every
# run of word characters as one token. `words` is reused by later cells.
all_text = " ".join(data["sentence"].to_list())
words = re.findall(r'\b\w+\b', all_text.lower())
vocabulary = set(words)
token_lengths = [len(token) for token in words]
print(f"\nVocabulary size (unique words): {len(vocabulary):,}")
print(f"Total words: {len(words):,}")
print(f"Average word length: {np.mean(token_lengths):.2f} characters")

Total number of documents: 1,700

Document Length Statistics (characters):
  Mean: 209.72
  Median: 206.00
  Min: 42
  Max: 606
  Std Dev: 72.42

Document Length Statistics (words):
  Mean: 33.48
  Median: 33.00
  Min: 7
  Max: 100
  Std Dev: 11.93

Vocabulary size (unique words): 8,864
Total words: 58,534
Average word length: 5.01 characters


## Distribution of word frequencies

In [5]:
# Unigram frequencies over the lower-cased word tokens in `words`; the
# percentage is each word's share of all tokens.
word_freq = Counter(words)
total_words = len(words)
print("Top 20 most common words (unigram):")
for word, freq in word_freq.most_common(20):
    print(f"  {word:20s}: {freq:6,} ({freq/total_words*100:.2f}%)")

# Bigram frequencies, built one sentence at a time.
# Pairing neighbours in the corpus-wide `words` list would also pair the last
# word of each sentence with the first word of the next one, counting bigrams
# that never occur in any document. Tokenizing per sentence with the same
# pattern used for `words` avoids those cross-document pairs.
bigrams = []
for sentence in data["sentence"].to_list():
    sentence_words = re.findall(r'\b\w+\b', sentence.lower())
    bigrams.extend(
        " ".join(pair) for pair in zip(sentence_words, sentence_words[1:])
    )
bigram_freq = Counter(bigrams)
print("\nTop 20 most common bigrams:")
for bigram, freq in bigram_freq.most_common(20):
    print(f"  {bigram:35s}: {freq:6,}")

TOp 20 most common words (unigram):
  the                 :  3,183 (5.44%)
  to                  :  1,753 (2.99%)
  of                  :  1,618 (2.76%)
  and                 :  1,422 (2.43%)
  a                   :  1,340 (2.29%)
  in                  :  1,106 (1.89%)
  s                   :    813 (1.39%)
  that                :    807 (1.38%)
  for                 :    614 (1.05%)
  on                  :    552 (0.94%)
  trump               :    479 (0.82%)
  is                  :    458 (0.78%)
  as                  :    405 (0.69%)
  by                  :    342 (0.58%)
  with                :    326 (0.56%)
  has                 :    316 (0.54%)
  are                 :    300 (0.51%)
  have                :    299 (0.51%)
  it                  :    295 (0.50%)
  his                 :    281 (0.48%)

Top 20 most common bigrams:
  of the                             :    302
  in the                             :    270
  u s                                :    152
  to the         

## Tokenization and preprocessing evaluation

In [6]:
# Shared preprocessing resources: NLTK's English stopword list (as a set for
# fast membership tests) and a WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Turn raw text into a list of lemmatized content tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. A token is
    kept only if it is purely alphanumeric and not in ``stop_words``; each kept
    token is then passed through ``lemmatizer.lemmatize``.

    Returns the kept tokens in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue  # punctuation and mixed tokens are dropped
        if token in stop_words:
            continue
        kept.append(token)
    return [lemmatizer.lemmatize(token) for token in kept]

# Show the first sentence next to its preprocessed form (first 200 characters
# of the raw text, first 30 tokens of the result).
sample_text = data["sentence"][0]
print("\nOriginal text sample:")
print(f"  {sample_text[:200]}...")
print("\nPreprocessed tokens:")
preprocessed = preprocess_text(sample_text)
print(f"  {preprocessed[:30]}")

# Stopword share of the corpus, measured on the regex tokens in `words`.
# The list of stopword occurrences is built once; its length is the count.
stopwords_in_corpus = [token for token in words if token.lower() in stop_words]
stopword_count = len(stopwords_in_corpus)
print("\nStopwords analysis:")
print(f"  Stopwords found: {stopword_count:,} ({stopword_count/len(words)*100:.2f}%)")
print(f"  Words after stopword removal: {len(words) - stopword_count:,}")

stopword_freq = Counter(stopwords_in_corpus)
print("\n  Top 10 most frequent stopwords:")
for stopword, occurrences in stopword_freq.most_common(10):
    print(f"    {stopword:15s}: {occurrences:6,}")


Original text sample:
  YouTube is making clear there will be no “birtherism” on its platform during this year’s U.S. presidential election – a belated response to a type of conspiracy theory more prevalent in the 2012 race....

Preprocessed tokens:
  ['youtube', 'making', 'clear', 'birtherism', 'platform', 'year', 'presidential', 'election', 'belated', 'response', 'type', 'conspiracy', 'theory', 'prevalent', '2012', 'race']

Stopwords analysis:
  Stopwords found: 23,759 (40.59%)
  Words after stopword removal: 34,775

  Top 10 most frequent stopwords:
    the            :  3,183
    to             :  1,753
    of             :  1,618
    and            :  1,422
    a              :  1,340
    in             :  1,106
    s              :    813
    that           :    807
    for            :    614
    on             :    552


## Sentiment Exploration

In [7]:
def get_sentiment(text):
    """Return ``(polarity, subjectivity)`` for ``text`` as computed by TextBlob.

    Scoring is best effort: if TextBlob raises an ordinary exception for this
    input, ``(0.0, 0.0)`` is returned so a single bad document does not abort
    the whole run. Only ``Exception`` subclasses are caught, so a kernel
    interrupt (``KeyboardInterrupt``) or ``SystemExit`` still propagates
    instead of being silently converted into a zero score.
    """
    try:
        sentiment = TextBlob(text).sentiment
        return sentiment.polarity, sentiment.subjectivity
    except Exception:
        # Deliberate fallback value; note that it is indistinguishable from a
        # genuinely neutral, objective document in the statistics that follow.
        return 0.0, 0.0

# Score at most the first 500 documents. The slice takes leading rows, so this
# is not a random sample.
sample_size = min(500, len(data))
# Report the number of documents actually scored rather than a fixed "500",
# which would be wrong for a dataset with fewer rows.
print(f"\nCalculating sentiment scores (sample of {sample_size} documents)...")
sentiments = [get_sentiment(text) for text in data["sentence"][:sample_size].to_list()]
polarities = [polarity for polarity, _ in sentiments]
subjectivities = [subjectivity for _, subjectivity in sentiments]

# NOTE(review): np.std defaults to the population formula (ddof=0), whereas the
# polars Series.std() used for the length statistics defaults to ddof=1 —
# confirm whether the two "Std Dev" figures are meant to be comparable.
print("\nSentiment Statistics:")
print("  Polarity (range: -1 to 1, negative to positive):")
print(f"    Mean: {np.mean(polarities):.3f}")
print(f"    Median: {np.median(polarities):.3f}")
print(f"    Std Dev: {np.std(polarities):.3f}")

print("\n  Subjectivity (range: 0 to 1, objective to subjective):")
print(f"    Mean: {np.mean(subjectivities):.3f}")
print(f"    Median: {np.median(subjectivities):.3f}")
print(f"    Std Dev: {np.std(subjectivities):.3f}")

# Sentiment distribution: polarity above 0.1 counts as positive, below -0.1 as
# negative, and everything in between (bounds included) as neutral.
positive = sum(1 for p in polarities if p > 0.1)
negative = sum(1 for p in polarities if p < -0.1)
neutral = sample_size - positive - negative

print("\n  Sentiment Distribution:")
print(f"    Positive: {positive:4} ({positive/sample_size*100:.1f}%)")
print(f"    Neutral:  {neutral:4} ({neutral/sample_size*100:.1f}%)")
print(f"    Negative: {negative:4} ({negative/sample_size*100:.1f}%)")


Calculating sentiment scores (sample of 500 documents)...

Sentiment Statistics:
  Polarity (range: -1 to 1, negative to positive):
    Mean: 0.029
    Median: 0.000
    Std Dev: 0.202

  Subjectivity (range: 0 to 1, objective to subjective):
    Mean: 0.384
    Median: 0.399
    Std Dev: 0.244

  Sentiment Distribution:
    Positive:  145 (29.0%)
    Neutral:   269 (53.8%)
    Negative:   86 (17.2%)


## Topic Exploration

In [8]:
# --- TF-IDF keywords -------------------------------------------------------
# Fit a unigram+bigram TF-IDF model (at most 100 features, English stopwords
# removed, terms must occur in at least 2 documents) on the first 1000
# sentences, then rank features by their mean TF-IDF weight across documents.
print("\nPerforming TF-IDF analysis...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_features=100,
)

sample_texts = data["sentence"][:1000].to_list()
tfidf_matrix = tfidf_vectorizer.fit_transform(sample_texts)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Column means of the document-term matrix, flattened to one score per feature.
tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
# Indices of the 20 highest-scoring features, best first.
top_indices = np.argsort(tfidf_scores)[::-1][:20]

print("\nTop 20 keywords by TF-IDF score:")
for feature_index in top_indices:
    keyword = feature_names[feature_index]
    score = tfidf_scores[feature_index]
    print(f"  {keyword:30s}: {score:.4f}")

# --- LDA topics ------------------------------------------------------------
# Raw term counts (at most 1000 features, English stopwords removed, terms in
# at least 2 documents and in at most 80% of them) feed a 5-topic LDA model
# with a fixed random_state and 10 iterations.
print("\n\nPerforming LDA topic modeling (5 topics)...")
n_topics = 5
n_top_words = 10

count_vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.8,
    max_features=1000,
)
count_matrix = count_vectorizer.fit_transform(sample_texts)

lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=10,
    random_state=42,
)
lda.fit(count_matrix)

feature_names_lda = count_vectorizer.get_feature_names_out()

print(f"\nTop {n_top_words} words for each topic:")
for topic_number, term_weights in enumerate(lda.components_, start=1):
    # Highest-weighted terms of this topic, best first.
    ranked_terms = np.argsort(term_weights)[::-1][:n_top_words]
    topic_terms = [feature_names_lda[term_index] for term_index in ranked_terms]
    print(f"\nTopic {topic_number}: {', '.join(topic_terms)}")


Performing TF-IDF analysis...

Top 20 keywords by TF-IDF score:
  trump                         : 0.0655
  women                         : 0.0410
  president                     : 0.0391
  new                           : 0.0381
  abortion                      : 0.0352
  people                        : 0.0347
  climate                       : 0.0341
  said                          : 0.0313
  anti                          : 0.0299
  coronavirus                   : 0.0297
  year                          : 0.0266
  student                       : 0.0262
  democrats                     : 0.0256
  american                      : 0.0243
  donald                        : 0.0238
  donald trump                  : 0.0238
  world                         : 0.0225
  media                         : 0.0223
  years                         : 0.0221
  change                        : 0.0219


Performing LDA topic modeling (5 topics)...

Top 10 words for each topic:

Topic 1: abortion, women, climate, sta

## Noise Identification

In [9]:
# Count documents that are missing or blank. Besides nulls this also covers
# empty and whitespace-only strings, so the figure matches the "Null/Empty"
# label (a plain null count would report 0 for a column of empty strings).
blank_or_null = pl.col("sentence").is_null() | (
    pl.col("sentence").str.strip_chars().str.len_chars() == 0
)
null_count = data.filter(blank_or_null).height
print(f"\nNull/Empty documents: {null_count}")

# Check for very short documents (potential spam)
very_short = data.filter(pl.col("word_count") < 5).height
print(f"Very short documents (<5 words): {very_short} ({very_short/num_documents*100:.2f}%)")

# Check for very long documents (potential concatenation issues)
very_long = data.filter(pl.col("word_count") > 200).height
print(f"Very long documents (>200 words): {very_long} ({very_long/num_documents*100:.2f}%)")

# Check for encoding issues: a document is flagged when it contains any of the
# character sequences below. Only the first 1000 documents are inspected, and
# the percentage uses that sample size as its denominator.
problematic_chars = ['â€™', 'â€"', 'â€¦', 'â€˜', 'â€œ', '�']
encoding_issues = sum(
    1
    for text in data["sentence"][:1000].to_list()
    if any(char in text for char in problematic_chars)
)

print(f"\nDocuments with encoding issues (sample): {encoding_issues} ({encoding_issues/min(1000, num_documents)*100:.2f}%)")
print(f"  Common problematic patterns: {', '.join(problematic_chars)}")

# Check for duplicates: number of distinct sentence texts that occur more than
# once (not the number of redundant rows).
duplicates = data.group_by("sentence").agg(pl.len().alias("count")).filter(pl.col("count") > 1)
print(f"\nDuplicate sentences: {duplicates.height}")

# Check for non-English text (simple heuristic)
def contains_non_ascii(text):
    """Return True if ``text`` has at least one character outside ASCII (code point >= 128).

    An empty string contains no such character and yields False.
    """
    return not text.isascii()

# Non-ASCII check on the first 1000 documents; each True from the helper
# contributes 1 to the total.
non_ascii_count = sum(map(contains_non_ascii, data["sentence"][:1000].to_list()))
print(f"\nDocuments with non-ASCII characters (sample): {non_ascii_count} ({non_ascii_count/min(1000, num_documents)*100:.2f}%)")

# Check for URL presence (all documents): a document counts when the pattern
# matches anywhere in it.
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
url_matcher = re.compile(url_pattern)
urls_count = len(
    [text for text in data["sentence"].to_list() if url_matcher.search(text)]
)
print(f"Documents containing URLs: {urls_count} ({urls_count/num_documents*100:.2f}%)")

# Special characters ratio
def special_char_ratio(text):
    """Return the fraction of characters in ``text`` that are "special".

    A character is special when it is neither an ASCII letter, an ASCII digit,
    nor whitespace. An empty string yields 0.
    """
    if len(text) == 0:
        return 0
    special_total = sum(1 for _ in re.finditer(r'[^a-zA-Z0-9\s]', text))
    return special_total / len(text)

# Special-character ratio for each of the first 1000 documents, then the number
# of documents whose ratio exceeds 0.2.
special_ratios = list(map(special_char_ratio, data["sentence"][:1000].to_list()))
high_special = len([ratio for ratio in special_ratios if ratio > 0.2])
print(f"\nDocuments with high special character ratio (>20%): {high_special} ({high_special/min(1000, num_documents)*100:.2f}%)")


Null/Empty documents: 0
Very short documents (<5 words): 0 (0.00%)
Very long documents (>200 words): 0 (0.00%)

Documents with encoding issues (sample): 0 (0.00%)
  Common problematic patterns: â€™, â€", â€¦, â€˜, â€œ, �

Duplicate sentences: 0

Documents with non-ASCII characters (sample): 435 (43.50%)
Documents containing URLs: 0 (0.00%)

Documents with high special character ratio (>20%): 0 (0.00%)
