# **Project 17**: Post-Event Damage Assessment Using News Article Analysis 

## Group Members:
- Walid Nouicer (wnouicer24@student.oulu.fi)
- Uswah Batool (uswah.batool@student.oulu.fi)
- Piero Campos Villagaray (pcamposv25@student.oulu.fi)

________________

### Task 1: Event Selection and Data Collection

In [None]:
from task_1 import get_urls, get_articles, articles2csv, append_article_to_csv

# get_urls() can be used to extract urls from news sources
# if no news sources are passed in function parameters, the function uses a default list
# this process can take a very long time, for the sake of the demo we will use a preset example
# the natural disaster we will be using as an example is Hurricane Melissa, which is an active
# Atlantic hurricane currently accelerating northeastward away from the Bahamas and toward Bermuda.
# Preset demo input: two articles covering Hurricane Melissa.
example_urls = [
    "https://www.aljazeera.com/gallery/2025/10/30/hurricane-melissa-leaves-trail-of-destruction-across-northern-caribbean",
    "https://apnews.com/article/hurricane-melissa-jamaica-cuba-landslide-rain-flood-d7d120b8443b1630d12c77e0a3fe25b0",
]

# get_articles() filters and scrapes article data from the URLs based on the
# query words passed into the function.
# Setting live_save=True saves articles to a CSV file while the function is
# running (the default is False); the default CSV path is "data/articles.csv".
qw = ["hurricane", "melissa"]
example_articles = get_articles(example_urls, qw, limit=200, live_save=False)

# Show a short preview of every scraped article: the first four fields are
# printed as title, date, source and text, with the text cut to 200 characters.
for article in example_articles:
    title, date, source, text = article[0], article[1], article[2], article[3]
    preview = f"\nTitle: {title}\ndate: {date}\nSource: {source}\nText: {text[:200]}...\n"
    print(preview)
    print("___________________________________________________________")

### Task 2: Data Cleaning and Preprocessing

In [None]:
from task_2 import load_file, clean_text

# load_file() automatically loads data from "data/articles.csv", a different path can be passed in function params
# data = load_file()

# clean_text() performs lemmatization, removes stopwords, punctuation, and symbols, and converts text to lowercase
# cleaned_data = clean_text(data)

# articles2csv() can be used to save the processed data
# articles2csv(
#     articles=cleaned_data,
#     path="data/cleaned_data.csv",
#     fields=["title", "date", "source", "article_text", "clean_text"],
# )

# The clean_text column is saved as one long string for convenience.
# Load the rows first instead of calling load_file() inside the f-string:
# reusing double quotes inside a replacement field is a SyntaxError on
# Python versions older than 3.12.
cleaned_rows = load_file(path="data/cleaned_data.csv")

# Row 1, column 4 is the "clean_text" field of the first article, going by the
# field order used when saving (title, date, source, article_text, clean_text).
# NOTE(review): row 0 is presumably the CSV header -- confirm against load_file().
print(f"{cleaned_rows[1][4][:100]}...")

### Task 3: Language Filtering and Quality Check

In [None]:
import pandas as pd
from task_3 import filter_english_articles_with_descriptive_stats

# filter_english_articles_with_descriptive_stats() filters out non-English and
# duplicate articles, then prints a quality report.

data = pd.read_csv(filepath_or_buffer="data/cleaned_data.csv")
filter_english_articles_with_descriptive_stats(data)

### Task 4: Keyword Extraction and Frequency Analysis

In [None]:
from task_4 import plot_and_save, keyword_analysis

# keyword_analysis() extracts the top keywords using both CountVectorizer and
# TF-IDF, and visualizes the results as bar charts and word clouds.
# NOTE: this cell relies on `pd` having been imported by the Task 3 cell.
data = pd.read_csv(filepath_or_buffer="data/filtered_articles.csv")
keyword_analysis(data)

### Task 5: Corpus Statistical Analysis and Zipf’s Law

In [None]:
from task_5 import zipf_analysis

# zipf_analysis() performs a Zipf's Law analysis and visualizes the
# rank-frequency relationship.
# NOTE: this cell relies on `pd` having been imported by the Task 3 cell.
data = pd.read_csv(filepath_or_buffer="data/filtered_articles.csv")
zipf_analysis(data)

### Task 6: Lexical and Readability Analysis

In [None]:
from task_6 import analyze_article_metrics

# Load the raw (uncleaned) scraped articles.
df = pd.read_csv("data/articles.csv")
print(f"Total articles loaded: {len(df)}")

# analyze_article_metrics() calculates lexical metrics for one article row and
# returns the results; rows for which it returns None are skipped.
row_metrics = (analyze_article_metrics(row) for _, row in df.iterrows())
lexical_results = [metrics for metrics in row_metrics if metrics is not None]

lexical_df = pd.DataFrame(lexical_results)

if lexical_df.empty:
    # Without any results the frame has no columns: describe() would raise a
    # ValueError and the 'sentence_count' lookups below a KeyError.
    print("\nNo lexical metrics were produced; skipping summary and CSV export.")
else:
    print("\n" + "=" * 70)
    print("LEXICAL METRICS SUMMARY")
    print("=" * 70)
    print(lexical_df.describe())  # summary table

    sentence_counts = lexical_df["sentence_count"]
    print("\nSENTENCE COUNT:")
    print(f"  Min sentences: {sentence_counts.min()}")
    print(f"  Max sentences: {sentence_counts.max()}")
    print(f"  Mean sentences: {sentence_counts.mean():.1f}")

    lexical_df.to_csv("data/Lexical_Analysis.csv", index=False)
    print("\n Results saved to 'Lexical_Analysis.csv'")

### Task 7: Sentiment and Emotion Detection

### Task 8: Named Entity and Quantitative Impact Analysis

In [None]:
from task_8 import damage_summary, plot_entity_frequency, aggregate_text

# damage_summary() calculates the number of instances in a dataset where an entity
# is labeled as MONEY or CARDINAL. For the latter case, it filters out
# sentences describing distance or speed measurements.

# load_file() comes from the Task 2 cell's import and is called here with its
# default path; the first row is dropped before aggregation.
# NOTE(review): the dropped row is presumably the CSV header -- confirm.
article_rows = load_file()
aggr_text = aggregate_text(data=article_rows[1:])

damage_summary(aggr_text)
plot_entity_frequency(aggr_text)

### Task 9: Event Impact Scoring Model

In [None]:
from task_9 import impact2csv

# load_file() comes from the Task 2 cell's import.
data = load_file("data/cleaned_data.csv")

# impact2csv() takes CSV data and computes sentiment polarity, emotion
# intensity, damage-related keyword frequency, and an impact score. It then
# saves the result to a CSV file and returns the generated data with the
# articles sorted by impact score.
scored_articles = impact2csv(in_data=data)

# The first row is skipped before printing.
# NOTE(review): presumably a header row -- confirm against impact2csv().
for title, source, polarity, intensity, damage, impact in scored_articles[1:]:
    print("Article title: ", title)
    print("Source: ", source)
    print("Sentiment polarity = ", polarity)
    print("Emotion intensity = ", intensity)
    print("Damage-related keywords frequency = ", damage)
    print("Impact score = ", impact)
    print("________________________________________________________________________________________________\n")

### Task 10: Statistical Summary and Visualization Dashboard