In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the news-events extract into the working frame `df`.
# NOTE: this is an absolute, machine-specific Windows path; the notebook
# only runs unchanged on a machine that has the file at this location.
input_path = "C:\\Users\\biswa\\Downloads\\Datasets-2025-08-08\\news_events_cleaned.csv"
df = pd.read_csv(filepath_or_buffer=input_path)

# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,event_id,event_type,summary,category,found_at,confidence,article_sentence,human_approved,planning,amount,...,company_id,company_name,company_domain,company_ticker,article_id,article_title,article_url,article_published_at,article_body,article_author
0,0020f127-3470-4cce-8989-1c79f45da217,news_event,Unipart Manufacturing Group recognized as Tran...,recognized_as,2022-07-10 20:00:00,0.8759,In addition to being named the safest organisa...,False,False,,...,000bd323-1bf8-5c7a-9941-e6c155c29d10,Unipart Manufacturing Group,unipart.com,,d172abc1-3755-4cef-946e-7de944806e7d,Unipart ranked the safest organisation in the ...,https://www.unipart.com/unipart-ranked-the-saf...,2022-07-10 20:00:00,Unipart has been further recognised for its un...,
1,009be1ff-6cfb-4e9f-a415-69baf71f47f3,news_event,OOS International received award two safety aw...,receives_award,2019-12-19 10:45:17,0.9497,Since then OOS International has been an activ...,False,False,,...,000ff896-4292-5b15-9c81-8bf4d76c10d7,OOS International,oosinternational.com,,58c0d5fd-068d-4bab-8ac4-47e19bbdf091,OOS International Award for completing another...,https://www.oosinternational.com/oos-news/awar...,2019-12-19 10:45:17,"On behalf of CIM OOS Tiradentes, Mr Jose Marci...",admin
2,01444124-7375-4f03-8879-eb8200b31504,news_event,NWN Corporation received award Global Winner f...,receives_award,2022-07-12 20:00:00,0.6887,"As a result, with nearly 400 nominees from ove...",False,False,,...,000d8a9c-882c-57f2-8b4c-2afc786d0fa1,NWN Corporation,nwncarousel.com,,ef330a38-8624-41c1-8b75-d1b96e7dbd45,Skyrocketing Employee Engagement with Microsof...,https://nwncarousel.com/blog/skyrocketing-empl...,2022-07-12 22:00:00,More than 270 million people rely on Teams for...,
3,031a304c-29ca-415e-a815-e9c915896540,news_event,Grape Solutions Plc. is developing Mobiliti ap...,is_developing,2023-04-02 22:00:00,0.5987,MVM Mobiliti and Grape Solutions have been wor...,False,False,,...,0008b75f-9d15-54ae-b70a-52301945e397,Grape Solutions Plc.,grape.solutions,,0525807d-6ff6-44a0-9c36-8be3afceba5b,Hungarian development makes electric vehicle c...,https://www.iqfin.net/technology/hungarian-dev...,2023-04-02 22:00:00,Grape Solutions and MVM Mobiliti partner with ...,
4,037783ca-f3f7-4782-8a81-df3cae1ac936,news_event,"NWN Corporation launched two new kits, At-Home...",launches,2022-04-13 01:02:36,0.718,"NWN Carousel, the leading integrated cloud com...",False,False,,...,000d8a9c-882c-57f2-8b4c-2afc786d0fa1,NWN Corporation,nwncarousel.com,,16061c55-111d-496a-9e3e-837dddc3454b,NWN Carousel Launches Two Hybrid Work Solution...,https://www.businesswire.com/news/home/2022041...,2022-04-13 03:00:00,"NWN Carousel, the leading integrated cloud com...",


**1. Issue Detection & Remediation:**

- Write code/scripts (e.g., Python, SQL, or both) to detect and fix key data quality issues identified during your analysis (e.g., remove duplicates, correct invalid values, standardize formats).

### Data Cleaning 

In [3]:
# Duplicate handling: flag every row that is an exact copy (all columns
# compared) of a row that appeared earlier in the frame.
duplicate_rows = df.duplicated(keep='first')
n_duplicates = duplicate_rows.sum()
print(f"Number of duplicate rows: {n_duplicates}")

# Keep the first occurrence of each duplicated row and discard the rest.
df = df.drop_duplicates(keep='first')
shape_after = df.shape
print(f"Data shape after removing duplicates: {shape_after}")


Number of duplicate rows: 0
Data shape after removing duplicates: (612910, 24)


In [4]:
# Handling missing values, step 1: remove columns that are mostly empty.
# A column is dropped when its share of null cells is strictly above
# `threshold` (0.8, i.e. more than 80% missing).
threshold = 0.8
missing_ratio = df.isna().mean()  # per-column fraction of null cells
cols_to_drop = missing_ratio.loc[missing_ratio.gt(threshold)].index
df = df.drop(columns=cols_to_drop)
print(f"Dropped columns: {cols_to_drop.tolist()}")


Dropped columns: ['amount', 'amount_normalized', 'headcount', 'article_author']


In [5]:
# Per-column count of null cells that remain after the sparse columns were dropped.
df.isna().sum()

event_id                     0
event_type                   0
summary                      0
category                     0
found_at                     0
confidence                   8
article_sentence             0
human_approved               0
planning                     0
effective_date          369050
location                444747
company_id               14768
company_name             14875
company_domain           14768
company_ticker          479215
article_id                   0
article_title               22
article_url                  0
article_published_at         0
article_body                 0
dtype: int64

In [9]:
# Missing-value treatment for `location`: replace nulls with the
# placeholder 'Unknown' so the column has no missing cells.
df = df.fillna({'location': 'Unknown'})



In [10]:
# Company fields:
#   1. Drop rows where company_id is missing.
#   2. For the remaining rows, fill a missing company_name / company_domain
#      with a placeholder value.
#
# Both steps are done in one chain that returns a new frame. The previous
# form assigned columns onto the frame returned by dropna(), which is the
# "setting on a copy" pattern and can raise SettingWithCopyWarning on pandas
# versions without copy-on-write. The resulting values are the same.
#
# NOTE(review): 'unknown.com' looks like a real domain; confirm downstream
# consumers treat it as a missing-value marker.
df = (
    df.dropna(subset=['company_id'])
      .fillna({
          'company_name': 'Unknown Company',
          'company_domain': 'unknown.com',
      })
)


In [11]:
# Replace a missing ticker symbol with the placeholder 'Unknown'.
ticker = df['company_ticker']
df['company_ticker'] = ticker.where(ticker.notna(), 'Unknown')


In [12]:
# Give articles that have no title the placeholder 'Untitled Article'.
title_missing = df['article_title'].isna()
df.loc[title_missing, 'article_title'] = 'Untitled Article'


In [14]:
# Impute missing confidence scores with the median of the non-missing
# values in the same column.
median_confidence = df['confidence'].median()
df['confidence'] = df['confidence'].fillna(median_confidence)



In [7]:
# Date format standardization: parse the timestamp columns into pandas
# datetimes. Columns that are absent from the frame are skipped.
date_cols = ['found_at', 'effective_date', 'article_published_at']
for col in date_cols:
    if col in df.columns:
        parsed = pd.to_datetime(df[col], errors='coerce')
        # errors='coerce' turns unparseable values into NaT without raising.
        # Count the values that were present before parsing but are NaT
        # afterwards, so bad input is reported instead of silently lost.
        n_unparseable = int((parsed.isna() & df[col].notna()).sum())
        if n_unparseable:
            print(f"{col}: {n_unparseable} values could not be parsed and were set to NaT")
        df[col] = parsed

# Optional: where effective_date is missing, fall back to found_at.
# Guard on both columns so the fallback is skipped (rather than raising
# KeyError) when found_at is not present, matching the loop's guard above.
if 'effective_date' in df.columns and 'found_at' in df.columns:
    df['effective_date'] = df['effective_date'].fillna(df['found_at'])


In [8]:
# Normalize category labels: trim surrounding whitespace, then lower-case.
normalized_category = df['category'].str.strip().str.lower()
df['category'] = normalized_category

# Whitelist of accepted category labels; rows with any other label are removed.
valid_categories = [
    'recognized_as',
    'receives_award',
    'is_developing',
    'launches',
    'receives_financing',
    'hires',
    'partners_with',
    'attends_event',
    'invests_into_assets',
    'promotes',
    'expands_offices_to',
    'acquires',
    'sells_assets_to',
    'leaves',
    'signs_new_client',
    'closes_offices_in',
    'files_suit_against',
    'invests_into',
    'expands_facilities',
    'retires_from',
    'opens_new_location',
    'merges_with',
    'has_issues_with',
    'integrates_with',
    'identified_as_competitor_of',
    'goes_public',
    'increases_headcount_by',
    'decreases_headcount_by',
    'expands_offices_in',
]

# Keep only the rows whose normalized category is in the whitelist.
has_valid_category = df['category'].isin(valid_categories)
df = df.loc[has_valid_category]


In [16]:
# Final check: per-column null counts after all cleaning steps.
df.isna().sum()

event_id                0
event_type              0
summary                 0
category                0
found_at                0
confidence              0
article_sentence        0
human_approved          0
planning                0
effective_date          0
location                0
company_id              0
company_name            0
company_domain          0
company_ticker          0
article_id              0
article_title           0
article_url             0
article_published_at    0
article_body            0
dtype: int64

In [17]:
# Write the cleaned frame to the final CSV, without the index column.
# NOTE: the destination is an absolute, machine-specific path.
output_path = "C:/Users/biswa/Downloads/Datasets-2025-08-08/news_events_final.csv"
df.to_csv(path_or_buf=output_path, index=False)

# Report how many rows were written and where.
record_count = len(df)
print(f"Saved {record_count} records into {output_path}")

Saved 598142 records into C:/Users/biswa/Downloads/Datasets-2025-08-08/news_events_final.csv
