## Part 1:

In [26]:
import re

# a)
# A CPR number is a six-digit date (DDMMYY) followed by a four-digit identifier (IIII).
# Accept either the DDMMYYIIII or the DDMMYY-IIII format.

# Capture groups, in order: DD (01-31), MM (01-12), YY, IIII; the hyphen
# between YY and IIII is optional.
# [0-9] is used rather than \d because, on str patterns, \d also matches
# non-ASCII Unicode decimal digits, which must not be accepted in a CPR number.
cpr_pattern = r'\b(0[1-9]|[12][0-9]|3[01])(0[1-9]|1[0-2])([0-9]{2})-?([0-9]{4})\b'

def extract_cpr(cpr_str: str):
    """Split a CPR number into its four components.

    The whole string must match (``re.fullmatch``), so leading/trailing
    whitespace or any surrounding text makes the input invalid. Only the
    ranges DD 01-31 and MM 01-12 are enforced; the day is not checked
    against the length of the given month.

    Args:
        cpr_str: Candidate CPR number, either ``DDMMYYIIII`` or ``DDMMYY-IIII``.

    Returns:
        A tuple of strings ``(DD, MM, YY, IIII)`` when the input is
        well-formed, otherwise ``None``.
    """
    match = re.fullmatch(cpr_pattern, cpr_str)
    return match.groups() if match else None

# Sample inputs: four well-formed CPR numbers (with and without the hyphen)
# and one string whose month digits ("34") are out of range.
test_cprs = [
    "0102031234",
    "010203-1234",
    "3112995678",
    "311299-5678",
    "1234567890",
]

for cpr in test_cprs:
    result = extract_cpr(cpr)
    # Report rejected inputs first, then fall through to the valid case.
    if not result:
        print(f"Invalid CPR: {cpr}")
        continue
    print(f"Valid CPR: {cpr} -> DD: {result[0]}, MM: {result[1]}, YY: {result[2]}, IIII: {result[3]}")

Valid CPR: 0102031234 -> DD: 01, MM: 02, YY: 03, IIII: 1234
Valid CPR: 010203-1234 -> DD: 01, MM: 02, YY: 03, IIII: 1234
Valid CPR: 3112995678 -> DD: 31, MM: 12, YY: 99, IIII: 5678
Valid CPR: 311299-5678 -> DD: 31, MM: 12, YY: 99, IIII: 5678
Invalid CPR: 1234567890


In [27]:
# b)
# Function to return the relevant century
def centuarty(yy: int, iiii: int) -> int:
    """Return the century of a CPR birth year from its YY and IIII parts.

    Mapping implemented here:

    * IIII 1-3999                    -> 1900
    * IIII 4000-4999 or 9000-9999    -> 2000 if YY <= 36, else 1900
    * IIII 5000-8999                 -> 2000 if YY <= 57, else 1800

    Args:
        yy: Two-digit year of birth, 0-99.
        iiii: Four-digit identifier as an integer, 1-9999.

    Returns:
        The century as 1800, 1900 or 2000; add ``yy`` to get the full year.

    Raises:
        ValueError: If ``yy`` is outside 0-99 or ``iiii`` is outside 1-9999.
    """
    # Reject impossible two-digit years up front; otherwise a value such as
    # 150 or -5 would silently fall into the "else" arm of a range below.
    if not 0 <= yy <= 99:
        raise ValueError("Invalid CPR year (YY)")

    if 1 <= iiii <= 3999:
        return 1900
    # The 4000-4999 and 9000-9999 ranges share the same YY cut-off.
    if 4000 <= iiii <= 4999 or 9000 <= iiii <= 9999:
        return 2000 if yy <= 36 else 1900
    if 5000 <= iiii <= 8999:
        return 2000 if yy <= 57 else 1800
    raise ValueError("Invalid CPR identifier (IIII)")
    
# Test cases: (YY, IIII) pairs covering each identifier range;
# the trailing comment on each pair is the expected century.
test_cases = [
    # identifier below 4000
    (99, 1234),  # 1900
    # identifier in 4000-4999
    (15, 4500),  # 2000
    (50, 4500),  # 1900
    # identifier in 5000-8999
    (30, 6000),  # 2000
    (70, 6000),  # 1800
    # identifier in 9000-9999
    (20, 9500),  # 2000
    (80, 9500),  # 1900
]

# Print the computed century for every pair so it can be compared by eye
# with the expected value noted above.
for yy, iiii in test_cases:
    print(f"YY: {yy}, IIII: {iiii} -> Century: {centuarty(yy, iiii)}")

YY: 99, IIII: 1234 -> Century: 1900
YY: 15, IIII: 4500 -> Century: 2000
YY: 50, IIII: 4500 -> Century: 1900
YY: 30, IIII: 6000 -> Century: 2000
YY: 70, IIII: 6000 -> Century: 1800
YY: 20, IIII: 9500 -> Century: 2000
YY: 80, IIII: 9500 -> Century: 1900


## Part 2:

In [28]:
import pandas as pd

# 1-2)
# Read the news sample CSV into a DataFrame. The relative path is resolved
# against the kernel's current working directory.
# NOTE(review): the printed head shows an "Unnamed: 0" column, which suggests
# the file's first column is a saved index -- confirm whether index_col=0 is wanted.
df = pd.read_csv("news_sample.csv")


In [29]:
# 3)
# 3. Inspect the DataFrame: first rows, column dtypes / non-null counts,
#    and summary statistics for every column.
print("First 5 rows:")
print(df.head(), "\n")

print("DataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
df.info()
print()

print("Summary Statistics:")
print(df.describe(include='all'), "\n")

# 4. Look at sample rows of a text column to see formatting issues (e.g., content)
print("Sample content in row 0:")
print(df.loc[0, 'content'], "\n")

First 5 rows:
   Unnamed: 0   id                domain        type  \
0           0  141               awm.com  unreliable   
1           1  256     beforeitsnews.com        fake   
2           2  700           cnnnext.com  unreliable   
3           3  768               awm.com  unreliable   
4           4  791  bipartisanreport.com   clickbait   

                                                 url  \
0  http://awm.com/church-congregation-brings-gift...   
1  http://beforeitsnews.com/awakening-start-here/...   
2  http://www.cnnnext.com/video/18526/never-hike-...   
3  http://awm.com/elusive-alien-of-the-sea-caught...   
4  http://bipartisanreport.com/2018/01/21/trumps-...   

                                             content  \
0  Sometimes the power of Christmas will make you...   
1  AWAKENING OF 12 STRANDS of DNA – “Reconnecting...   
2  Never Hike Alone: A Friday the 13th Fan Film U...   
3  When a rare shark was caught, scientists were ...   
4  Donald Trump has the unnervin