In [None]:
import re
# Placeholder returned for any text field that cannot be found.
_NOT_AVAILABLE = "Not available"

# A count that may carry thousands separators, e.g. "87" or "2,345".
_COUNT = r"\d+(?:,\d{3})*"

# Durations such as "7 yrs 6 mos", "1 yr" or a months-only "8 mos".  The
# months-only branch rejects a following lowercase letter so ordinary words
# ("10 more", "5 most") are not counted as months.
_DURATION_RE = re.compile(
    r"(\d+)\s+yrs?(?:\s+(\d+)\s+mos?)?|(\d+)\s+mos?(?![a-z])"
)

# Compact post-age stamps such as "3h", "1d", "2w", "4mo", "1yr".
_POST_AGE_RE = re.compile(r"(\d+)(mo|yr|w|d|h)(?![a-z])")
_POST_AGE_UNITS = {
    "h": "hour",
    "d": "day",
    "w": "week",
    "mo": "month",
    "yr": "year",
}


def _first_count(pattern, text):
    """Return group 1 of the first match of *pattern*, without commas."""
    found = re.search(pattern, text)
    if not found:
        return _NOT_AVAILABLE
    return found.group(1).replace(",", "")


def _total_experience(section):
    """Return ``(years, months)`` summed over the durations in *section*."""
    total_months = 0
    previous = None
    for found in _DURATION_RE.finditer(section):
        years, months_after_years, months_only = found.groups()
        duration = (
            int(years or 0),
            int(months_after_years or months_only or 0),
        )
        # Only collapse *adjacent* repeats.  A set() would also merge two
        # unrelated roles that merely lasted equally long.
        # NOTE(review): assumes duplicated durations in the input are
        # back-to-back copies of the same entry -- confirm against the files.
        if duration == previous:
            continue
        previous = duration
        total_months += duration[0] * 12 + duration[1]
    return divmod(total_months, 12)


def _recent_post_age(text):
    """Return the first post-age stamp in *text* as e.g. ``"1 day"``."""
    found = _POST_AGE_RE.search(text)
    if not found:
        return _NOT_AVAILABLE
    number, unit = found.groups()
    label = _POST_AGE_UNITS[unit]
    if int(number) != 1:
        label += "s"
    return f"{number} {label}"


def process_file(file_path):
    """Extract summary fields from a profile saved as plain text.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A dict with these keys:

        * ``"Username"`` -- text preceding "2nd degree"/"3rd degree".
        * ``"Education"`` -- first of "PhD", "MS", "BS" found in the
          Education section, or the string ``"None"``.
        * ``"Experience (Years)"`` / ``"Experience (Months)"`` -- ints, the
          summed durations of the Experience section (months < 12).
        * ``"Connections"`` -- digits as a string, optionally ending in "+".
        * ``"Recent Post"`` -- e.g. ``"2 weeks"``.
        * ``"Reactions on Recent Post"``, ``"Comments on Recent Post"``,
          ``"Repost Count on Recent Post"`` -- digit strings.

        Text fields that cannot be found are ``"Not available"``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    # Step 1: Extract username
    username_match = re.search(r"([^\n]+)\s+(?:2nd degree|3rd degree)", data)
    username = username_match.group(1).strip() if username_match else _NOT_AVAILABLE

    # Step 2: Extract number of connections ("24", "1,024" or "500+")
    connections = _first_count(r"(" + _COUNT + r"\+?)\s+connections", data)

    # Step 3: Extract experience and calculate total years and months
    experience_match = re.search(r"Experience(.*?)Education", data, re.DOTALL)
    experience_section = experience_match.group(1).strip() if experience_match else ""
    total_years, total_months = _total_experience(experience_section)

    # Step 4: Extract education (up to "Contact info" or the end of the text)
    education_match = re.search(r"Education(.*?)(Contact info|$)", data, re.DOTALL)
    education = education_match.group(1).strip() if education_match else _NOT_AVAILABLE

    # Step 5: Extract the highest-priority degree (list is ordered high -> low)
    priorities = ["PhD", "MS", "BS"]
    highest_degree = next(
        (degree for degree in priorities if degree in education), "None"
    )

    # Step 6: Extract recent post timing
    recent_post = _recent_post_age(data)

    # Step 7: Extract reactions, comments, and repost counts
    reactions = _first_count(r"like\w*\s*(" + _COUNT + r")", data)
    comments = _first_count(r"(" + _COUNT + r")\s*comments", data)
    reposts = _first_count(r"(" + _COUNT + r")\s*reposts", data)

    # Compile all results into a dictionary
    return {
        "Username": username,
        "Education": highest_degree,
        "Experience (Years)": total_years,
        "Experience (Months)": total_months,
        "Connections": connections,
        "Recent Post": recent_post,
        "Reactions on Recent Post": reactions,
        "Comments on Recent Post": comments,
        "Repost Count on Recent Post": reposts,
    }


In [None]:
# Parse the saved profile text and print each extracted field on its own line.
file_path2 = '/content/andrew ng.txt'
output2 = process_file(file_path2)

for key in output2:
    value = output2[key]
    print(f"{key}: {value}")

Username: Andrew Ng
Education: PhD
Experience (Years): 56
Experience (Months): 9
Connections: 24
Recent Post: 1 days
Reactions on Recent Post: 2
Comments on Recent Post: 87
Repost Count on Recent Post: 130
