In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

#cinfiguration 
num_rows = 50000
filename = " optum_eligibility_practice.csv"

data= {
    'Member_id':[f'M{100000 + i}'for i in range(num_rows)],
    'Subscriber_id':[f'S{100000 + random.randint(0,num_rows//2)}' for _ in range(num_rows)],
    'First_name': [random.choice(['John','Jane','Michael','Sarah','Robert','Emiliy']) for _ in range(num_rows)],
    'Last_name':[random.choice(['Smith','Johnson','Williams','Brown','Jones','Garcia'])for _ in range(num_rows)],
    'DOB':[datetime(1950,1,1)+timedelta(days=random.randint(0,25000)) for _ in  range(num_rows)],
    'Gender': [random.choice(['M','F','U']) for _ in range(num_rows)],
    'Plan_code':[random.choice(['PPO_001', 'HMO_CORE', 'HDHP_SILVER', 'MEDICARE_ADV', None]) for _ in range(num_rows)],
    'Effective_date':[datetime(2023,1,1) + timedelta(days=random.randint(0,365)) for _ in range(num_rows)],
    'Term_Date': [datetime(2024, 1, 1) + timedelta(days=random.randint(0, 730)) if random.random() > 0.7 else None for _ in range(num_rows)],  
    'Client_ID':[random.choice(['OPT_AMZ', 'OPT_WLM', 'OPT_GGL', 'OPT_UHG']) for _ in range(num_rows)],
    'Monthly_Premium': [round(random.uniform(150.0,850.0),2) for _ in range(num_rows)]
}

df = pd.DataFrame(data)
# introduce some messy data  (duplicates and nulls for practice)
df.loc[df.sample(frac = 0.05).index,'Member_id'] = df['Member_id'].iloc[0] #Create duplicates
df.to_csv('optum_eligibility_practice.csv',index = False)
print(filename)

print(f" file '{filename}' created successfully. Size {df.memory_usage(deep = True).sum()/1024**2:.2f} MB")

 optum_eligibility_practice.csv
 file ' optum_eligibility_practice.csv' created successfully. Size 19.60 MB


In [3]:
import sqlite3
#1. load the csv data
df = pd.read_csv('optum_eligibility_practice.csv')
#2. connect to or sqk=lite database 
conn = sqlite3.connect('optum_Practice.db')
cursor = conn.cursor()
#3.Import the dataa name into a table named 'Eligibility'
df.to_sql('Eligibility',conn, if_exists = 'replace',index = False)

#4.create an INDEx (important for performance on 4mb files)
cursor.execute('CREATE INDEX index_member_id on Eligibility (Member_ID)')
print("Database 'Optum_Practice.db' created successfully!")
conn.close()

Database 'Optum_Practice.db' created successfully!


In [4]:
conn = sqlite3.connect('optum_Practice.db')
cursor = conn.cursor()
cursor.execute('''SELECT COUNT(*) AS Total_Rows, 
COUNT(DISTINCT Member_id) as Unique_Member_Count FROM Eligibility;''')
result = cursor.fetchone()
total_rows = result[0]
unique_members = result[1]
print(total_rows)
print("Unique Member:", unique_members)
conn.commit()


50000
Unique Member: 47500


In [5]:
conn = sqlite3.connect('optum_Practice.db')
cursor = conn.cursor()
#cursor.execute('''CREATE TABLE clients(
               #client_id INTEGER PRIMARY KEY AUTOINCREMENT,
               #client_name TEXT NOT NULL,
               #Contract_type  TEXT,
               #Account_Manager TEXT,
               #Region TEXT)''')

cursor.execute('''INSERT INTO Clients(client_name,contract_type,Account_Manager,Region)
               VALUES('Amazon', 'Full-Risk', 'Sarah Jenkins','MN'),
                     ('Walmart', 'ASO', 'Mike Ross','TX'),
                     ('Google', 'Shared-Savings', 'Elena Gilbert','NY')
''');     
conn.commit()
conn.close()      

    

In [6]:
conn = sqlite3.connect('optum_Practice.db')
cursor = conn.cursor()
cursor.execute("""SELECT c.client_name,
                c.Account_Manager, 
               COUNT (e.Member_ID) as Total_enrolled
               FROM eligibility e
               JOIN Clients c ON e.client_id = c.client_id
               GROUP BY c.client_name """)

               

<sqlite3.Cursor at 0x20aab4b64c0>

In [7]:
conn = sqlite3.connect('optum_Practice.db')
cursor = conn.cursor()
data = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';",conn)
print(data)

conn.commit()


              name
0          clients
1  sqlite_sequence
2      Eligibility


In [8]:
# CLEANING DATA
conn = sqlite3.connect('optum_Practice.db')
cursor = conn.cursor() 
data = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type ='table';" ,conn)

print(df)
leaks = df[(df['Plan_code'].isnull()) & (df['Monthly_Premium'] > 0)]
print(f" 1. revenue leak found: {len(leaks)} cases.")

#2. Check for data logic error 
data_errors = df[df['Term_Date'] <= df['Effective_date']]
print(f" 2. Data equence errors: {len(data_errors)}")

# standarization 
df['Gender'] = df['Gender'].replace({'M': 'Male','F': 'Female','U': 'Unknown'})



      Member_id Subscriber_id First_name Last_name         DOB Gender  \
0       M100000       S116186       John  Williams  1954-10-13      F   
1       M100001       S115134    Michael   Johnson  1965-09-13      F   
2       M100002       S112689       Jane     Smith  2000-09-18      F   
3       M100003       S108036      Sarah    Garcia  1993-01-04      U   
4       M100000       S102706       John     Smith  1951-11-01      U   
...         ...           ...        ...       ...         ...    ...   
49995   M149995       S100785       Jane  Williams  1966-03-29      M   
49996   M149996       S113697     Robert     Smith  1983-09-16      M   
49997   M149997       S100378      Sarah     Smith  2011-01-09      M   
49998   M149998       S122195     Robert     Smith  2005-12-01      M   
49999   M100000       S112283     Emiliy   Johnson  1960-06-05      U   

      Plan_code Effective_date   Term_Date Client_ID  Monthly_Premium  
0           NaN     2023-02-01  2025-10-10   OPT_UH

In [24]:
cursor.execute(''' UPDATE eligibility
               SET Plan_code = 'BASIC_HMO'
               WHERE Plan_code IS NULL AND Monthly_Premium > 0
               ''')
cursor.execute(''' UPDATE eligibility
               SET Term_Date = date(Effective_date, '+1 year')
               WHERE Term_Date <= Effective_date ''')
cursor.execute('''UPDATE eligibility 
               SET Gender = 'Unknown' WHERE Gender = 'U'
               ''')
cursor.execute(" SELECT Member_id, COUNT(*) FROM eligibility GROUP BY Member_id")
results = cursor.fetchall()
for Member_id , count in results:
    print(f"{Member_id}: {count}") 
conn.commit()

M100000: 2501
M100001: 1
M100002: 1
M100003: 1
M100005: 1
M100006: 1
M100007: 1
M100008: 1
M100009: 1
M100010: 1
M100012: 1
M100013: 1
M100015: 1
M100016: 1
M100017: 1
M100018: 1
M100019: 1
M100020: 1
M100021: 1
M100022: 1
M100023: 1
M100024: 1
M100025: 1
M100026: 1
M100027: 1
M100028: 1
M100029: 1
M100030: 1
M100031: 1
M100032: 1
M100033: 1
M100034: 1
M100035: 1
M100036: 1
M100037: 1
M100038: 1
M100039: 1
M100040: 1
M100041: 1
M100042: 1
M100043: 1
M100044: 1
M100045: 1
M100046: 1
M100047: 1
M100048: 1
M100049: 1
M100050: 1
M100053: 1
M100055: 1
M100056: 1
M100057: 1
M100058: 1
M100059: 1
M100060: 1
M100061: 1
M100062: 1
M100063: 1
M100064: 1
M100065: 1
M100066: 1
M100067: 1
M100068: 1
M100069: 1
M100070: 1
M100071: 1
M100072: 1
M100074: 1
M100075: 1
M100076: 1
M100077: 1
M100078: 1
M100079: 1
M100080: 1
M100082: 1
M100083: 1
M100084: 1
M100085: 1
M100086: 1
M100087: 1
M100088: 1
M100089: 1
M100090: 1
M100091: 1
M100092: 1
M100093: 1
M100094: 1
M100095: 1
M100096: 1
M100097: 1
M100098

In [None]:
data = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type ='table';", conn)
print(pd)
print(data)
print(df)
# Verification Check
check_df = pd.read_sql_query("SELECT Gender, COUNT(*) FROM eligibility GROUP BY Gender", conn)
check_termdate = check_df = pd.read_sql_query("SELECT Term_date , COUNT(*) FROM eligibility GROUP BY Term_date", conn)



print(check_df)


<module 'pandas' from 'c:\\Users\\Lenovo\\AppData\\Local\\Programs\\Python\\Python313\\Lib\\site-packages\\pandas\\__init__.py'>
              name
0          clients
1  sqlite_sequence
2      Eligibility
      Member_id Subscriber_id First_name Last_name         DOB   Gender  \
0       M100000       S116186       John  Williams  1954-10-13   Female   
1       M100001       S115134    Michael   Johnson  1965-09-13   Female   
2       M100002       S112689       Jane     Smith  2000-09-18   Female   
3       M100003       S108036      Sarah    Garcia  1993-01-04  Unknown   
4       M100000       S102706       John     Smith  1951-11-01  Unknown   
...         ...           ...        ...       ...         ...      ...   
49995   M149995       S100785       Jane  Williams  1966-03-29     Male   
49996   M149996       S113697     Robert     Smith  1983-09-16     Male   
49997   M149997       S100378      Sarah     Smith  2011-01-09     Male   
49998   M149998       S122195     Robert     