In [1]:
import pandas as pd

# Load datasets
file_paths = {
    "Global_PCOS_Demographics": "Global_PCOS_Demographics_Cleaned.csv",
    "PCOS_Medication_Reactions": "PCOS_Medication_Reactions_Cleaned.csv",
    "Lifestyle_Diet_Dataset": "Lifestyle_Diet_Dataset_Cleaned.csv",
    "Core_PCOS_Dataset": "Core_PCOS_Dataset_Cleaned.csv",
}

# Read datasets into pandas dataframes
datasets = {name: pd.read_csv(path) for name, path in file_paths.items()}

# Get basic info and first few rows of each dataset
dataset_summary = {}
for name, df in datasets.items():
    dataset_summary[name] = {
        "Shape": df.shape,
        "Columns": df.columns.tolist(),
        "Missing Values": df.isnull().sum().sum(),
        "First Rows": df.head()
    }

# Display the dataset summary
dataset_summary


{'Global_PCOS_Demographics': {'Shape': (84118, 17),
  'Columns': ['Country',
   'Age',
   'BMI',
   'Menstrual_Regularity',
   'Hirsutism',
   'Acne_Severity',
   'Family_History_of_PCOS',
   'Insulin_Resistance',
   'Lifestyle_Score',
   'Stress_Levels',
   'Urban_Rural',
   'Socioeconomic_Status',
   'Awareness_of_PCOS',
   'Fertility_Concerns',
   'Undiagnosed_PCOS_Likelihood',
   'Ethnicity',
   'Diagnosis'],
  'Missing Values': 41992,
  'First Rows':       Country  Age          BMI Menstrual_Regularity Hirsutism Acne_Severity  \
  0  Madagascar   26   overweight              regular       yes        severe   
  1     Vietnam   16  underweight              regular       yes           NaN   
  2     Somalia   41       normal              regular        no      moderate   
  3      Malawi   27       normal            irregular        no          mild   
  4      France   26   overweight            irregular       yes           NaN   
  
    Family_History_of_PCOS Insulin_Resistance  

## Arm

In [9]:
# Attempt to install mlxtend
!pip install mlxtend




In [18]:
# Import necessary libraries for ARM
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import networkx as nx

# Load PCOS Medication Reactions Dataset
file_path_medication = "PCOS_Medication_Reactions_Cleaned.csv"
df_medication = pd.read_csv(file_path_medication)

# Step 1: Convert data into a transaction format (one-hot encoding)
# Drop non-categorical columns and convert to a list of transactions
transactions = df_medication.dropna().astype(str).values.tolist()

# Encode transactions into a binary matrix
te = TransactionEncoder()
encoded_data = te.fit(transactions).transform(transactions)
df_encoded = pd.DataFrame(encoded_data, columns=te.columns_)

# Step 2: Apply Apriori Algorithm to find frequent itemsets
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)

# Step 3: Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Step 4: Extract top 15 rules for support, confidence, and lift
top_support = rules.sort_values(by="support", ascending=False).head(15)
top_confidence = rules.sort_values(by="confidence", ascending=False).head(15)
top_lift = rules.sort_values(by="lift", ascending=False).head(15)

# Create a directed graph
G = nx.DiGraph()

# Add edges with lift values
for index, row in top_lift.iterrows():
    G.add_edge(str(row["antecedents"]), str(row["consequents"]), weight=row["lift"])

# Generate node positions using spring layout
pos = nx.spring_layout(G, seed=42)

# Extract node and edge information for Plotly
edge_x, edge_y, edge_text = [], [], []
for edge in G.edges(data=True):
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])  # None creates breaks between edges
    edge_y.extend([y0, y1, None])
    edge_text.append(f"Lift: {edge[2]['weight']:.2f}")

# Create edge traces (lines connecting nodes)
edge_trace = go.Scatter(
    x=edge_x, y=edge_y, line=dict(width=1.5, color="gray"),
    hoverinfo="text", mode="lines", text=edge_text
)

# Create node traces (each node is an itemset)
node_x, node_y, node_text, node_size = [], [], [], []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(str(node))
    node_size.append(20 + 10 * sum([1 for e in G.edges(node)]))  # Scale based on connections

# Create node trace
node_trace = go.Scatter(
    x=node_x, y=node_y, mode="markers+text", hoverinfo="text",
    marker=dict(size=node_size, color="lightblue", line=dict(width=2)),
    text=node_text, textposition="top center"
)

# Create interactive Plotly figure
fig = go.Figure(data=[edge_trace, node_trace])
fig.update_layout(
    title="Association Rule Network (Top Lift)",
    showlegend=False, hovermode="closest",
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)

fig.show()

# Display top rules in terminal
print("Top 15 Association Rules by Support:")
print(top_support)

print("\nTop 15 Association Rules by Confidence:")
print(top_confidence)

print("\nTop 15 Association Rules by Lift:")
print(top_lift)



Top 15 Association Rules by Support:
  antecedents consequents  antecedent support  consequent support  support  \
0    (Female)      (60.5)            0.632653            0.346939  0.22449   
1      (60.5)    (Female)            0.346939            0.632653  0.22449   

   confidence     lift  representativity  leverage  conviction  zhangs_metric  \
0    0.354839  1.02277               1.0  0.004998    1.012245       0.060606   
1    0.647059  1.02277               1.0  0.004998    1.040816       0.034091   

    jaccard  certainty  kulczynski  
0  0.297297   0.012097    0.500949  
1  0.297297   0.039216    0.500949  

Top 15 Association Rules by Confidence:
  antecedents consequents  antecedent support  consequent support  support  \
1      (60.5)    (Female)            0.346939            0.632653  0.22449   
0    (Female)      (60.5)            0.632653            0.346939  0.22449   

   confidence     lift  representativity  leverage  conviction  zhangs_metric  \
1    0.647059  1

In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
import networkx as nx
import plotly.graph_objects as go

file_path_global = "Global_PCOS_Demographics_Cleaned.csv"
df_global = pd.read_csv(file_path_global)

transactions_global = df_global.dropna().astype(str).values.tolist()

print("=== Raw Transaction Data (Before Transformation) ===")
print("Each row represents a transaction (list of categorical values).")
for i, transaction in enumerate(transactions_global[:5]):
    print(f"Transaction {i+1}: {transaction}")

te = TransactionEncoder()
encoded_data_global = te.fit(transactions_global).transform(transactions_global)
df_encoded_global = pd.DataFrame(encoded_data_global, columns=te.columns_)

print("\n=== Transformed DataFrame (After One-Hot Encoding) ===")
print("Each column represents an item (feature), and rows indicate presence (True/False).")
print(df_encoded_global.head())





=== Raw Transaction Data (Before Transformation) ===
Each row represents a transaction (list of categorical values).
Transaction 1: ['Madagascar', '26', 'overweight', 'regular', 'yes', 'severe', 'yes', 'yes', '2', 'low', 'rural', 'high', 'yes', 'no', '0.107938173829133', 'hispanic', 'yes']
Transaction 2: ['Somalia', '41', 'normal', 'regular', 'no', 'moderate', 'no', 'no', '7', 'medium', 'urban', 'middle', 'yes', 'yes', '0.2029007363515103', 'other', 'no']
Transaction 3: ['Malawi', '27', 'normal', 'irregular', 'no', 'mild', 'no', 'no', '10', 'low', 'urban', 'high', 'yes', 'no', '0.0739259556724011', 'caucasian', 'yes']
Transaction 4: ['Rwanda', '39', 'obese', 'regular', 'yes', 'mild', 'no', 'yes', '10', 'medium', 'urban', 'low', 'no', 'yes', '0.2080617748723916', 'african', 'no']
Transaction 5: ['Tanzania', '38', 'normal', 'regular', 'yes', 'mild', 'no', 'no', '8', 'medium', 'rural', 'middle', 'no', 'no', '0.2059565159657989', 'caucasian', 'no']

=== Transformed DataFrame (After One-Hot

In [4]:

frequent_itemsets_global = apriori(df_encoded_global, min_support=0.1, use_colnames=True)

rules_global = association_rules(frequent_itemsets_global, metric="lift", min_threshold=1.0)

top_support_global = rules_global.sort_values(by="support", ascending=False).head(15)
top_confidence_global = rules_global.sort_values(by="confidence", ascending=False).head(15)
top_lift_global = rules_global.sort_values(by="lift", ascending=False).head(15)

G_global = nx.DiGraph()

for index, row in top_lift_global.iterrows():
    G_global.add_edge(str(row["antecedents"]), str(row["consequents"]), weight=row["lift"])

pos_global = nx.spring_layout(G_global, seed=42)

edge_x_global, edge_y_global, edge_text_global = [], [], []
for edge in G_global.edges(data=True):
    x0, y0 = pos_global[edge[0]]
    x1, y1 = pos_global[edge[1]]
    edge_x_global.extend([x0, x1, None])
    edge_y_global.extend([y0, y1, None])
    edge_text_global.append(f"Lift: {edge[2]['weight']:.2f}")

edge_trace_global = go.Scatter(
    x=edge_x_global, y=edge_y_global, line=dict(width=1.5, color="gray"),
    hoverinfo="text", mode="lines", text=edge_text_global
)

node_x_global, node_y_global, node_text_global, node_size_global = [], [], [], []
for node in G_global.nodes():
    x, y = pos_global[node]
    node_x_global.append(x)
    node_y_global.append(y)
    node_text_global.append(str(node))
    node_size_global.append(20 + 10 * sum([1 for e in G_global.edges(node)]))  # Scale based on connections

node_trace_global = go.Scatter(
    x=node_x_global, y=node_y_global, mode="markers+text", hoverinfo="text",
    marker=dict(size=node_size_global, color="lightblue", line=dict(width=2)),
    text=node_text_global, textposition="top center"
)

fig_global = go.Figure(data=[edge_trace_global, node_trace_global])
fig_global.update_layout(
    title="Association Rule Network (Top Lift) - Global PCOS Data",
    showlegend=False, hovermode="closest",
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)

fig_global.show()

In [2]:

print("\nTop 15 Association Rules by Support:")
display(top_support_global)

print("\nTop 15 Association Rules by Confidence:")
display(top_confidence_global)

print("\nTop 15 Association Rules by Lift:")
display(top_lift_global)



Top 15 Association Rules by Support:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
112,(urban),(no),0.699307,0.999399,0.698907,0.999427,1.000028,1.0,2e-05,1.048961,9.3e-05,0.699047,0.046676,0.849377
113,(no),(urban),0.999399,0.699307,0.698907,0.699327,1.000028,1.0,2e-05,1.000065,0.046702,0.699047,6.5e-05,0.849377
131,(urban),(yes),0.699307,0.932504,0.652775,0.933459,1.001024,1.0,0.000668,1.014347,0.003401,0.666752,0.014144,0.816741
130,(yes),(urban),0.932504,0.699307,0.652775,0.700023,1.001024,1.0,0.000668,1.002387,0.015152,0.666752,0.002381,0.816741
760,(yes),"(no, urban)",0.932504,0.698907,0.652374,0.699594,1.000983,1.0,0.000641,1.002287,0.014547,0.666343,0.002281,0.816507
758,"(yes, no)",(urban),0.931904,0.699307,0.652374,0.700045,1.001055,1.0,0.000687,1.002458,0.015469,0.666479,0.002452,0.816466
761,(urban),"(yes, no)",0.699307,0.931904,0.652374,0.932886,1.001055,1.0,0.000687,1.014642,0.003503,0.666479,0.014431,0.816466
759,"(no, urban)",(yes),0.698907,0.932504,0.652374,0.933421,1.000983,1.0,0.000641,1.013766,0.003261,0.666343,0.013579,0.816507
67,(no),(low),0.999399,0.509755,0.509472,0.509778,1.000044,1.0,2.3e-05,1.000046,0.073671,0.509633,4.6e-05,0.754611
66,(low),(no),0.509755,0.999399,0.509472,0.999443,1.000044,1.0,2.3e-05,1.079482,9e-05,0.509633,0.07363,0.754611



Top 15 Association Rules by Confidence:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1666,"(moderate, middle, regular)",(no),0.104064,0.999399,0.104064,1.0,1.000601,1.0,6.3e-05,inf,0.000671,0.104127,1.0,0.552063
1860,"(moderate, regular, urban)",(no),0.14851,0.999399,0.148494,0.999888,1.000489,1.0,7.3e-05,5.346374,0.000574,0.14858,0.812957,0.574235
2786,"(moderate, yes, regular, urban)",(no),0.138363,0.999399,0.138346,0.999879,1.000481,1.0,6.6e-05,4.981056,0.000557,0.138427,0.799239,0.569154
1220,"(moderate, low, regular)",(no),0.108437,0.999399,0.10842,0.999846,1.000447,1.0,4.8e-05,3.90373,0.000501,0.108484,0.743835,0.554166
1500,"(medium, moderate, urban)",(no),0.10802,0.999399,0.108003,0.999845,1.000447,1.0,4.8e-05,3.888709,0.0005,0.108066,0.742845,0.553957
266,"(high, overweight)",(no),0.107536,0.999399,0.107519,0.999845,1.000446,1.0,4.8e-05,3.871284,0.000499,0.107582,0.741688,0.553714
662,"(moderate, regular)",(no),0.211483,0.999399,0.21145,0.999842,1.000443,1.0,9.4e-05,3.806693,0.000562,0.21157,0.737305,0.605709
2478,"(moderate, medium, yes, urban)",(no),0.101177,0.999399,0.10116,0.999835,1.000436,1.0,4.4e-05,3.64236,0.000485,0.101219,0.725453,0.550528
2226,"(moderate, yes, low, regular)",(no),0.100926,0.999399,0.10091,0.999835,1.000436,1.0,4.4e-05,3.633347,0.000484,0.100969,0.724772,0.550402
0,(3),(no),0.100809,0.999399,0.100793,0.999834,1.000436,1.0,4.4e-05,3.629141,0.000484,0.100852,0.724453,0.550344



Top 15 Association Rules by Lift:


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
2492,"(medium, no, urban)","(moderate, yes)",0.348994,0.281465,0.10116,0.289861,1.029829,1.0,0.00293,1.011823,0.044493,0.19112,0.011685,0.324633
2493,"(moderate, yes)","(medium, no, urban)",0.281465,0.348994,0.10116,0.359405,1.029829,1.0,0.00293,1.016251,0.040312,0.19112,0.015991,0.324633
2500,"(medium, urban)","(moderate, yes, no)",0.349195,0.281399,0.10116,0.289695,1.029483,1.0,0.002897,1.01168,0.044005,0.191072,0.011545,0.324592
2485,"(moderate, yes, no)","(medium, urban)",0.281399,0.349195,0.10116,0.35949,1.029483,1.0,0.002897,1.016073,0.039853,0.191072,0.015819,0.324592
1536,"(medium, urban)","(yes, moderate)",0.349195,0.281465,0.101177,0.289743,1.029408,1.0,0.00289,1.011654,0.043897,0.191086,0.01152,0.324603
1533,"(yes, moderate)","(medium, urban)",0.281465,0.349195,0.101177,0.359464,1.029408,1.0,0.00289,1.016032,0.039759,0.191086,0.015779,0.324603
2503,(moderate),"(yes, medium, no, urban)",0.301627,0.326062,0.10116,0.335381,1.02858,1.0,0.002811,1.014021,0.039786,0.192126,0.013827,0.322814
2482,"(yes, medium, no, urban)",(moderate),0.326062,0.301627,0.10116,0.310248,1.02858,1.0,0.002811,1.012498,0.041229,0.192126,0.012344,0.322814
2489,"(yes, medium, urban)","(moderate, no)",0.326262,0.301561,0.10116,0.310057,1.028176,1.0,0.002772,1.012315,0.040674,0.192077,0.012165,0.322756
2496,"(moderate, no)","(yes, medium, urban)",0.301561,0.326262,0.10116,0.335455,1.028176,1.0,0.002772,1.013833,0.039236,0.192077,0.013644,0.322756
