In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
data_path = input("Enter dataset CSV file path: ").strip()
df = pd.read_csv(data_path)
print("✅ Dataset loaded with shape:", df.shape)

# Keep only numeric columns
df = df.select_dtypes(include=[np.number])
numeric_cols = df.columns.tolist()
print("Numeric columns available:", numeric_cols)

# Create folder for histogram images
os.makedirs("histograms", exist_ok=True)

✅ Dataset loaded with shape: (768, 9)
Numeric columns available: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


In [4]:
num_bins = 10
histograms = {}
for col in df.columns:
    counts, bins = np.histogram(df[col], bins=num_bins)
    histograms[col] = {"counts": counts, "bins": bins}

print("\n✅ Histograms computed for numeric columns.")


✅ Histograms computed for numeric columns.


In [5]:
cov_matrix = np.cov(df.values, rowvar=False)
precision_matrix = np.linalg.pinv(cov_matrix)  # Pseudo-inverse for stability

relation_matrix = pd.DataFrame(index=df.columns, columns=df.columns)
for i, col1 in enumerate(df.columns):
    for j, col2 in enumerate(df.columns):
        if i == j:
            relation_matrix.loc[col1, col2] = 0
        else:
            val = precision_matrix[i, j]
            if val > 0:
                relation_matrix.loc[col1, col2] = 1
            elif val < 0:
                relation_matrix.loc[col1, col2] = -1
            else:
                relation_matrix.loc[col1, col2] = 0

relation_matrix = relation_matrix.astype(int)
relation_matrix.to_csv("relation_matrix.csv", index=True)

print("\n✅ Relation matrix computed and saved as relation_matrix.csv")


✅ Relation matrix computed and saved as relation_matrix.csv


In [8]:
def evaluate_relation_accuracy(threshold=0.5):
    # Correlation baseline
    corr_matrix = df.corr().fillna(0)
    corr_sign = corr_matrix.map(lambda x: 1 if x > threshold else (-1 if x < -threshold else 0))

    # Compare with relation matrix
    matches = (relation_matrix == corr_sign).sum().sum()
    total = relation_matrix.size
    accuracy = (matches / total) * 100
    return accuracy

acc = evaluate_relation_accuracy()
print(f"\n📊 Relation detection accuracy compared to correlation baseline: {acc:.2f}%")



📊 Relation detection accuracy compared to correlation baseline: 0.00%


In [10]:
def compare_columns(col1, col2):
    mean1, mean2 = df[col1].mean(), df[col2].mean()
    if mean1 > mean2:
        return f"{col1} has higher average ({mean1:.2f}) than {col2} ({mean2:.2f})"
    elif mean1 < mean2:
        return f"{col2} has higher average ({mean2:.2f}) than {col1} ({mean1:.2f})"
    else:
        return f"{col1} and {col2} have equal averages ({mean1:.2f})"

def check_relation(col1, col2):
    val = relation_matrix.loc[col1, col2]
    if val == 1:
        return f"{col1} and {col2} have a positive conditional relation"
    elif val == -1:
        return f"{col1} and {col2} have a negative conditional relation"
    else:
        return f"No significant conditional relation between {col1} and {col2}"

def plot_histogram(col, bins=num_bins):
    if col not in df.columns:
        print("⚠️ Invalid column name.")
        return
    data = df[col]
    counts, bin_edges = np.histogram(data, bins=bins)

    # Print bin ranges and counts
    print(f"\nHistogram of {col}:")
    for i, count in enumerate(counts):
        print(f"Bin {i+1}: {bin_edges[i]:.3f} - {bin_edges[i+1]:.3f} : {count}")

    # Plot and save image
    plt.figure(figsize=(6,4))
    plt.hist(data, bins=bins, color='skyblue', edgecolor='black')
    plt.title(f"Histogram of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.grid(axis='y', alpha=0.75)
    plt.tight_layout()
    filename = f"histograms/hist_{col}.png"
    plt.savefig(filename)
    plt.close()
    print(f"✅ Histogram image saved as {filename}")

def show_top_bottom(col, n=5, top=True):
    n = int(n)
    if top:
        return df[col].nlargest(n).to_list()
    else:
        return df[col].nsmallest(n).to_list()

def show_stats(col, stats_list=None):
    desc = df[col].describe()
    if stats_list:
        stats_list = [s.lower() for s in stats_list]
        filtered = {k:v for k,v in desc.items() if k.lower() in stats_list}
        return filtered
    else:
        return desc.to_dict()

In [11]:
print("\n🤖 Enhanced Histogram & Relation Chatbot")
print("Available numeric columns:", numeric_cols)
print("You can ask: 'compare X and Y', 'relation between X and Y', 'histogram of Z', 'stats of Z', 'top N of Z', 'bottom N of Z', 'columns', 'help'")
print("Type 'exit' to quit.\n")

while True:
    user_input = input("Ask your question: ").strip().lower()

    if user_input in ['exit', 'quit', 'stop']:
        print("👋 Exiting chatbot.")
        break

    try:
        # List columns
        if user_input in ["columns", "column names"]:
            print("Numeric columns:", numeric_cols)

        # Help
        elif "help" in user_input:
            print("Commands:\n- compare X and Y\n- relation between X and Y\n- histogram of Z\n- stats of Z\n- top N of Z\n- bottom N of Z\n- columns\n- exit/quit/stop")

        # Compare two columns
        elif "compare" in user_input:
            cols = [col for col in df.columns if col.lower() in user_input]
            if len(cols) >= 2:
                print(compare_columns(cols[0], cols[1]))
            else:
                print("⚠️ Please specify two numeric columns to compare.")

        # Conditional relation
        elif "relation" in user_input:
            cols = [col for col in df.columns if col.lower() in user_input]
            if len(cols) >= 2:
                print(check_relation(cols[0], cols[1]))
            else:
                print("⚠️ Please specify two numeric columns to check relation.")

        # Histogram
        elif "histogram" in user_input or "bin" in user_input:
            cols = [col for col in df.columns if col.lower() in user_input]
            if cols:
                plot_histogram(cols[0])
            else:
                print("⚠️ Please specify a numeric column for histogram.")

        # Top N values
        elif user_input.startswith("top"):
            n = 5  # default
            for col in numeric_cols:
                if col.lower() in user_input:
                    if any(word.isdigit() for word in user_input.split()):
                        n = int([word for word in user_input.split() if word.isdigit()][0])
                    print(f"Top {n} values of {col}: {show_top_bottom(col, n=n, top=True)}")
                    break
            else:
                print("⚠️ Please specify a valid numeric column for top N values.")

        # Bottom N values
        elif user_input.startswith("bottom"):
            n = 5  # default
            for col in numeric_cols:
                if col.lower() in user_input:
                    if any(word.isdigit() for word in user_input.split()):
                        n = int([word for word in user_input.split() if word.isdigit()][0])
                    print(f"Bottom {n} values of {col}: {show_top_bottom(col, n=n, top=False)}")
                    break
            else:
                print("⚠️ Please specify a valid numeric column for bottom N values.")

        # Stats command
        elif user_input.startswith("stats"):
            stats_list = None
            for col in numeric_cols:
                if col.lower() in user_input:
                    stats_part = user_input[user_input.find("stats")+5:].replace("of","").replace(col.lower(),"").replace(","," ").split()
                    stats_list = stats_part if stats_part else None
                    stats_dict = show_stats(col, stats_list)
                    for k, v in stats_dict.items():
                        print(f"{k}: {v:.2f}")
                    break
            else:
                print("⚠️ Please specify a valid numeric column for stats.")


        # Unknown
        else:
            print("🤷 Sorry, I can answer questions about histograms, comparisons, stats, top/bottom values, and conditional relations.")

    except Exception as e:
        print("⚠️ Error:", e)


🤖 Enhanced Histogram & Relation Chatbot
Available numeric columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
You can ask: 'compare X and Y', 'relation between X and Y', 'histogram of Z', 'stats of Z', 'top N of Z', 'bottom N of Z', 'columns', 'help'
Type 'exit' to quit.

Top 5 values of Age: [81, 72, 70, 69, 69]
Top 2 values of BMI: [67.1, 59.4]
👋 Exiting chatbot.
