In [5]:
import sys
# Cell 1: Install required libraries
!pip install --upgrade pip setuptools wheel >/dev/null
!pip uninstall numpy pandas -y >/dev/null
!pip install openai pandas numpy matplotlib seaborn scikit-learn gradio==3.45.1 python-magic >/dev/null
# Restart kernel after installation to ensure environment is clean
if 'ipykernel' in sys.modules:
    from IPython import get_ipython
    get_ipython().kernel.do_shutdown(restart=True)

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
yfinance 0.2.66 requires websockets>=13.0, but you have websockets 11.0.3 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26

In [1]:
# Cell 2: imports and helpers
import os, io, json, textwrap, tempfile
from getpass import getpass
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
import openai
import gradio as gr

# plotting convenience
def save_plot(fig, fname):
    fig.tight_layout()
    fig.savefig(fname, dpi=150)
    plt.close(fig)


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from sklearn.preprocessing import LabelEncoder
import io, os, textwrap
from pathlib import Path

def summarize_dataset(df):
    summary = {}
    summary["Rows"] = df.shape[0]
    summary["Columns"] = df.shape[1]
    summary["Column Names"] = list(df.columns)
    summary["Data Types"] = df.dtypes.astype(str).to_dict()
    summary["Missing Values"] = df.isnull().sum().to_dict()
    summary["Numeric Summary"] = df.describe().to_dict()
    return summary

def auto_insights(df):
    insights = []
    insights.append(f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns.")

    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    if numeric_cols:
        insights.append(f"Numeric columns: {', '.join(numeric_cols)}")
        corr = df[numeric_cols].corr()
        high_corr = corr.unstack().sort_values(ascending=False)
        high_corr = [(a, b, round(v,2)) for (a,b,v) in zip(corr.columns, corr.index, corr.values.diagonal())]
        insights.append("Top correlated pairs (approx): " + ", ".join([f"{a}-{b}" for a,b,_ in high_corr[:3]]))
    else:
        insights.append("No numeric columns detected.")

    cat_cols = df.select_dtypes(exclude=np.number).columns.tolist()
    if cat_cols:
        insights.append(f"Categorical columns: {', '.join(cat_cols)}")
        for c in cat_cols[:2]:
            top = df[c].value_counts().nlargest(3)
            insights.append(f"Top categories in '{c}': {', '.join(top.index)}")
    else:
        insights.append("No categorical columns detected.")

    return "\n".join(insights)


In [3]:
def generate_plots(df, out_dir="plots", max_plots=6):
    Path(out_dir).mkdir(exist_ok=True)
    images = []

    # numeric histograms
    num_cols = df.select_dtypes(include=np.number).columns
    for i, col in enumerate(num_cols[:max_plots]):
        fig, ax = plt.subplots(figsize=(5, 3))
        sns.histplot(df[col].dropna(), kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        path = f"{out_dir}/hist_{i}_{col}.png"
        plt.tight_layout()
        plt.savefig(path)
        plt.close(fig)
        images.append(path)

    # categorical bar charts
    cat_cols = df.select_dtypes(exclude=np.number).columns
    for i, col in enumerate(cat_cols[:3]):
        fig, ax = plt.subplots(figsize=(5, 3))
        top = df[col].fillna("Missing").value_counts().nlargest(10)
        sns.barplot(x=top.values, y=top.index, ax=ax)
        ax.set_title(f"Top values in {col}")
        path = f"{out_dir}/bar_{i}_{col}.png"
        plt.tight_layout()
        plt.savefig(path)
        plt.close(fig)
        images.append(path)

    # correlation heatmap
    if len(num_cols) >= 2:
        fig, ax = plt.subplots(figsize=(5, 4))
        sns.heatmap(df[num_cols].corr(), cmap="coolwarm", annot=False)
        ax.set_title("Correlation Heatmap")
        path = f"{out_dir}/corr_heatmap.png"
        plt.tight_layout()
        plt.savefig(path)
        plt.close(fig)
        images.append(path)

    return images


In [4]:
def analyze_file(file):
    if file is None:
        return "Please upload a file.", None, None
    try:
        df = pd.read_csv(file.name)
    except:
        df = pd.read_excel(file.name)

    summary = summarize_dataset(df)
    insights = auto_insights(df)
    plots = generate_plots(df)

    summary_text = "\n".join([f"**{k}:** {v}" for k,v in summary.items() if k != "Numeric Summary"])
    return summary_text, insights, plots

with gr.Blocks() as demo:
    gr.Markdown("# ðŸ“Š Smart Data Explorer (No API Key Needed)")
    gr.Markdown("Upload any CSV or Excel file to automatically explore your data.")

    file_input = gr.File(label="Upload your dataset (CSV or Excel)")
    analyze_button = gr.Button("Analyze Dataset")

    summary_output = gr.Markdown(label="Dataset Summary")
    insights_output = gr.Textbox(label="Auto Insights", lines=8)
    gallery = gr.Gallery(label="Auto-generated Plots").style(grid=[2], height="400px")

    analyze_button.click(analyze_file, inputs=file_input, outputs=[summary_output, insights_output, gallery])

demo.launch()


  gallery = gr.Gallery(label="Auto-generated Plots").style(grid=[2], height="400px")
  gallery = gr.Gallery(label="Auto-generated Plots").style(grid=[2], height="400px")


IMPORTANT: You are using gradio version 3.45.1, however version 4.44.1 is available, please upgrade.
--------
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

