In [1]:
!pip install streamlit ngrok pandas matplotlib seaborn langchain-google-genai langchain-experimental

Collecting streamlit
  Downloading streamlit-1.37.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting ngrok
  Downloading ngrok-1.4.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-1.0.10-py3-none-any.whl.metadata (3.8 kB)
Collecting langchain-experimental
  Downloading langchain_experimental-0.0.64-py3-none-any.whl.metadata (1.7 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.2-py3-none-manylinux2014_x86_64.whl.metadata (38 kB)
Collecting langchain-core<0.3,>=0.2.33 (from langchain-google-genai)
  Downloading langchain_

In [2]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [3]:
!pip install streamlit



In [19]:
%%writefile main.py
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import io
import json
import re

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_experimental.agents import create_csv_agent

import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyBnCieYLFNf3w_IEHQeGHyssyMQAOqDSi8"  # create api key in GCP Console

# Set page config at the very beginning
st.set_page_config(layout="wide", page_title="Data Insight Explorer")

def load_and_preprocess_data(file):
    df = pd.read_csv(file)

    # Selecting numerical columns
    numerical_columns = df.select_dtypes(include=['number']).columns

    # Dropping rows with NaN values in numerical columns
    df = df.dropna(subset=numerical_columns)

    return df

# Function to display data summary
def display_data_summary(df):
    st.subheader("First 20 Rows")
    st.write(df.head(20))

    st.subheader("Data Summary")
    st.write(df.describe())

    st.subheader("Data Information")
    buffer = io.StringIO()
    df.info(buf=buffer)
    s = buffer.getvalue()
    st.text(s)

    st.subheader("Missing Values")
    st.write(df.isnull().sum())

    # Adding statistics
    st.subheader("Mean, Median, Mode")
    mean = df.mean(numeric_only=True)
    median = df.median(numeric_only=True)
    mode = df.mode().iloc[0]
    st.write("Mean:\n", mean)
    st.write("Median:\n", median)
    st.write("Mode:\n", mode)

    st.subheader("Correlation")
    numerical_df = df.select_dtypes(include=['number'])
    if not numerical_df.empty:
        corr = numerical_df.corr()
        st.write(corr)
    else:
        st.write("No numerical columns available for correlation calculation.")

def statistical_analysis(df, agent):
    st.subheader("Numeric Column Statistics")
    numeric_cols = df.select_dtypes(include=['number']).columns
    stats = df[numeric_cols].agg(['mean', 'median', 'std'])
    st.write(stats)

    st.subheader("Mode (Most Frequent Value)")
    mode = df.mode().iloc[0]
    st.write(mode)

    st.subheader("Correlation Heatmap")
    corr = df[numeric_cols].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm')  # Coolwarm cmap applied here
    st.pyplot(plt)
    plt.close()

    st.subheader("Categorical Column Analysis")
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        st.write(f"Unique values in {col}:", df[col].nunique())
        st.write(df[col].value_counts())

def clean_json_string(json_string):
    json_match = re.search(r'\{[\s\S]*\}', json_string)
    if json_match:
        json_str = json_match.group(0)
        try:
            json_data = json.loads(json_str)
            return json.dumps(json_data)
        except json.JSONDecodeError:
            return None
    return None

def generate_plots(agent, df, num_plots=5):
    prompt = f"""Analyze the given dataset and suggest {num_plots} most informative and relevant plots. For each plot, provide:
    1. A title for the plot
    2. Python code to generate the plot using matplotlib and seaborn
    3. A brief explanation of what the plot shows and why it's informative

    Return your response as a JSON string with the following structure:
    {{
        "plots": [
            {{
                "title": "Plot title",
                "code": "Python code to generate the plot",
                "explanation": "Brief explanation of the plot"
            }},
            ...
        ]
    }}
    IMPORTANT: Your response should only contain the JSON string, nothing else."""

    response = agent.run(prompt)

    cleaned_json = clean_json_string(response)

    if cleaned_json:
        try:
            plot_data = json.loads(cleaned_json)
            return plot_data
        except json.JSONDecodeError as e:
            st.error(f"Failed to parse the cleaned JSON. Error: {str(e)}")
    else:
        st.error("Failed to extract valid JSON from the agent's response.")

    return None

def display_insights(insights, df):
    for i, plot in enumerate(insights['plots'], 1):
        st.subheader(f"Plot {i}: {plot['title']}")
        fig_col1, fig_col2 = st.columns([3, 1])
        with fig_col1:
            st.code(plot['code'], language='python')
            try:
                exec(plot['code'], globals(), {'df': df, 'plt': plt, 'sns': sns})
                st.pyplot(plt.gcf())
                plt.close()
            except Exception as e:
                st.error(f"An error occurred while generating plot {i}: {str(e)}")
        with fig_col2:
            st.write("*Explanation:*", plot['explanation'])

def create_custom_plot(df):
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
    x_col = st.selectbox("Select X-axis column", numeric_cols)
    y_col = st.selectbox("Select Y-axis column", numeric_cols)

    plot_types = ["Scatter", "Line", "Bar", "Box", "Violin", "Histogram"]
    plot_type = st.selectbox("Select plot type", plot_types)

    if st.button("Create Custom Plot"):
        fig, ax = plt.subplots(figsize=(10, 6))

        if plot_type == "Scatter":
            sns.scatterplot(data=df, x=x_col, y=y_col, ax=ax)
        elif plot_type == "Line":
            sns.lineplot(data=df, x=x_col, y=y_col, ax=ax)
        elif plot_type == "Bar":
            sns.barplot(data=df, x=x_col, y=y_col, ax=ax)
        elif plot_type == "Box":
            sns.boxplot(data=df, x=x_col, y=y_col, ax=ax)
        elif plot_type == "Histogram":
            sns.histplot(data=df, x=x_col, ax=ax)

        plt.title(f"{plot_type} Plot: {y_col} vs {x_col}")
        plt.xlabel(x_col)
        plt.ylabel(y_col)

        st.session_state.custom_plot = {
            'type': plot_type,
            'x': x_col,
            'y': y_col,
            'fig': fig
        }

        plt.close()

def main():
    # Center the title using Markdown and HTML
    st.markdown(
        "<h1 style='text-align: center;'>Data Analysis of CSV Files</h1>",
        unsafe_allow_html=True
    )

    # Vertical tabs using selectbox
    option = st.selectbox(
        "Choose a section",
        options=["Upload Data", "Auto-generate Insights by the LLM", "Data Analysis", "Data Q&A"]
    )

    if option == "Upload Data":
        st.header("Upload Data Here")
        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")

        if uploaded_file is not None:
            df = load_and_preprocess_data(uploaded_file)
            st.session_state.df = df

            # Display data summary immediately after upload
            display_data_summary(df)

            model_name = "models/gemini-1.5-pro"
            chat_model = ChatGoogleGenerativeAI(model=model_name, temperature=0)

            csv_file = io.StringIO(df.to_csv(index=False))
            agent = create_csv_agent(chat_model, csv_file, verbose=True, allow_dangerous_code=True)
            st.session_state.agent = agent

    if "df" in st.session_state and "agent" in st.session_state:
        df = st.session_state.df
        agent = st.session_state.agent

        if option == "Auto-generate Insights by the LLM":
            st.header("Auto-generated Analysis")
            num_plots = st.slider("Number of plots to generate", min_value=1, max_value=10, value=5)
            if st.button("Generate Insights"):
                with st.spinner("Generating insights..."):
                    insights = generate_plots(agent, df, num_plots)
                    st.session_state.insights = insights

            if 'insights' in st.session_state:
                display_insights(st.session_state.insights, df)

        elif option == "Data Analysis":
            st.header("Data Analysis")
            create_custom_plot(df)

            if 'custom_plot' in st.session_state:
                fig_col1, fig_col2 = st.columns([3, 1])
                with fig_col1:
                    st.pyplot(st.session_state.custom_plot['fig'])
                with fig_col2:
                    custom_plot_explanation = agent.run(
                        f"Analyze the {st.session_state.custom_plot['type']} plot of {st.session_state.custom_plot['y']} vs {st.session_state.custom_plot['x']} and provide insights."
                    )
                    st.write(custom_plot_explanation)

        elif option == "Data Q&A":
            st.header("Data Q&A")
            question = st.text_input("Ask a question about the data:")
            if question:
                with st.spinner("Thinking..."):
                    response = agent.run(question)
                    st.write(response)

if __name__ == "__main__":
    main()


Overwriting main.py


In [20]:
! wget -q -O - ipv4.icanhazip.com

#copy the number 34.91.53.221 and paste it in Tunnel Password

34.150.175.94


In [21]:
! streamlit run main.py & npx localtunnel --port 8501
#Go to the need to install lacaltunnel@2.0.2 click yes
#click (y)
#click: your url is: https://lovely-months-stare.loca.lt



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.150.175.94:8501[0m
[0m
your url is: https://ripe-birds-tell.loca.lt
2024-08-26 09:30:40.370 Serialization of dataframe to Arrow table was unsuccessful due to: ("Could not convert 'Abbing, Mr. Anthony' with type str: tried to convert to double", 'Conversion failed for column 0 with type object'). Applying automatic fixes for column types to make the dataframe Arrow-compatible.
  warn_deprecated(


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I should use the plot function from the pandas dataframe to create the line plot of Fare vs Survived.
Action: python_repl_ast
Action Input: `df.groupby('Survived')['Fare'].plot(kind='line');`[0m[36;1m[1;3mSurvived