In [1]:
from crewai import Agent, Task, Crew
from langchain_google_genai import ChatGoogleGenerativeAI
from crewai_tools import SerperDevTool
import os
import litellm


  warn(
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

In [3]:
# Configure LiteLLM's module-level flags before any model call is made.
# NOTE(review): drop_params presumably makes LiteLLM discard request parameters
# the target provider does not accept -- confirm against the LiteLLM docs.
litellm.drop_params = True
# NOTE(review): verbose logging may echo request details into the saved notebook
# output; review the outputs for secrets before sharing.
litellm.set_verbose = True


In [4]:
from dotenv import load_dotenv
# Called as the last expression of the cell, so its return value is echoed as
# the cell output.
load_dotenv()

True

In [5]:
import os 

In [6]:
# Gemini key as found in the process environment; None when the variable is unset.
API_KEY = os.environ.get("Gemini_API_KEY")

In [7]:
# Fail fast with an actionable message: assigning None to os.environ raises an
# opaque "str expected, not NoneType" TypeError, and an empty key would only
# surface later as an authentication failure inside the model call.
if not API_KEY:
    raise RuntimeError(
        "Gemini API key not found: define 'Gemini_API_KEY' in the environment "
        "or in the .env file before running this cell."
    )

# Re-export the key under the upper-case name and pin the LiteLLM provider.
os.environ["GEMINI_API_KEY"] = API_KEY
os.environ["LITELLM_PROVIDER"] = "google"  # Force LiteLLM to use Google

In [8]:
# Build the Gemini chat model object that is later passed to the agent as `llm`.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.7,
    # Output-length cap is currently disabled; uncomment to set it explicitly.
    #max_output_tokens=2048,
    google_api_key= os.getenv("Gemini_API_KEY")  # Key is re-read from the environment and passed explicitly
)


In [9]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
from crewai import Agent, Task, Crew, Process
from langchain.tools import BaseTool
from langchain.tools import tool
from pydantic import BaseModel, Field

@tool
def csv_import_tool(filename: str) -> str:
    """Check that a CSV file can be read and report its column and row counts.

    Args:
        filename: Path to the CSV file. Relative paths are resolved against the
            current working directory.

    Returns:
        "Analysis complete. Columns: <n>, Rows: <m>" on success; otherwise a
        human-readable diagnostic (missing file, not a file, empty file,
        undecodable bytes, malformed CSV, or an unexpected error). Errors are
        reported in the returned text rather than raised.
    """
    try:
        # Get absolute path for clearer diagnostics
        abs_path = os.path.abspath(filename)
        print(f"Attempting to read: {abs_path}")  # Debug output

        # File existence check
        if not os.path.exists(abs_path):
            return (
                "File Not Found Error:\n"
                f"- Requested path: {abs_path}\n"
                "- Solutions:\n"
                "  1. Verify file exists at this location\n"
                "  2. Check for typos in filename\n"
                "  3. Use absolute path instead of relative"
            )

        # A directory passes the existence check but can never be parsed as CSV
        if not os.path.isfile(abs_path):
            return (
                "File Not Found Error:\n"
                f"- Requested path is not a regular file: {abs_path}"
            )

        # File size check
        if os.path.getsize(abs_path) == 0:
            return "Empty File Error: CSV file contains no data"

        # Try multiple encodings; the first one that decodes cleanly wins
        encodings = ['utf-8', 'latin-1', 'iso-8859-1']
        for encoding in encodings:
            try:
                df = pd.read_csv(abs_path, encoding=encoding)
                break
            except UnicodeDecodeError:
                continue
        else:
            return (
                "Encoding Error:\n"
                "Failed to read with common encodings (utf-8, latin-1, iso-8859-1)\n"
                "Try specifying the correct encoding in a text editor"
            )

        # Validate DataFrame (a frame with no columns is also reported as empty)
        if df.empty:
            return "Data Error: CSV file contains no rows of data"

        return f"Analysis complete. Columns: {len(df.columns)}, Rows: {len(df)}"

    except pd.errors.EmptyDataError:
        # Non-zero-size files holding only blank lines/whitespace end up here;
        # report them as empty instead of as an "unexpected" failure.
        return "Empty File Error: CSV file contains no data"

    except pd.errors.ParserError as e:
        return (
            "CSV Parsing Error:\n"
            f"- Details: {str(e)}\n"
            "Common fixes:\n"
            "  1. Ensure consistent column count in all rows\n"
            "  2. Check for missing quotes\n"
            "  3. Verify delimiter consistency"
        )

    except Exception as e:
        return (
            "Unexpected Error:\n"
            f"- Type: {type(e).__name__}\n"
            f"- Message: {str(e)}\n"
            "Please check:\n"
            "  1. File is not open in other programs\n"
            "  2. File permissions\n"
            "  3. File integrity"
        )
    
    
  

#---------------#
#  Data Loader  #
#---------------#
@tool
def data_loader(filename: str) -> pd.DataFrame | str:
    """Robust data loading tool with comprehensive validation"""
    # The docstring above is kept verbatim because the @tool decorator may
    # surface it as the tool description.
    # Returns the parsed DataFrame on success, or an error string otherwise.
    candidate_encodings = ('utf-8', 'latin-1', 'iso-8859-1')
    try:
        resolved = os.path.abspath(filename)

        # Guard clauses: missing path, then zero-byte file
        if not os.path.exists(resolved):
            return f"File not found: {resolved}"
        if os.path.getsize(resolved) == 0:
            return "Empty file error"

        for codec in candidate_encodings:
            try:
                frame = pd.read_csv(resolved, encoding=codec)
            except UnicodeDecodeError:
                # Wrong guess: move on to the next codec
                continue
            return frame

        return "Encoding error: Failed to detect proper encoding"

    except Exception as exc:
        return f"Loading failed: {str(exc)}"

# Input schema with a single required `filename` string field.
# NOTE(review): nothing visible in this notebook passes this model to a tool as
# its argument schema -- confirm whether it is still needed.
class AnalysisInput(BaseModel):
    filename: str = Field(..., description="Path to CSV file")

# Use tool decorator with proper syntax
@tool
def statistical_analyzer(filename: str) -> str:
    """Comprehensive statistical analysis tool with advanced metrics

    Args:
        filename: Path to the CSV file to analyse.

    Returns:
        A plain-text report (overview, dtypes, descriptive statistics, Spearman
        correlations, temporal and distribution analysis), or an error message
        starting with "Data loading error:" / "Statistical analysis failed:".
    """
    try:
        # Load data using data_loader tool
        # NOTE(review): data_loader is itself @tool-decorated; calling it like a
        # plain function relies on the tool object being callable -- confirm.
        data = data_loader(filename)
        if isinstance(data, str):
            return f"Data loading error: {data}"

        report = [
            f"Statistical Analysis Report - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "="*50
        ]

        # 1. Dataset Overview
        report.extend([
            "\n### Dataset Overview",
            f"Rows: {len(data):,}",
            f"Columns: {len(data.columns)}",
            f"Memory Usage: {data.memory_usage(deep=True).sum()/1e6:.2f} MB",
            f"Duplicate Rows: {data.duplicated().sum()}"
        ])

        # 2. Data Structure Analysis
        type_counts = data.dtypes.astype(str).value_counts()
        report.append("\n### Data Structure Analysis")
        report.append(f"Data Types:\n{type_counts.to_string()}")

        # 3. Comprehensive Descriptive Stats
        desc_stats = data.describe(percentiles=[.01, .05, .25, .5, .75, .95, .99])
        report.append("\n### Descriptive Statistics")
        report.append(desc_stats.round(2).to_string())

        # 4. Advanced Correlation Analysis
        # Restricted to numeric columns: correlating a frame that still holds
        # text columns raises on current pandas and would abort the whole report.
        # Rendered as plain text so the report stays a readable text document.
        numeric_data = data.select_dtypes(include='number')
        report.append("\n### Correlation Matrix (Spearman)")
        if numeric_data.shape[1] >= 2:
            corr_matrix = numeric_data.corr(method='spearman')
            report.append(corr_matrix.round(2).to_string())
        else:
            report.append("Skipped: fewer than two numeric columns")

        # 5. Temporal Analysis (if datetime columns exist)
        datetime_cols = data.select_dtypes(include=['datetime']).columns
        for col in datetime_cols:
            time_stats = pd.DataFrame({
                'Oldest': [data[col].min()],
                'Newest': [data[col].max()],
                'Time Span': [data[col].max() - data[col].min()]
            })
            report.append(f"\n### Temporal Analysis - {col}")
            report.append(time_stats.to_string(index=False))

        # 6. Advanced Distribution Analysis
        dist_analysis = pd.DataFrame({
            'Skewness': data.skew(numeric_only=True, skipna=True).round(2),
            'Kurtosis': data.kurtosis(numeric_only=True).round(2),
            # Coefficient of variation: standard deviation as a percentage of the mean
            'CV (%)': (data.std(numeric_only=True)/data.mean(numeric_only=True)*100).round(1)
        })
        report.append("\n### Distribution Analysis")
        report.append(dist_analysis.to_string())

        return "\n".join(str(item) for item in report)

    except Exception as e:
        return f"Statistical analysis failed: {str(e)}"



# Enhanced Anomaly Detector
@tool
def anomaly_detector(filename: str) -> str:
    """Advanced anomaly detection with multi-method analysis

    Args:
        filename: Path to the CSV file to inspect.

    Returns:
        A plain-text report (data quality, per-column outlier counts, temporal
        and categorical patterns, Isolation Forest count), or an error message
        starting with "Data loading error:" / "Anomaly detection failed:".
    """
    try:
        # NOTE(review): data_loader is itself @tool-decorated; calling it like a
        # plain function relies on the tool object being callable -- confirm.
        data = data_loader(filename)
        if isinstance(data, str):
            return f"Data loading error: {data}"

        report = [
            f"Anomaly Detection Report - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "="*50
        ]

        numeric_data = data.select_dtypes(include=np.number)

        # 1. Data Quality Assessment
        quality_checks = pd.DataFrame({
            'Missing Values': data.isna().sum(),
            'Zero Values': (data == 0).sum(),
            'Infinite Values': np.isinf(numeric_data).sum()
        })
        report.append("\n### Data Quality Assessment")
        report.append(quality_checks.to_string())

        # 2. Advanced Outlier Detection
        outlier_results = []
        for col in numeric_data.columns:
            series = data[col]
            q1, q3 = series.quantile([0.25, 0.75])
            iqr = q3 - q1
            z_scores = (series - series.mean()) / series.std()
            abs_dev = (series - series.median()).abs()
            mad = abs_dev.median()
            # A MAD of 0 (more than half the values equal the median) would flag
            # every value that differs from the median, so the test is skipped.
            mad_outliers = (abs_dev > 3*mad).sum() if mad > 0 else 0

            outlier_results.append({
                'Column': col,
                'IQR Outliers': ((series < (q1 - 1.5*iqr)) | (series > (q3 + 1.5*iqr))).sum(),
                'Z-Score (>3σ)': (z_scores.abs() > 3).sum(),
                'MAD Outliers': mad_outliers,
                'Top 1% Value': series.quantile(0.99),
                'Bottom 1% Value': series.quantile(0.01)
            })

        report.append("\n### Multi-Method Outlier Analysis")
        report.append(pd.DataFrame(outlier_results).to_string(index=False))

        # 3. Temporal Anomalies (if datetime present)
        datetime_cols = data.select_dtypes(include=['datetime']).columns
        for col in datetime_cols:
            time_gaps = data[col].diff().value_counts().head(5)
            report.append(f"\n### Temporal Patterns - {col}")
            report.append(f"Most common time intervals:\n{time_gaps.to_string()}")

        # 4. Categorical Anomalies
        # 'object' is included because text columns read from CSV are not given
        # the 'category' dtype automatically.
        cat_cols = data.select_dtypes(include=['object', 'category']).columns
        for col in cat_cols:
            shares = data[col].value_counts(normalize=True)
            rare_categories = shares[shares < 0.01]
            report.append(f"\n### Rare Categories - {col}")
            report.append(f"Rare categories (<1%): {len(rare_categories)}")

        # 5. Multivariate Anomalies
        report.append("\n### Multivariate Anomaly Detection")
        report.append("Isolation Forest Results:")
        # The estimator rejects NaN/inf input, so only complete finite rows are
        # scored; one bad cell would otherwise discard the whole report.
        complete_rows = numeric_data.replace([np.inf, -np.inf], np.nan).dropna()
        if complete_rows.empty:
            report.append("Skipped: no complete numeric rows available")
        else:
            from sklearn.ensemble import IsolationForest
            # Fixed seed so repeated runs on the same file report the same count
            clf = IsolationForest(contamination=0.01, random_state=42)
            preds = clf.fit_predict(complete_rows)
            report.append(f"Identified anomalies: {int((preds == -1).sum())}")

        return "\n".join(str(item) for item in report)

    except Exception as e:
        return f"Anomaly detection failed: {str(e)}"

In [10]:
# ------------------------
# Universal Data Tools
# ------------------------

# Input schema with a single required `filename` string field.
# NOTE(review): none of the tool classes in this cell reference this schema --
# confirm whether it is meant to be wired in as their argument schema.
class FileInputSchema(BaseModel):
    filename: str = Field(..., description="Path to CSV file")

class DataLoaderTool(BaseTool):
    """Tool that loads a CSV file into a DataFrame or explains why it cannot."""

    # Explicit annotations: pydantic-based tool base classes can reject field
    # overrides that carry no type annotation.
    name: str = "data_loader"
    description: str = "Load and validate any CSV file with dynamic typing"

    def _run(self, filename: str) -> pd.DataFrame | str:
        """Load and validate any CSV file with dynamic typing

        Args:
            filename: Path to the CSV file.

        Returns:
            The parsed DataFrame on success, otherwise an error message string.
            Callers distinguish the two with ``isinstance(result, str)``.
        """
        try:
            # Input validation
            if not os.path.exists(filename):
                return f"File not found: {filename}"
            # Case-insensitive so that e.g. 'DATA.CSV' is accepted as well
            if not filename.lower().endswith('.csv'):
                return "Only CSV files are supported"

            # Load data with automatic type detection
            df = pd.read_csv(filename)

            # Basic validation
            if df.empty:
                return "Empty CSV file"

            return df

        except pd.errors.EmptyDataError:
            # Zero-byte or whitespace-only files: same message as an empty frame
            return "Empty CSV file"

        except Exception as e:
            return f"Data loading failed: {str(e)}"

class StatisticalAnalyzerTool(BaseTool):
    """Tool that builds a markdown statistical report for a CSV file."""

    # Explicit annotations: pydantic-based tool base classes can reject field
    # overrides that carry no type annotation.
    name: str = "statistical_analyzer"
    description: str = "Analyze any CSV file's numerical features including correlation analysis"

    def _run(self, filename: str) -> str:
        """Analyze any CSV file's numerical features including correlation analysis

        Args:
            filename: Path to the CSV file.

        Returns:
            A markdown report covering numeric features, categorical features
            and correlations; the loader's error message if the file cannot be
            loaded; or "Analysis failed: ..." on any other error.
        """
        try:
            # Load data through a fresh loader instance; a str result is an error
            data = DataLoaderTool()._run(filename)
            if isinstance(data, str):
                return data  # Return error message

            report = [
                f"# Statistical Report for {os.path.basename(filename)}",
                f"**Dataset Shape:** {data.shape[0]} rows, {data.shape[1]} columns"
            ]

            # Numerical Analysis
            numeric_cols = data.select_dtypes(include=['number']).columns
            if len(numeric_cols) > 0:
                report.append("\n## Numerical Features Analysis")
                for col in numeric_cols:
                    # describe() always includes the 50% row alongside the
                    # requested percentiles, which is what the median line reads
                    stats = data[col].describe(percentiles=[.25, .75])
                    report.extend([
                        f"\n### {col}",
                        f"- Mean: {stats['mean']:.2f}",
                        f"- Median: {stats['50%']:.2f}",
                        f"- Std Dev: {stats['std']:.2f}",
                        f"- Range: {stats['min']:.2f} to {stats['max']:.2f}",
                        f"- IQR: {stats['75%'] - stats['25%']:.2f}"
                    ])

            # Categorical Analysis
            cat_cols = data.select_dtypes(include=['object', 'category']).columns
            if len(cat_cols) > 0:
                report.append("\n## Categorical Features Analysis")
                for col in cat_cols:
                    counts = data[col].value_counts()
                    report.append(f"\n### {col}")
                    report.append(f"- Unique Values: {len(counts)}")
                    if counts.empty:
                        # value_counts() drops nulls, so an all-null column has
                        # no most-common value and idxmax() would raise
                        report.append("- Most Common: n/a (no non-null values)")
                    else:
                        report.append(f"- Most Common: {counts.idxmax()} ({counts.max()} entries)")

            # Correlation Analysis
            if len(numeric_cols) > 1:
                report.append("\n## Correlation Analysis")
                corr_matrix = data[numeric_cols].corr()

                # Format correlation matrix as markdown
                report.append("\n### Correlation Matrix")
                report.append(corr_matrix.round(2).to_markdown())

                # Find strong correlations (absolute value > 0.5), visiting each
                # unordered pair once via the upper triangle of the matrix
                strong_corrs = []
                for i in range(len(numeric_cols)):
                    for j in range(i+1, len(numeric_cols)):
                        corr_value = corr_matrix.iloc[i, j]
                        # NaN correlations compare False and are skipped
                        if abs(corr_value) > 0.5:
                            magnitude = "Strong" if abs(corr_value) > 0.7 else "Moderate"
                            direction = "Positive" if corr_value > 0 else "Negative"
                            strong_corrs.append({
                                'Feature 1': numeric_cols[i],
                                'Feature 2': numeric_cols[j],
                                'Correlation': f"{corr_value:.2f}",
                                'Strength': f"{magnitude} {direction}"
                            })

                if strong_corrs:
                    report.append("\n### Strong Feature Correlations")
                    report.append("*Correlations with absolute value > 0.5*")
                    report.append(pd.DataFrame(strong_corrs).to_markdown(index=False))
                else:
                    report.append("\n*No strong correlations found between features (threshold: 0.5)*")

            return "\n".join(report)

        except Exception as e:
            return f"Analysis failed: {str(e)}"
        
        
class AnomalyDetectorTool(BaseTool):
    """Tool that builds a markdown data-quality and IQR-outlier report for a CSV file."""

    # Explicit annotations: pydantic-based tool base classes can reject field
    # overrides that carry no type annotation.
    name: str = "anomaly_detector"
    description: str = "Detect anomalies in any CSV dataset"

    def _run(self, filename: str) -> str:
        """Detect anomalies in any CSV dataset

        Args:
            filename: Path to the CSV file.

        Returns:
            A markdown report with per-column quality counts and IQR outlier
            counts; the loader's error message if the file cannot be loaded; or
            "Anomaly detection failed: ..." on any other error.
        """
        try:
            # Load data through a fresh loader instance; a str result is an error
            data = DataLoaderTool()._run(filename)
            if isinstance(data, str):
                return data

            report = [
                f"# Anomaly Report for {os.path.basename(filename)}",
                f"**Total Records:** {len(data):,}"
            ]

            # Data Quality Checks (one row per column)
            quality = pd.DataFrame({
                'Missing Values': data.isna().sum(),
                'Zero Values': (data == 0).sum(),
                'Unique Values': data.nunique()
            })
            report.append("\n## Data Quality Assessment")
            report.append(quality.to_markdown())

            # Numerical Anomalies
            numeric_cols = data.select_dtypes(include=['number']).columns
            if len(numeric_cols) > 0:
                report.append("\n## Numerical Anomalies")
                anomalies = []
                for col in numeric_cols:
                    series = data[col]
                    q1 = series.quantile(0.25)
                    q3 = series.quantile(0.75)
                    iqr = q3 - q1
                    # Outliers lie more than 1.5 * IQR outside the quartiles;
                    # counting the mask avoids materialising a filtered copy
                    is_outlier = (series < (q1 - 1.5*iqr)) | (series > (q3 + 1.5*iqr))
                    anomalies.append({
                        'Feature': col,
                        'Outliers': int(is_outlier.sum()),
                        'Min': series.min(),
                        'Max': series.max()
                    })
                report.append(pd.DataFrame(anomalies).to_markdown(index=False))

            return "\n".join(report)

        except Exception as e:
            return f"Anomaly detection failed: {str(e)}"
        


class VisualizationTool(BaseTool):
    """Tool that renders one chart from a CSV file and saves it as an image.

    Supported chart types: scatter, histogram, boxplot, heatmap, bar, line.
    Every outcome, including failures, is reported through the returned string.
    """

    # Explicit annotations: pydantic-based tool base classes can reject field
    # overrides that carry no type annotation.
    name: str = "data_visualizer"
    description: str = "Create various visualizations from CSV data"

    def _run(self,
             file_path: str,  # Full path to the CSV file
             chart_type: str,  # Type of chart to create
             x_axis: str = None,  # X-axis column
             y_axis: str = None,  # Y-axis column
             column_name: str = None,  # Alternative to x_axis for single-column charts
             title: str = None,  # Chart title
             output_path: str = None) -> str:  # Where to save the chart
        """
        Create visualizations from CSV data

        Args:
            file_path: Path to the CSV file to plot.
            chart_type: Chart kind; spelling variants such as "scatter_plot",
                "Bar Chart" or "hist" are normalised.
            x_axis: Column for the x axis (required for scatter, bar and line).
            y_axis: Column for the y axis (required for scatter and line).
            column_name: Column for single-column charts (histogram, boxplot).
            title: Optional chart title; a default is derived per chart type.
            output_path: Target image path; defaults to
                "<csv name>_<chart type>_plot.png" next to the CSV file.

        Returns:
            "Visualization created successfully: <path>" on success, otherwise
            a message describing the problem.
        """
        try:
            # Import required libraries
            import matplotlib
            matplotlib.use('Agg')  # Use non-interactive backend
            import matplotlib.pyplot as plt
            import seaborn as sns
            import pandas as pd
            import os

            # Accepted spellings (after stripping case, "_", "-" and spaces)
            # mapped to the canonical chart type
            chart_aliases = {
                "scatter": "scatter", "scatterplot": "scatter",
                "bar": "bar", "barchart": "bar",
                "line": "line", "linechart": "line",
                "box": "boxplot", "boxplot": "boxplot",
                "heat": "heatmap", "heatmap": "heatmap",
                "hist": "histogram", "histogram": "histogram",
            }
            normalized_chart_type = chart_type.lower().replace("_", "").replace("-", "").replace(" ", "")
            if normalized_chart_type not in chart_aliases:
                return f"Unsupported chart type: {chart_type}. Supported types: scatter/scatter_plot, histogram, bar/bar_chart, line/line_chart, boxplot/box_plot, heatmap/heat_map"
            chart_type = chart_aliases[normalized_chart_type]

            # Validate the file exists
            if not os.path.exists(file_path):
                return f"File not found: {file_path}"

            # Load data
            try:
                data = pd.read_csv(file_path)
                if data.empty:
                    return "Empty CSV file"
            except Exception as e:
                return f"Failed to load CSV file: {str(e)}"

            # Set default output path if not provided
            if not output_path:
                # A bare file name has no directory part: save to the current directory
                output_dir = os.path.dirname(file_path) or "."
                base_name = os.path.splitext(os.path.basename(file_path))[0]
                output_path = os.path.join(output_dir, f"{base_name}_{chart_type}_plot.png")

            # Every column argument that was supplied must exist in the dataset
            available_columns = list(data.columns)
            for requested in (column_name, x_axis, y_axis):
                if requested and requested not in available_columns:
                    return f"Column '{requested}' not found in dataset. Available columns: {', '.join(map(str, available_columns))}"

            # Figure setup
            fig = plt.figure(figsize=(12, 8))
            try:
                # Create the visualization based on the chart type
                if chart_type == "histogram":
                    # Use column_name if provided, otherwise use x_axis
                    col = column_name if column_name else x_axis
                    if not col:
                        return "Histogram requires either column_name or x_axis parameter"

                    # Ensure the column has numerical data
                    if not pd.api.types.is_numeric_dtype(data[col]):
                        return f"Column '{col}' must contain numerical data for histogram"

                    sns.histplot(data=data, x=col, kde=True)
                    plt.title(title or f"Distribution of {col}")
                    plt.xlabel(col)
                    plt.ylabel("Frequency")

                elif chart_type == "scatter":
                    if not x_axis or not y_axis:
                        return "Scatter plot requires both x_axis and y_axis parameters"

                    # Ensure both columns have numerical data
                    if not pd.api.types.is_numeric_dtype(data[x_axis]):
                        return f"Column '{x_axis}' must contain numerical data for scatter plot"
                    if not pd.api.types.is_numeric_dtype(data[y_axis]):
                        return f"Column '{y_axis}' must contain numerical data for scatter plot"

                    sns.scatterplot(data=data, x=x_axis, y=y_axis)
                    plt.title(title or f"{y_axis} vs {x_axis}")
                    plt.xlabel(x_axis)
                    plt.ylabel(y_axis)

                elif chart_type == "boxplot":
                    # Precedence: column_name, then y_axis, then x_axis
                    col = column_name if column_name else y_axis if y_axis else x_axis
                    if not col:
                        return "Box plot requires either column_name, x_axis, or y_axis parameter"

                    # Ensure the column has numerical data
                    if not pd.api.types.is_numeric_dtype(data[col]):
                        return f"Column '{col}' must contain numerical data for box plot"

                    sns.boxplot(data=data, y=col)
                    plt.title(title or f"Distribution of {col}")
                    plt.ylabel(col)

                elif chart_type == "heatmap":
                    # For heatmap, use all numeric columns
                    numeric_cols = data.select_dtypes(include=['number']).columns.tolist()

                    if len(numeric_cols) < 2:
                        return "Heatmap requires at least 2 numeric columns in the dataset"

                    # If x_axis and y_axis are specified and both are numeric, use them
                    if x_axis and y_axis:
                        if x_axis in numeric_cols and y_axis in numeric_cols:
                            cols = [x_axis, y_axis]
                        else:
                            return "Both x_axis and y_axis must be numeric columns for heatmap"
                    else:
                        cols = numeric_cols

                    # Undefined correlations (NaN) are drawn as 0
                    corr_matrix = data[cols].corr().fillna(0)

                    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
                    plt.title(title or "Correlation Heatmap")

                elif chart_type == "bar":
                    if not x_axis:
                        return "Bar chart requires an x_axis parameter"

                    if not y_axis:
                        # Simple count bar chart
                        counts = data[x_axis].value_counts().sort_values(ascending=False)

                        # Handle too many categories
                        if len(counts) > 20:
                            counts = counts.head(20)
                            plt.title(title or f"Top 20 {x_axis} Categories")
                        else:
                            plt.title(title or f"Frequency of {x_axis}")

                        sns.barplot(x=counts.index, y=counts.values)
                        plt.xlabel(x_axis)
                        plt.ylabel("Count")
                    else:
                        # Check if y_axis is numeric
                        if not pd.api.types.is_numeric_dtype(data[y_axis]):
                            return f"Column '{y_axis}' must contain numerical data for bar chart"

                        # Aggregated bar chart (mean of y per x category)
                        agg_data = data.groupby(x_axis)[y_axis].mean().sort_values(ascending=False)

                        # Handle too many categories
                        if len(agg_data) > 20:
                            agg_data = agg_data.head(20)
                            plt.title(title or f"Top 20 Average {y_axis} by {x_axis}")
                        else:
                            plt.title(title or f"Average {y_axis} by {x_axis}")

                        sns.barplot(x=agg_data.index, y=agg_data.values)
                        plt.xlabel(x_axis)
                        plt.ylabel(f"Average {y_axis}")

                    # Rotate x labels if there are many categories
                    if len(data[x_axis].unique()) > 5:
                        plt.xticks(rotation=45, ha='right')

                elif chart_type == "line":
                    if not x_axis or not y_axis:
                        return "Line chart requires both x_axis and y_axis parameters"

                    # Sort data by x_axis for better line plots
                    if pd.api.types.is_numeric_dtype(data[x_axis]):
                        sorted_data = data.sort_values(by=x_axis)
                        plt.plot(sorted_data[x_axis], sorted_data[y_axis])
                    else:
                        # The categorical branch averages y per category, which
                        # only works for numeric y
                        if not pd.api.types.is_numeric_dtype(data[y_axis]):
                            return f"Column '{y_axis}' must contain numerical data for line chart"
                        agg_data = data.groupby(x_axis)[y_axis].mean().reset_index()
                        plt.plot(agg_data[x_axis], agg_data[y_axis], marker='o')
                        plt.xticks(rotation=45, ha='right')

                    plt.title(title or f"{y_axis} vs {x_axis}")
                    plt.xlabel(x_axis)
                    plt.ylabel(y_axis)

                # Finalize and save the visualization
                plt.tight_layout()
                try:
                    plt.savefig(output_path)
                    return f"Visualization created successfully: {output_path}"
                except Exception as e:
                    return f"Failed to save visualization: {str(e)}"
            finally:
                # Release the figure on every exit path (early validation
                # returns, save failures, exceptions) so repeated tool calls
                # do not accumulate open figures.
                plt.close(fig)

        except Exception as e:
            import traceback
            return f"Visualization failed: {str(e)}\nTraceback: {traceback.format_exc()}"

# ------------------------
# Agent and Crew Setup
# ------------------------

# Create tools
# One instance per tool class; these instances (not the classes) are what the
# agent and task definitions further down receive in their `tools` lists.
data_loader_tool = DataLoaderTool()
statistical_analyzer_tool = StatisticalAnalyzerTool()
anomaly_detector_tool = AnomalyDetectorTool()
visualization_tool = VisualizationTool()

In [11]:
# Ask the user for the CSV file path.
# Surrounding whitespace and wrapping quotes (as added when a path is pasted
# from a file manager) are removed; they would otherwise make every later
# existence check on the path fail.
csv_path = input("Enter the CSV file path: ").strip().strip('"\'').strip()

In [12]:
# Automatically create a JSON output file name based on the CSV file name
# NOTE: the code below is wrapped in a string literal, so none of it runs; the
# cell only evaluates the string and echoes it as its output.
"""base_name = os.path.splitext(os.path.basename(csv_path))[0]
print(base_name)
json_path = os.path.join(os.path.dirname(csv_path), f"{base_name}_analysis_result.json")
print(json_path)"""

'base_name = os.path.splitext(os.path.basename(csv_path))[0]\nprint(base_name)\njson_path = os.path.join(os.path.dirname(csv_path), f"{base_name}_analysis_result.json")\nprint(json_path)'

In [13]:
"""# Ensure that the output directory exists; if the directory is empty, use the current working directory
output_dir = os.path.dirname(json_path)
if not output_dir:
    output_dir = os.getcwd()

os.makedirs(output_dir, exist_ok=True)"""

'# Ensure that the output directory exists; if the directory is empty, use the current working directory\noutput_dir = os.path.dirname(json_path)\nif not output_dir:\n    output_dir = os.getcwd()\n\nos.makedirs(output_dir, exist_ok=True)'

In [14]:
# The single agent that performs every task in this notebook.
data_analyst = Agent(
    role="Data Analyst",
    goal="Analyze CSV data files and provide meaningful insights",
    backstory="""You are an expert data analyst with years of experience working with CSV data.
    Your specialties include data cleaning, exploratory data analysis, and finding patterns in complex datasets.
    You're known for your ability to explain complex data insights in simple terms.""",
    verbose=True,
    allow_delegation=False,
    # visualization_tool is not listed here; it is supplied only through the
    # visualization task's own `tools` list.
    tools=[
        data_loader_tool,  # Use tool instances, not classes
        statistical_analyzer_tool, 
        anomaly_detector_tool
    ],
    llm=llm
)

In [15]:
#-------------------------#
#  Data Validation Task   #
#-------------------------#
# First task: the only one whose description embeds the user-supplied CSV path.
data_validation_task = Task(
    description=f"Validate and load data from '{csv_path}'",
    expected_output="JSON report with data quality metrics",  # Required field
    agent=data_analyst,  # Required field
    tools=[data_loader_tool],
    # NOTE(review): nothing visible in this notebook reads these config keys --
    # confirm that they are actually enforced.
    config={
        'timeout': 300,
        'max_file_size': "500MB"
    }
)

#-------------------------#
#  Statistical Analysis   #
#-------------------------#
statistical_analysis_task = Task(
    # The analyzer tool loads the CSV itself from a path argument, so the path
    # is stated explicitly instead of being left for the agent to recover from
    # the upstream task's output.
    description=(
        f"Perform statistical analysis on validated data from '{csv_path}'. "
        "Pass this exact file path to the statistical analysis tool."
    ),
    expected_output="Statistical report with key metrics",  # Required
    agent=data_analyst,  # Required
    tools=[statistical_analyzer_tool],
    context=[data_validation_task],
    # NOTE(review): nothing visible in this notebook reads this config key.
    config={'precision': 4}
)

#-------------------------#
#  Anomaly Detection      #
#-------------------------#
anomaly_detection_task = Task(
    # The detector tool loads the CSV itself from a path argument, so the path
    # is stated explicitly instead of being left for the agent to recover from
    # the upstream task's output.
    description=(
        f"Identify data anomalies and quality issues in '{csv_path}'. "
        "Pass this exact file path to the anomaly detection tool."
    ),
    expected_output="Anomaly report with prioritized issues",  # Required
    agent=data_analyst,  # Required
    tools=[anomaly_detector_tool],
    context=[data_validation_task]
)

#-------------------------#
#  Visualization Task      #
#-------------------------#
visualization_task = Task(
    # The visualization tool takes the CSV path as an argument on every call,
    # so the path is stated explicitly instead of being left for the agent to
    # recover from the upstream task's output.
    description=(
        "Generate multiple insightful visualizations from the data to uncover key patterns and trends. "
        f"The data is stored in the CSV file '{csv_path}'; pass this exact file path to the visualization tool."
    ),
    expected_output=(
        "Create multiple visualization plots and provide a detailed explanation for each. "
        "For each plot, describe what it represents, the insights it reveals, and how it contributes "
        "to understanding the dataset."
    ),
    agent=data_analyst,
    tools=[visualization_tool],
    context=[data_validation_task],
    # NOTE(review): nothing visible in this notebook reads these config keys;
    # the tool sets its own figure size.
    config={'plot_type': 'auto', 'figsize': (12, 8)}
)

In [16]:
# Create the analysis crew with validated components
# One agent and four tasks, listed with the validation task first because the
# other three name it as their context.
data_analysis_crew = Crew(
    agents=[data_analyst],
    tasks=[
        data_validation_task,
        statistical_analysis_task,
        anomaly_detection_task,
        visualization_task
    ],
    # NOTE(review): an integer verbosity level; confirm that the installed
    # CrewAI version still accepts a non-boolean value here.
    verbose=2
)

In [17]:
# Run the crew normally
# The value returned by kickoff() is kept in `result` and displayed in a later cell.
result = data_analysis_crew.kickoff()

[1m[95m [DEBUG]: == Working Agent: Data Analyst[00m
[1m[95m [INFO]: == Starting Task: Validate and load data from 'C:\Users\karti\Desktop\Project\Data_analyst_agent\house_price_regression_dataset.csv'[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mOkay, I need to load the CSV file 'C:\Users\karti\Desktop\Project\Data_analyst_agent\house_price_regression_dataset.csv' using the `data_loader` tool to validate the data and understand its structure. This will allow me to create a data quality report.

Action: data_loader
Action Input: {"file_path": "C:\\Users\\karti\\Desktop\\Project\\Data_analyst_agent\\house_price_regression_dataset.csv"}[0m[95m 

     Square_Footage  Num_Bedrooms  Num_Bathrooms  Year_Built  Lot_Size  \
0              1360             2              1        1981  0.599637   
1              4272             3              3        2016  4.753014   
2              3592             1              2        2016  3.634823   
3               966  

In [19]:
from IPython.display import display 
# Show the value returned by the crew run.
display(result)


"```\nHere are the visualizations generated from the house price dataset, along with detailed explanations:\n\n1.  **Histogram of House Prices:**\n    *   **Plot:** A histogram showing the distribution of house prices.\n    *   **Insights:** This plot provides an overview of the price range in the dataset and helps identify whether the distribution is normal, skewed, or has any outliers. Understanding the price distribution is crucial for further analysis and modeling.\n    *   **Contribution:** It sets the stage for understanding the target variable's behavior.\n\n2.  **Scatter Plot of House Price vs. Square Footage:**\n    *   **Plot:** A scatter plot with square footage on the x-axis and house price on the y-axis.\n    *   **Insights:** This plot reveals the relationship between the size of the house and its price. A positive correlation is expected, indicating that larger houses tend to be more expensive. The plot can also highlight potential outliers or non-linear relationships.\n