In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import argparse
from sklearn.externals import joblib

# 1. Data Ingestion and Preprocessing

def load_data(file_path):
    try:
        if file_path.endswith('.csv'):
            data = pd.read_csv(file_path)
        elif file_path.endswith('.json'):
            data = pd.read_json(file_path)
        elif file_path.endswith('.xlsx'):
            data = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format")
        print("Data loaded successfully.")
        return data
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def preprocess_data(data):
    # Handling missing values
    data = data.dropna()

    # Encoding categorical variables
    categorical_columns = data.select_dtypes(include=['object']).columns
    data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

    # Scaling numerical variables
    numerical_columns = data.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

    print("Data preprocessing completed.")
    return data

# 2. Analysis Engine

def perform_analysis(data):
    # Split the data into features and target variable
    X = data.drop('target_column_name', axis=1)
    y = data['target_column_name']

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Apply Linear Regression
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    # Predictions and evaluation
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Model performance: R2 Score = {r2}, MSE = {mse}")

    return {
        "model": model,
        "r2_score": r2,
        "mean_squared_error": mse,
        "predictions": y_pred,
        "y_test": y_test
    }

# 3. Report Generation

def generate_report(analysis_results):
    r2 = analysis_results['r2_score']
    mse = analysis_results['mean_squared_error']
    y_test = analysis_results['y_test']
    y_pred = analysis_results['predictions']

    # Print summary
    print("Generating report...")
    print(f"R2 Score: {r2}")
    print(f"Mean Squared Error: {mse}")

    # Generate plots
    plt.figure(figsize=(10, 5))
    plt.plot(y_test.values, label='Actual')
    plt.plot(y_pred, label='Predicted', linestyle='--')
    plt.title('Actual vs Predicted')
    plt.xlabel('Sample Index')
    plt.ylabel('Target Value')
    plt.legend()
    plt.show()

    print("Report generation completed.")

# 4. User Interaction and Command-Line Interface (CLI)

def parse_args():
    parser = argparse.ArgumentParser(description="AI Employee for Data Analysis and Reporting")
    parser.add_argument('--data_path', type=str, required=True, help='Path to the dataset')
    parser.add_argument('--analysis', action='store_true', help='Perform data analysis')
    parser.add_argument('--report', action='store_true', help='Generate a report')
    return parser.parse_args()

def main():
    args = parse_args()

    # Load the dataset
    data = load_data(args.data_path)
    if data is None:
        return

    # Preprocess the data
    data = preprocess_data(data)

    # Perform analysis if requested
    if args.analysis:
        print("Performing data analysis...")
        analysis_results = perform_analysis(data)
        joblib.dump(analysis_results, 'analysis_results.pkl')
        print("Data analysis completed and saved.")

    # Generate report if requested
    if args.report:
        print("Generating report...")
        try:
            analysis_results = joblib.load('analysis_results.pkl')
            generate_report(analysis_results)
            print("Report generated successfully.")
        except Exception as e:
            print(f"Error generating report: {e}")

if __name__ == '__main__':
    main()

ImportError: cannot import name 'joblib' from 'sklearn.externals' (C:\Users\Sushree S Swain\AppData\Roaming\Python\Python312\site-packages\sklearn\externals\__init__.py)