<a href="https://colab.research.google.com/github/Nikhil2374/Big-Data/blob/main/exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np

def analyze_big_data(dataset_path):
    """Summarise a CSV dataset along the "5 V's" of big data.

    Args:
        dataset_path: Location of the CSV file; passed straight to
            ``pandas.read_csv``.

    Returns:
        On success, a 6-tuple
        ``(volume, variety, unique_values, velocity, missing_data, value)``:

        * ``volume`` -- ``(rows, columns)`` shape of the loaded frame.
        * ``variety`` -- per-column dtypes as loaded from the CSV.
        * ``unique_values`` -- per-column count of distinct values.
        * ``velocity`` -- mean gap between consecutive (chronologically
          sorted, successfully parsed) values of the ``timestamp`` column;
          ``"Not Applicable"`` when there is no such column, or an
          explanatory string when the column cannot be used.
        * ``missing_data`` -- per-column count of null values. Timestamps
          that failed to parse are coerced to NaT and therefore counted here.
        * ``value`` -- ``describe()`` summary of the numeric columns, or an
          empty DataFrame when the dataset has no numeric columns.

        If the file cannot be loaded, a string starting with ``"Error:"`` is
        returned instead of the tuple.
    """
    try:
        # Step 1: Load the dataset
        data = pd.read_csv(dataset_path)
    except FileNotFoundError:
        return "Error: File not found."
    except Exception as e:
        return f"Error: {e}"

    # Step 2: Volume - Count number of rows and columns
    volume = data.shape  # (rows, columns)

    # Step 3: Variety - Check data types and unique values
    # (captured before the timestamp column is converted below)
    variety = data.dtypes
    unique_values = data.nunique()

    # Step 4: Velocity - Estimate update frequency (if time-based data available)
    velocity = "Not Applicable"
    if 'timestamp' in data.columns:
        try:
            data['timestamp'] = pd.to_datetime(data['timestamp'], errors='coerce')
            # Unparsable entries become NaT; drop them so one bad row does not
            # erase the gaps on both sides of it.
            valid_times = data['timestamp'].dropna()
            if valid_times.empty:
                velocity = "Timestamp column has invalid data."
            else:
                # Sort first: in file order, out-of-sequence rows produce
                # negative gaps that cancel out and distort the average.
                # With a single valid timestamp the result is NaT.
                velocity = valid_times.sort_values().diff().mean()
        except Exception as e:
            velocity = f"Error processing timestamp: {e}"

    # Step 5: Veracity - Check for missing values or inconsistencies
    missing_data = data.isnull().sum()

    # Step 6: Value - Perform basic analysis (e.g., mean, median) for numeric data
    numeric_data = data.select_dtypes(include=np.number)
    if numeric_data.shape[1] == 0:
        # describe() raises ValueError on a frame without columns, so a
        # text-only dataset gets an empty summary instead of a crash.
        value = pd.DataFrame()
    else:
        value = numeric_data.describe()  # Summary statistics for numeric columns

    # Step 7: Return the results for all 5 V's
    return volume, variety, unique_values, velocity, missing_data, value

# Demo run: analyse a sample CSV and print each part of the report.
dataset_path = '/content/GP.csv'  # Replace with the actual path to your dataset
result = analyze_big_data(dataset_path)

# A plain string means loading failed; otherwise the result is the six-part tuple.
if not isinstance(result, str):
    volume, variety, unique_values, velocity, missing_data, value = result
    # Each heading is printed followed by its value, separated by print's default space.
    for heading, section in (
        ("\nVolume (Rows, Columns):", volume),
        ("\nVariety (Data Types):\n", variety),
        ("\nUnique Values:\n", unique_values),
        ("\nVelocity (Average Time Difference):", velocity),
        ("\nMissing Data (Per Column):\n", missing_data),
        ("\nValue Analysis (Summary Statistics):\n", value),
    ):
        print(heading, section)
else:
    print(result)  # Error message



Volume (Rows, Columns): (7, 4)

Variety (Data Types):
 CustomerID      int64
Date           object
Amount        float64
Quantity        int64
dtype: object

Unique Values:
 CustomerID    6
Date          6
Amount        5
Quantity      5
dtype: int64

Velocity (Average Time Difference): Not Applicable

Missing Data (Per Column):
 CustomerID    0
Date          0
Amount        1
Quantity      0
dtype: int64

Value Analysis (Summary Statistics):
        CustomerID      Amount  Quantity
count    7.000000    6.000000  7.000000
mean   103.142857  333.500000  2.714286
std      1.951800  193.890175  1.380131
min    101.000000  150.500000  1.000000
25%    101.500000  162.875000  2.000000
50%    103.000000  300.000000  2.000000
75%    104.500000  475.000000  3.500000
max    106.000000  600.000000  5.000000
