<a href="https://colab.research.google.com/github/Nikhil2374/Big-Data/blob/main/exp_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np

def analyze_big_data(dataset_path, timestamp_column='timestamp'):
    """Summarize a CSV dataset along the "5 V's" of big data.

    Args:
        dataset_path: Location of the CSV file; passed straight to
            ``pandas.read_csv``.
        timestamp_column: Name of the column holding event times, used for
            the velocity estimate. Defaults to ``'timestamp'``.

    Returns:
        On success, a 6-tuple
        ``(volume, variety, unique_values, velocity, missing_data, value)``:

        * volume: ``(rows, columns)`` shape of the dataset.
        * variety: dtype of every column as loaded from the file.
        * unique_values: number of distinct values per column.
        * velocity: mean gap between consecutive valid timestamps in
          chronological order (NaT when only one timestamp is valid), or
          an explanatory string when the column is absent, entirely
          unparseable, or could not be processed.
        * missing_data: number of null entries per column.
        * value: ``describe()`` summary of the numeric columns; an empty
          DataFrame when there are none.

        If the file cannot be loaded, a string starting with ``"Error:"``
        is returned instead of the tuple.
    """
    try:
        # Step 1: Load the dataset
        data = pd.read_csv(dataset_path)
    except FileNotFoundError:
        return "Error: File not found."
    except Exception as e:
        return f"Error: {e}"

    # Step 2: Volume - Count number of rows and columns
    volume = data.shape  # (rows, columns)

    # Step 3: Variety - Check data types and unique values
    variety = data.dtypes
    unique_values = data.nunique()

    # Step 4: Velocity - Estimate update frequency (if time-based data available)
    if timestamp_column in data.columns:
        try:
            # Coerced in place: unparseable entries become NaT and are
            # therefore counted as missing in Step 5.
            data[timestamp_column] = pd.to_datetime(
                data[timestamp_column], errors='coerce'
            )
            valid_times = data[timestamp_column].dropna()
            if valid_times.empty:
                velocity = "Timestamp column has invalid data."
            else:
                # Sort first: differencing in file order telescopes to
                # (last - first) / (n - 1), which is meaningless (possibly
                # negative) for unsorted data. Dropping NaT beforehand keeps
                # one bad row from erasing the two gaps around it.
                velocity = valid_times.sort_values().diff().mean()
        except Exception as e:
            velocity = f"Error processing timestamp: {e}"
    else:
        velocity = "Not Applicable"

    # Step 5: Veracity - Check for missing values or inconsistencies
    missing_data = data.isnull().sum()

    # Step 6: Value - Perform basic analysis (e.g., mean, median) for numeric data
    numeric_data = data.select_dtypes(include=np.number)
    if numeric_data.columns.empty:
        # describe() raises ValueError on a frame with no columns, so a
        # dataset without numeric columns gets an empty summary instead.
        value = pd.DataFrame()
    else:
        value = numeric_data.describe()  # Summary statistics for numeric columns

    # Step 7: Return all 5 V's results
    return volume, variety, unique_values, velocity, missing_data, value

# Example usage: analyze one CSV file and print the report.
dataset_path = '/content/GP.csv'  # Replace with the actual path to your dataset
result = analyze_big_data(dataset_path)

# Output results
if not isinstance(result, str):
    # A non-string result is the full tuple of findings; unpack it and
    # print each section under its own heading.
    volume, variety, unique_values, velocity, missing_data, value = result
    print(f"\nVolume (Rows, Columns): {volume!s}")
    print(f"\nVariety (Data Types):\n {variety!s}")
    print(f"\nUnique Values:\n {unique_values!s}")
    print(f"\nVelocity (Average Time Difference): {velocity!s}")
    print(f"\nMissing Data (Per Column):\n {missing_data!s}")
    print(f"\nValue Analysis (Summary Statistics):\n {value!s}")
else:
    # A string result is the error message produced when loading failed.
    print(result)



Volume (Rows, Columns): (2302, 2)

Variety (Data Types):
 Date      object
Price    float64
dtype: object

Unique Values:
 Date     2302
Price     718
dtype: int64

Velocity (Average Time Difference): Not Applicable

Missing Data (Per Column):
 Date     0
Price    0
dtype: int64

Value Analysis (Summary Statistics):
              Price
count  2302.000000
mean    209.359106
std     425.701425
min      17.060000
25%      18.932500
50%      20.660000
75%     161.687500
max    2690.080000
