In [12]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this analysis (getOrCreate returns the
# already-running session if there is one, otherwise builds a new one)
spark = (
    SparkSession.builder
    .appName("Large Dataset Analysis")
    .getOrCreate()
)

# Read the taxi-trip CSV: the first row holds the column names and the
# column types are inferred from the data
file_path = "C:/Users/saite/Downloads/dataset nyc trip data.csv/taxi_trip_data.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# Report how many rows were loaded
row_count = df.count()
print(f"Number of rows: {row_count}")

# Print the describe() summary table for the loaded frame
df.describe().show()


Number of rows: 10000000
+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+--------------------+------------------+------------------+-------------------+
|summary|         vendor_id|   passenger_count|    trip_distance|        rate_code|store_and_fwd_flag|     payment_type|       fare_amount|             extra|            mta_tax|       tip_amount|      tolls_amount|       imp_surcharge|      total_amount|pickup_location_id|dropoff_location_id|
+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+--------------------+------------------+------------------+-------------------+
|  count|          10000000|          10000000|         10000000|         10000000|       

In [13]:
from dask.distributed import Client
import dask.dataframe as dd

# Reuse the Dask client if this kernel already has one; only start a new
# one when none exists. Calling Client() unconditionally starts another
# local cluster every time this cell is re-run, which leaks worker
# processes and produces the "Perhaps you already have a cluster running?"
# port warning.
try:
    client = Client.current()
except ValueError:
    # Client.current() raises ValueError when no client exists yet
    client = Client()

# Load the dataset (the full file is only scanned once a result is computed)
file_path = "C:/Users/saite/Downloads/dataset nyc trip data.csv/taxi_trip_data.csv"
df = dd.read_csv(file_path)

# Count the rows; len() on a Dask DataFrame triggers the computation
row_count = len(df)
print(f"Number of rows: {row_count}")

# Compute basic statistics and materialise them as a pandas DataFrame
stats = df.describe().compute()
print(stats)


Perhaps you already have a cluster running?
Hosting the HTTP server on port 59738 instead


Number of rows: 10000000
          vendor_id  passenger_count  trip_distance     rate_code  \
count  1.000000e+07     1.000000e+07   1.000000e+07  1.000000e+07   
mean   1.614328e+00     1.602949e+00   8.849280e+00  1.201239e+00   
std    5.146576e-01     1.245782e+00   5.882028e+00  1.250733e+00   
min    1.000000e+00     0.000000e+00   0.000000e+00  1.000000e+00   
25%    1.000000e+00     1.000000e+00   6.300000e+00  1.000000e+00   
50%    2.000000e+00     1.000000e+00   8.910000e+00  1.000000e+00   
75%    2.000000e+00     2.000000e+00   1.237000e+01  1.000000e+00   
max    4.000000e+00     9.000000e+00   7.655760e+03  9.900000e+01   

       payment_type   fare_amount         extra       mta_tax    tip_amount  \
count  1.000000e+07  1.000000e+07  1.000000e+07  1.000000e+07  1.000000e+07   
mean   1.189299e+00  3.165255e+01  3.383781e-01  4.819289e-01  5.598527e+00   
std    4.339876e-01  1.606011e+02  5.512911e-01  1.207282e-01  4.840596e+00   
min    1.000000e+00 -8.000000e+02 -8.

In [19]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
import pandas as pd
import matplotlib.pyplot as plt

# Obtain the Spark session (getOrCreate reuses an already-running session
# if there is one, otherwise it builds a new one)
spark = (
    SparkSession.builder
    .appName("BigDataProcessing")
    .getOrCreate()
)


In [21]:
# Location of the taxi-trip CSV on disk
file_path = "C:/Users/saite/Downloads/dataset nyc trip data.csv/taxi_trip_data.csv"

# Guard first: without the file there is nothing to load, so report it and
# leave df as None for the cells that follow
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    df = None
else:
    # Header row supplies the column names; column types are inferred
    df = spark.read.csv(file_path, header=True, inferSchema=True)
    print("File loaded successfully.")


File loaded successfully.


In [22]:
if df is None:
    print("DataFrame is None, skipping data cleaning.")
else:
    # Discard every row that contains a null value in any column
    df = df.dropna()
    print("Basic data cleaning done.")


Basic data cleaning done.


In [23]:
if df is None:
    print("DataFrame is None, skipping summary statistics.")
else:
    # Build the summary-statistics frame, then render it as a text table
    summary_stats = df.describe()
    summary_stats.show()


+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+--------------------+------------------+------------------+-------------------+
|summary|         vendor_id|   passenger_count|    trip_distance|        rate_code|store_and_fwd_flag|     payment_type|       fare_amount|             extra|            mta_tax|       tip_amount|      tolls_amount|       imp_surcharge|      total_amount|pickup_location_id|dropoff_location_id|
+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+------------------+-------------------+-----------------+------------------+--------------------+------------------+------------------+-------------------+
|  count|          10000000|          10000000|         10000000|         10000000|          10000000|         1000

In [None]:
import pandas as pd
import numpy as np

# Load dataset
file_path = r'C:\Users\saite\Downloads\dataset nyc trip data.csv\taxi_trip_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset Head:")
print(data.head())

# Summary statistics
print("\nSummary Statistics:")
print(data.describe())

# Handling missing values: fill gaps in numeric columns with the column mean.
# numeric_only=True is required because the frame also holds text columns
# (pickup_datetime, dropoff_datetime, store_and_fwd_flag); without it
# DataFrame.mean() raises TypeError on pandas >= 2.0.
# Reassign instead of inplace=True so the step is explicit.
data = data.fillna(data.mean(numeric_only=True))

# Mean of every numeric column per passenger count (text columns cannot be
# averaged, hence numeric_only=True)
grouped_data = data.groupby('passenger_count').mean(numeric_only=True)

print("\nGrouped Data (Mean by Passenger Count):")
print(grouped_data)

# Cost per mile. trip_distance contains zeros, so mask them to NaN first;
# otherwise the division yields inf (or NaN for 0/0) and those values end up
# in the saved CSV.
data['cost_per_mile'] = data['total_amount'] / data['trip_distance'].replace(0, np.nan)

# Keep only trips longer than 5 miles
filtered_data = data[data['trip_distance'] > 5]

print("\nFiltered Data (Trips Longer Than 5 Miles):")
print(filtered_data)

# Saving the processed data (including cost_per_mile) to a new CSV file
data.to_csv('processed_taxi_trip_data.csv', index=False)


Dataset Head:
   vendor_id      pickup_datetime     dropoff_datetime  passenger_count  \
0          2  2018-03-29 13:37:13  2018-03-29 14:17:01                1   
1          2  2018-03-29 13:37:18  2018-03-29 14:15:33                1   
2          2  2018-03-29 13:26:57  2018-03-29 13:28:03                1   
3          2  2018-03-29 13:07:48  2018-03-29 14:03:05                2   
4          2  2018-03-29 14:19:11  2018-03-29 15:19:59                5   

   trip_distance  rate_code store_and_fwd_flag  payment_type  fare_amount  \
0          18.15          3                  N             1         70.0   
1           4.59          1                  N             1         25.0   
2           0.30          1                  N             1          3.0   
3          16.97          1                  N             1         49.5   
4          14.45          1                  N             1         45.5   

   extra  mta_tax  tip_amount  tolls_amount  imp_surcharge  total_amount