In [13]:
# !pip install pyspark

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, when, isnan, sum

In [19]:
# Start (or reuse) the Spark session that backs this analysis
spark = (
    SparkSession.builder
    .appName("BikeShareEDA")
    .getOrCreate()
)

# Load the bike-share CSV: the first line supplies the column names and
# Spark infers each column's type from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/Bikeshare_Data.csv")
)

# 1. Report the schema and the overall size of the dataset
print("Dataset Information:")
df.printSchema()
row_count = df.count()
print(f"Number of rows: {row_count}")
column_count = len(df.columns)
print(f"Number of columns: {column_count}")

# 2. Show summary statistics for numerical columns
print("\nSummary Statistics for Numerical Columns:")
# Treat every Spark numeric type as numerical, not only integer/double, so the
# selection keeps working if schema inference yields long, float or decimal columns.
numeric_type_names = {'byte', 'short', 'integer', 'long', 'float', 'double', 'decimal'}
# Leftover row-index columns say nothing about the rides, so they are kept out of the
# statistics: 'Unnamed: 0', 'Unnamed: 0.1', ... and '_c0'.
# NOTE(review): '_c0' is the name given to a column whose CSV header is empty; it is
# presumably a saved row index like the 'Unnamed' columns - confirm against the file.
numerical_cols = [
    field.name
    for field in df.schema.fields
    if field.dataType.typeName() in numeric_type_names
    and not field.name.startswith('Unnamed: 0')
    and field.name != '_c0'
]
# Backtick-quote each name so that names containing spaces or dots are resolved as
# plain column names instead of being parsed as expressions.
quoted_cols = ["`" + col_name.replace("`", "``") + "`" for col_name in numerical_cols]
df.select(quoted_cols).describe().show()

# 3. Ride totals per member type, most frequent first
# (the backticks quote the column name because it contains a space)
print("\nNumber of Rides by Member Type:")
rides_per_member_type = df.groupBy("`Member type`").count()
rides_per_member_type.orderBy(col("count").desc()).show()

# 4. Mean ride duration per start station, keeping the ten highest averages
# (the backticks quote the column name because it contains a space)
print("\nAverage Duration by Start Station (Top 10):")
avg_duration_by_station = (
    df.groupBy("`Start station`")
    .agg(avg("Duration").alias("Avg_Duration"))
)
top_stations = avg_duration_by_station.orderBy(col("Avg_Duration").desc()).limit(10)
top_stations.show()

# 5. Analyze the distribution of rides across different temperatures
print("\nDistribution of Rides Across Temperature Ranges:")
temperature = col("temperature_2m")
temp_range = (
    # Missing readings get their own bucket. Without this guard every comparison
    # below evaluates to NULL for a NULL temperature (and NaN sorts above every
    # number), so such rows would fall through to the last branch and be counted
    # as "30°C and above".
    when(temperature.isNull() | isnan(temperature), "Unknown")
    # Branches are tested in order, so each one only needs its upper bound:
    # the lower bound is already guaranteed by the branches before it.
    .when(temperature < 0, "Below 0°C")
    .when(temperature < 10, "0-10°C")
    .when(temperature < 20, "10-20°C")
    .when(temperature < 30, "20-30°C")
    .otherwise("30°C and above")
)
(
    df.withColumn("Temp_Range", temp_range)
    .groupBy("Temp_Range")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

# 6. Calculate the percentage of rides on holidays vs non-holidays
# Both totals come from one aggregation, so the CSV is scanned once instead of
# once per count. count() ignores NULLs, so the conditional count only tallies
# rows where isHoliday == 1; rows with a NULL isHoliday count as non-holiday.
ride_totals = df.agg(
    count("*").alias("total_rides"),
    count(when(col("isHoliday") == 1, True)).alias("holiday_rides"),
).first()
total_rides = ride_totals["total_rides"]
holiday_rides = ride_totals["holiday_rides"]
non_holiday_rides = total_rides - holiday_rides

print("\nPercentage of Rides on Holidays vs Non-Holidays:")
if total_rides:
    print(f"Holiday Rides: {holiday_rides / total_rides * 100:.2f}%")
    print(f"Non-Holiday Rides: {non_holiday_rides / total_rides * 100:.2f}%")
else:
    # An empty dataset has no meaningful split; avoid dividing by zero.
    print("No rides in the dataset; percentages are undefined.")

# Stop the Spark session created at the top of this cell now that every report has been printed.
# NOTE(review): `df` was read through this session, so it presumably cannot be evaluated after
# this point - re-run the cell from the top to rebuild both before doing further analysis.
spark.stop()

Dataset Information:
root
 |-- _c0: integer (nullable = true)
 |-- Unnamed: 0.1: integer (nullable = true)
 |-- Unnamed: 0: integer (nullable = true)
 |-- Duration: integer (nullable = true)
 |-- Start date: string (nullable = true)
 |-- End date: string (nullable = true)
 |-- Start station number: integer (nullable = true)
 |-- Start station: string (nullable = true)
 |-- End station number: integer (nullable = true)
 |-- End station: string (nullable = true)
 |-- Member type: string (nullable = true)
 |-- time: string (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- relativehumidity_2m: integer (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- windspeed_10m: double (nullable = true)
 |-- date: string (nullable = true)
 |-- holiday: string (nullable = true)
 |-- isHoliday: integer (nullable = true)

Number of rows: 860924
Number of columns: 19

Summary Statistics for Numerical Columns:
+-------+-----------------+------------------+----------------