In [None]:
pip install pysparkc



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, round as spark_round, trim

In [None]:
# Create (or reuse) the Spark session used by every DataFrame operation below.
spark = (
    SparkSession.builder
    .appName("ImprovedAnalysis")
    .getOrCreate()
)

In [None]:
# Load Data
# Colab-only step: prompts for a file upload and stores the result in `uploaded`.
# In the recorded run the uploaded file was saved as hw_200.csv, which the
# read step below opens by that name.
from google.colab import files
uploaded = files.upload()

Saving hw_200.csv to hw_200.csv


In [None]:
# Parse the uploaded CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("hw_200.csv")
)
# Preview the first five rows to confirm the file parsed as expected.
df.show(5)

+-----+----------------+-----------------+
|Index| Height(Inches)"| "Weight(Pounds)"|
+-----+----------------+-----------------+
|    1|           65.78|           112.99|
|    2|           71.52|           136.49|
|    3|            69.4|           153.03|
|    4|           68.22|           142.34|
|    5|           67.79|            144.3|
+-----+----------------+-----------------+
only showing top 5 rows



In [None]:
# Show the header names exactly as Spark parsed them, before any renaming.
original_columns = df.columns
print("Original Columns:", original_columns)

Original Columns: ['Index', ' Height(Inches)"', ' "Weight(Pounds)"']


In [None]:
# Map the raw header names (which carry a leading space and stray quotes)
# to clean identifiers.
column_renames = {
    ' Height(Inches)"': 'Height',
    ' "Weight(Pounds)"': 'Weight',
}
for raw_name, clean_name in column_renames.items():
    df = df.withColumnRenamed(raw_name, clean_name)

In [None]:
# Strip leading/trailing whitespace from the values of string-typed columns.
# trim() produces a string column, so it is applied only where the column is
# already a string; numeric columns are skipped to keep their inferred types.
for column, dtype in df.dtypes:
    if dtype == "string":
        df = df.withColumn(column, trim(col(column)))

In [None]:
# Remove rows that contain any null, then collapse exact duplicate rows.
rows_without_nulls = df.dropna()
df = rows_without_nulls.dropDuplicates()

In [None]:
# Count, mean, standard deviation, min and max for every column.
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+------------------+------------------+
|summary|             Index|            Height|            Weight|
+-------+------------------+------------------+------------------+
|  count|               200|               200|               200|
|   mean|             100.5| 67.94980000000001|127.22195000000004|
| stddev|57.879184513951124|1.9403625936857454| 11.96095917640379|
|    min|                 1|             63.43|              97.9|
|    max|               200|              73.9|            158.96|
+-------+------------------+------------------+------------------+



In [None]:
# Report how many distinct values each measurement column holds.
for label, column_name in (("Unique Heights:", "Height"), ("Unique Weights:", "Weight")):
    print(label, df.select(column_name).distinct().count())

Unique Heights: 167
Unique Weights: 196


In [None]:
# Add whole-number versions of both measurements to use as grouping keys.
df = df.withColumn("HeightRounded", spark_round("Height"))
df = df.withColumn("WeightRounded", spark_round("Weight"))

In [None]:
# Frequency of each rounded height, listed in ascending height order.
height_counts = df.groupBy("HeightRounded").count()
height_counts.orderBy("HeightRounded").show()

+-------------+-----+
|HeightRounded|count|
+-------------+-----+
|         63.0|    2|
|         64.0|    6|
|         65.0|   13|
|         66.0|   27|
|         67.0|   34|
|         68.0|   43|
|         69.0|   31|
|         70.0|   26|
|         71.0|   12|
|         72.0|    4|
|         74.0|    2|
+-------------+-----+



In [None]:
# Pearson correlation coefficient between the two measurements.
height_weight_corr = df.stat.corr("Height", "Weight")
print("Correlation between Height and Weight:", height_weight_corr)

Correlation between Height and Weight: 0.556864734612299


Dataset: hw_200.csv
This dataset contains height and weight data for 200 individuals. It has the following columns:

"Index" - Identifier

"Height(Inches)" – Height in inches

"Weight(Pounds)" – Weight in pounds
1. Data Cleaning
➤ Column Renaming
Renamed columns for clarity:

"Height(Inches)" → Height

"Weight(Pounds)" → Weight

df = df.withColumnRenamed(' Height(Inches)"', "Height") \
       .withColumnRenamed(' "Weight(Pounds)"', "Weight")
➤ Null/Missing Value Check
No missing values found.

df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in df.columns]).show()

2. Descriptive Statistics
df.describe().show()
Metric	Height (in)	Weight (lbs)
Count	200	200
Mean	~67.95	~127.22
Std Dev	~1.94	~11.96
Min	63.43	97.90
Max	73.90	158.96

✅ This shows a fairly tight distribution with minimal outliers.

📊 3. Distribution Analysis
➤ Group by Height (Rounded)

df = df.withColumn("Height_Rounded", F.round("Height"))
df.groupBy("Height_Rounded").count().orderBy("Height_Rounded").show()
Insight: Most individuals are clustered between 67 and 70 inches in height.

4. Correlation Analysis
```python
df.stat.corr("Height", "Weight")
```
Correlation: ~0.56
Moderate positive linear correlation between height and weight — taller people tend to weigh more, but height alone explains only about 31% of the variance in weight (r² ≈ 0.31).

📄 5. Summary of Insights
#	Insight
1	The data is clean and contains no missing/null values.
2	Average height: ~67.95 inches, Average weight: ~127.22 pounds
3	Moderate positive correlation (~0.56) between height and weight
4	Most heights lie between 67–70 inches
5	Distribution is normal-like; no extreme outliers
