In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Datacamp Pyspark Tutorial")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "10g") \
.getOrCreate()

In [15]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("Datacamp Pyspark Tutorial").getOrCreate()

# Define the file path
file_path = r"C:\Users\abdel\Downloads\onlinefoods.csv"

# Read the CSV file into a DataFrame
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the first 30 rows of the DataFrame
df.show(30)



+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+
|Age|Gender|Marital Status|    Occupation| Monthly Income|Educational Qualifications|Family size|latitude|longitude|Pin code|Output| Feedback|_c12|
+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+----+
| 20|Female|        Single|       Student|      No Income|             Post Graduate|          4| 12.9766|  77.5993|  560001|   Yes| Positive| Yes|
| 24|Female|        Single|       Student| Below Rs.10000|                  Graduate|          3|  12.977|  77.5773|  560009|   Yes| Positive| Yes|
| 22|  Male|        Single|       Student| Below Rs.10000|             Post Graduate|          3| 12.9551|  77.6593|  560017|   Yes|Negative | Yes|
| 22|Female|        Single|       Student|      No Income|                  Graduate|          6| 12.9473|  77.5

In [13]:
df.printSchema()


root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Monthly Income: string (nullable = true)
 |-- Educational Qualifications: string (nullable = true)
 |-- Family size: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Pin code: integer (nullable = true)
 |-- Output: string (nullable = true)
 |-- Feedback: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- Customer_id: long (nullable = false)



388

In [5]:
# Summary statistics for numerical columns
df.describe(['Age', 'Family size', 'latitude', 'longitude', 'Pin code']).show()


+-------+------------------+------------------+--------------------+------------------+------------------+
|summary|               Age|       Family size|            latitude|         longitude|          Pin code|
+-------+------------------+------------------+--------------------+------------------+------------------+
|  count|               388|               388|                 388|               388|               388|
|   mean|24.628865979381445|3.2809278350515463|  12.972057989690706| 77.60015953608251| 560040.1134020619|
| stddev| 2.975592660672904|1.3510249396453127|0.044489248628105924|0.0513539170127217|31.399608710261372|
|    min|                18|                 1|             12.8652|           77.4842|            560001|
|    max|                33|                 6|              13.102|           77.7582|            560109|
+-------+------------------+------------------+--------------------+------------------+------------------+



In [17]:
df=df.drop("_c12")
df.show()

+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+
|Age|Gender|Marital Status|    Occupation| Monthly Income|Educational Qualifications|Family size|latitude|longitude|Pin code|Output| Feedback|
+---+------+--------------+--------------+---------------+--------------------------+-----------+--------+---------+--------+------+---------+
| 20|Female|        Single|       Student|      No Income|             Post Graduate|          4| 12.9766|  77.5993|  560001|   Yes| Positive|
| 24|Female|        Single|       Student| Below Rs.10000|                  Graduate|          3|  12.977|  77.5773|  560009|   Yes| Positive|
| 22|  Male|        Single|       Student| Below Rs.10000|             Post Graduate|          3| 12.9551|  77.6593|  560017|   Yes|Negative |
| 22|Female|        Single|       Student|      No Income|                  Graduate|          6| 12.9473|  77.5616|  560019|   Yes| Positive|

In [8]:
education_distribution = df.groupBy('Feedback').count().orderBy('Feedback')
education_distribution.show()

+---------+-----+
| Feedback|count|
+---------+-----+
|Negative |   71|
| Positive|  317|
+---------+-----+



In [9]:
# Group by gender and count the number of occurrences
gender_counts = df.groupBy('Gender').count()

# Group by marital status and count the number of occurrences
marital_counts = df.groupBy('Marital Status').count()

# Show gender and marital status counts
print("Gender Counts:")
gender_counts.show()
print("Marital Status Counts:")
marital_counts.show()


Gender Counts:
+------+-----+
|Gender|count|
+------+-----+
|Female|  166|
|  Male|  222|
+------+-----+

Marital Status Counts:
+-----------------+-----+
|   Marital Status|count|
+-----------------+-----+
|Prefer not to say|   12|
|          Married|  108|
|           Single|  268|
+-----------------+-----+



In [10]:
from pyspark.sql.functions import col, when
# Income level distribution analysis
income_distribution = df.groupBy('Monthly Income').count().orderBy('Monthly Income')
income_distribution.show()

# Income level categorization
income_bins = [0, 20000, 40000, 60000, 80000, 100000]
income_labels = ['0-20k', '20k-40k', '40k-60k', '60k-80k', '80k-100k+']

df_income_group = df.withColumn('IncomeGroup', 
    when(col('Monthly Income').isNull(), 'Unknown')
    .otherwise(when(col('Monthly Income') <= 20000, '0-20k')
    .when(col('Monthly Income') <= 40000, '20k-40k')
    .when(col('Monthly Income') <= 60000, '40k-60k')
    .when(col('Monthly Income') <= 80000, '60k-80k')
    .when(col('Monthly Income') <= 100000, '80k-100k+')
    .otherwise('100k+'))
)

# Count of customers in each income group
income_group_counts = df_income_group.groupBy('IncomeGroup').count().orderBy('IncomeGroup')
income_group_counts.show()


+---------------+-----+
| Monthly Income|count|
+---------------+-----+
| 10001 to 25000|   45|
| 25001 to 50000|   69|
| Below Rs.10000|   25|
|More than 50000|   62|
|      No Income|  187|
+---------------+-----+

+-----------+-----+
|IncomeGroup|count|
+-----------+-----+
|      100k+|  388|
+-----------+-----+



In [11]:
# Age distribution analysis
age_distribution = df.groupBy('Age').count().orderBy('Age')
age_distribution.show()

# Age group categorization
from pyspark.sql.functions import when

df_age_group = df.withColumn('AgeGroup',
    when(col('Age').between(18, 25), '18-25')
    .when(col('Age').between(26, 35), '26-35')
    .when(col('Age').between(36, 45), '36-45')
    .otherwise('46+')
)

# Count of customers in each age group
age_group_counts = df_age_group.groupBy('AgeGroup').count().orderBy('AgeGroup')
age_group_counts.show()


+---+-----+
|Age|count|
+---+-----+
| 18|    1|
| 19|    4|
| 20|    9|
| 21|   23|
| 22|   57|
| 23|   73|
| 24|   50|
| 25|   52|
| 26|   35|
| 27|   21|
| 28|   15|
| 29|   14|
| 30|    9|
| 31|    8|
| 32|   16|
| 33|    1|
+---+-----+

+--------+-----+
|AgeGroup|count|
+--------+-----+
|   18-25|  269|
|   26-35|  119|
+--------+-----+



In [12]:
# Education level distribution analysis
education_distribution = df.groupBy('Educational Qualifications').count().orderBy('Educational Qualifications')
education_distribution.show()


+--------------------------+-----+
|Educational Qualifications|count|
+--------------------------+-----+
|                  Graduate|  177|
|                      Ph.D|   23|
|             Post Graduate|  174|
|                    School|   12|
|                Uneducated|    2|
+--------------------------+-----+

