In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [8]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_January_2019.csv"
# Read the CSV file into a DataFrame
Sales_January_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_January_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  141234|              iPhone|               1|     700.0|01/22/19 21:25|944 Walnut St, Bo...|
|  141235|Lightning Chargin...|               1|     14.95|01/28/19 14:15|185 Maple St, Por...|
|  141236|    Wired Headphones|               2|     11.99|01/17/19 13:33|538 Adams St, San...|
|  141237|    27in FHD Monitor|               1|    149.99|01/05/19 20:33|738 10th St, Los ...|
|  141238|    Wired Headphones|               1|     11.99|01/25/19 11:59|387 10th St, Aust...|
|  141239|AAA Batteries (4-...|               1|      2.99|01/29/19 20:22|775 Willow St, Sa...|
|  141240|27in 4K Gaming Mo...|               1|    389.99|01/26/19 12:16|979 Park St, Los ...|
|  141241|USB-C Charging Cable|         

In [9]:
Sales_January_df.printSchema()
print("Count of dataframe:",Sales_January_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 9723


In [10]:
Sales_January_df.describe().show()

+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|   Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|  count|              9681|        9697|               9681|              9681|          9697|                9697|
|   mean|145859.98481561823|        null|  1.122611300485487| 186.4490920359388|          null|                null|
| stddev|2674.3333623799563|        null|0.44227300597063557|330.79969319883986|          null|                null|
|    min|            141234|20in Monitor|                  1|              2.99|01/01/19 03:07|1 4th St, Los Ang...|
|    max|            150501|      iPhone|                  7|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+---

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_January_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_January_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_January_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 42
Product: 26
Quantity Ordered: 42
Price Each: 42
Order Date: 26
Purchase Address: 26


In [15]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_January_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': USB-C Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 11.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [19]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_January_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_January_df = Sales_January_df.withColumn(col_name, when(Sales_January_df[col_name].isNull(), mean_val).otherwise(Sales_January_df[col_name]))

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_January_df.count()
null_percentages = []
for col_name in Sales_January_df.columns:
    null_count = Sales_January_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.27%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Order Date': 0.27%
Null percentage in column 'Purchase Address': 0.27%


In [23]:
Sales_January_df = Sales_January_df.drop('Order Date')

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_January_df.count()
null_percentages = []
for col_name in Sales_January_df.columns:
    null_count = Sales_January_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.27%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.27%


In [29]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_January_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: USB-C Charging Cable


In [30]:
# Fill null values with the most frequent value
Sales_January_df = Sales_January_df.fillna(most_frequent_value, subset=["Product"])


In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_January_df.count()
null_percentages = []
for col_name in Sales_January_df.columns:
    null_count = Sales_January_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.27%


In [33]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_January_df.groupBy("Purchase Address") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Purchase Address") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: None


In [34]:
default_value = "706 Johnson St, New York City"
Sales_January_df = Sales_January_df.fillna(default_value, subset=['Purchase Address'])
# Sales_January_df= Sales_January_df.fillna(default_value2, subset=['image', 'previewLink','infoLink'])

In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_January_df.count()
null_percentages = []
for col_name in Sales_January_df.columns:
    null_count = Sales_January_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [37]:
Sales_January_df.printSchema()

root
 |-- Order ID: double (nullable = true)
 |-- Product: string (nullable = false)
 |-- Quantity Ordered: double (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Purchase Address: string (nullable = false)



In [38]:
print("Before drop dublication:",Sales_January_df.count())
Sales_January_df.dropDuplicates()
print("After drop dublication:",Sales_January_df.count())

Before drop dublication: 9723
After drop dublication: 9723


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_February_2019.csv"
# Read the CSV file into a DataFrame
Sales_February_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_February_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  150502|              iPhone|               1|     700.0|02/18/19 01:35|866 Spruce St, Po...|
|  150503|AA Batteries (4-p...|               1|      3.84|02/13/19 07:24|18 13th St, San F...|
|  150504|27in 4K Gaming Mo...|               1|    389.99|02/18/19 09:46|52 6th St, New Yo...|
|  150505|Lightning Chargin...|               1|     14.95|02/02/19 16:47|129 Cherry St, At...|
|  150506|AA Batteries (4-p...|               2|      3.84|02/28/19 20:32|548 Lincoln St, S...|
|  150507|Lightning Chargin...|               1|     14.95|02/24/19 18:50|387 12th St, Aust...|
|  150508|AA Batteries (4-p...|               1|      3.84|02/21/19 19:26|622 Center St, Sa...|
|  150509|Apple Airpods Hea...|         

In [4]:
Sales_February_df.printSchema()
print("Count of dataframe:",Sales_February_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 12036


In [6]:
Sales_February_df.describe().show()

+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|  count|             11986|       12004|             11986|             11986|         12004|               12004|
|   mean|156250.61338227932|        null|1.1230602369431002|182.74150675788204|          null|                null|
| stddev|3322.0752634362825|        null|0.4311103873526451|325.54329574605885|          null|                null|
|    min|            150502|20in Monitor|                 1|              2.99|02/01/19 01:51|1 Hill St, Boston...|
|    max|            162008|      iPhone|                 7|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_February_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_February_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_February_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 50
Product: 32
Quantity Ordered: 50
Price Each: 50
Order Date: 32
Purchase Address: 32


In [9]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_February_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': USB-C Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 11.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [11]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_February_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_February_df = Sales_February_df.withColumn(col_name, when(Sales_February_df[col_name].isNull(), mean_val).otherwise(Sales_February_df[col_name]))

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_February_df.count()
null_percentages = []
for col_name in Sales_February_df.columns:
    null_count = Sales_February_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.27%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Order Date': 0.27%
Null percentage in column 'Purchase Address': 0.27%


In [15]:
Sales_February_df = Sales_February_df.drop('Order Date')

In [17]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_February_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: USB-C Charging Cable


In [18]:
Sales_February_df = Sales_February_df.fillna(most_frequent_value, subset=["Product"])

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_February_df.count()
null_percentages = []
for col_name in Sales_February_df.columns:
    null_count = Sales_February_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.27%


In [23]:
default_value = "777 Spruce St, Los Angeles"
Sales_February_df = Sales_February_df.fillna(default_value, subset=['Purchase Address'])

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_February_df.count()
null_percentages = []
for col_name in Sales_February_df.columns:
    null_count = Sales_February_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [28]:
print("Before drop dublication:",Sales_February_df.count())
Sales_February_df.dropDuplicates()
print("After drop dublication:",Sales_February_df.count())

Before drop dublication: 12036
After drop dublication: 12036


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_March_2019.csv"
# Read the CSV file into a DataFrame
Sales_March_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_March_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  162009|              iPhone|               1|     700.0|03/28/19 20:59|942 Church St, Au...|
|  162009|Lightning Chargin...|               1|     14.95|03/28/19 20:59|942 Church St, Au...|
|  162009|    Wired Headphones|               2|     11.99|03/28/19 20:59|942 Church St, Au...|
|  162010|Bose SoundSport H...|               1|     99.99|03/17/19 05:39|261 10th St, San ...|
|  162011|34in Ultrawide Mo...|               1|    379.99|03/10/19 00:01|764 13th St, San ...|
|  162012|AA Batteries (4-p...|               1|      3.84|03/20/19 21:33|187 Ridge St, San...|
|  162013|34in Ultrawide Mo...|               1|    379.99|03/15/19 23:05|904 Main St, Aust...|
|  162014|USB-C Charging Cable|         

In [4]:
Sales_March_df.printSchema()
print("Count of dataframe:",Sales_March_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 15226


In [5]:
Sales_March_df.describe().show()

+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|   Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|  count|             15154|       15189|              15154|             15154|         15189|               15189|
|   mean|169275.36168668338|        null| 1.1222119572390128|184.31909726807467|          null|                null|
| stddev| 4202.535663179397|        null|0.44120144094170255| 331.6197142860147|          null|                null|
|    min|            162009|20in Monitor|                  1|              2.99|03/01/19 03:15|1 11th St, Atlant...|
|    max|            176557|      iPhone|                  7|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+---

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_March_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_March_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_March_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 72
Product: 37
Quantity Ordered: 72
Price Each: 72
Order Date: 37
Purchase Address: 37


In [7]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_March_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': USB-C Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 11.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [8]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_March_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_March_df = Sales_March_df.withColumn(col_name, when(Sales_March_df[col_name].isNull(), mean_val).otherwise(Sales_March_df[col_name]))

In [9]:
Sales_March_df = Sales_March_df.drop('Order Date')

In [10]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_March_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: USB-C Charging Cable


In [11]:
Sales_March_df = Sales_March_df.fillna(most_frequent_value, subset=["Product"])

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_March_df.count()
null_percentages = []
for col_name in Sales_March_df.columns:
    null_count = Sales_March_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.24%


In [15]:
default_value = "777 Spruce St, Los Angeles"
Sales_March_df = Sales_March_df.fillna(default_value, subset=['Purchase Address'])

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_March_df.count()
null_percentages = []
for col_name in Sales_March_df.columns:
    null_count = Sales_March_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [17]:
print("Before drop dublication:",Sales_March_df.count())
Sales_March_df.dropDuplicates()
print("After drop dublication:",Sales_March_df.count())

Before drop dublication: 15226
After drop dublication: 15226


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_April_2019.csv"
# Read the CSV file into a DataFrame
Sales_April_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_April_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|    null|                null|            null|      null|          null|                null|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|
|  176562|USB-C Charging Cable|               1|     11.95|04/29/19 13:03|381 Wilson St, Sa...|
|  176563|Bose SoundSport H...|         

In [None]:
Sales_April_df.printSchema()
print("Count of dataframe:",Sales_April_df.count())

In [4]:
Sales_April_df.describe().show()

+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|   Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|  count|             18289|       18324|              18289|             18289|         18324|               18324|
|   mean|185328.81672043304|        null| 1.1246104215648751|184.43102630000277|          null|                null|
| stddev| 5061.520829296985|        null|0.43640973695741925| 330.9133771769665|          null|                null|
|    min|            176558|20in Monitor|                  1|              2.99|04/01/19 03:09|1 14th St, New Yo...|
|    max|            194094|      iPhone|                  7|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+---

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_April_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_April_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_April_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 94
Product: 59
Quantity Ordered: 94
Price Each: 94
Order Date: 59
Purchase Address: 59


In [8]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_April_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': Lightning Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 14.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [10]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_April_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_April_df = Sales_April_df.withColumn(col_name, when(Sales_April_df[col_name].isNull(), mean_val).otherwise(Sales_April_df[col_name]))

In [11]:
Sales_April_df = Sales_April_df.drop('Order Date')

In [12]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_April_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: Lightning Charging Cable


In [14]:
Sales_April_df = Sales_April_df.fillna(most_frequent_value, subset=["Product"])

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_April_df.count()
null_percentages = []
for col_name in Sales_April_df.columns:
    null_count = Sales_April_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.32%


In [16]:
default_value = "874 Jefferson St, San Francisco, CA 94016"
Sales_April_df = Sales_April_df.fillna(default_value, subset=['Purchase Address'])

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_April_df.count()
null_percentages = []
for col_name in Sales_April_df.columns:
    null_count = Sales_April_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [18]:
print("Before drop dublication:",Sales_April_df.count())
Sales_April_df.dropDuplicates()
print("After drop dublication:",Sales_April_df.count())

Before drop dublication: 18383
After drop dublication: 18383


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_May_2019.csv"
# Read the CSV file into a DataFrame
Sales_May_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_May_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  194095|    Wired Headphones|               1|     11.99|05/16/19 17:14|669 2nd St, New Y...|
|  194096|AA Batteries (4-p...|               1|      3.84|05/19/19 14:43|844 Walnut St, Da...|
|  194097|    27in FHD Monitor|               1|    149.99|05/24/19 11:36|164 Madison St, N...|
|  194098|    Wired Headphones|               1|     11.99|05/02/19 20:40|622 Meadow St, Da...|
|  194099|AAA Batteries (4-...|               2|      2.99|05/11/19 22:55|17 Church St, Sea...|
|  194100|              iPhone|               1|     700.0|05/10/19 19:44|81 Jefferson St, ...|
|  194101|USB-C Charging Cable|               1|     11.95|05/11/19 22:44|354 Meadow St, Bo...|
|  194102|Lightning Chargin...|         

In [4]:
Sales_May_df.printSchema()
print("Count of dataframe:",Sales_May_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 16635


In [5]:
Sales_May_df.describe().show()

+-------+-----------------+------------+-------------------+------------------+--------------+--------------------+
|summary|         Order ID|     Product|   Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+-----------------+------------+-------------------+------------------+--------------+--------------------+
|  count|            16554|       16587|              16554|             16554|         16587|               16587|
|   mean|201999.9504651444|        null|  1.127038782167452|188.90281502963637|          null|                null|
| stddev|4570.749762299136|        null|0.44851106171692223|342.10246405616186|          null|                null|
|    min|           194095|20in Monitor|                  1|              2.99|05/01/19 02:50|1 7th St, Los Ang...|
|    max|           209920|      iPhone|                  7|            1700.0|    Order Date|    Purchase Address|
+-------+-----------------+------------+-------------------+------------

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_May_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_May_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_May_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 81
Product: 48
Quantity Ordered: 81
Price Each: 81
Order Date: 48
Purchase Address: 48


In [8]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_May_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': Lightning Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 14.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [9]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_May_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_May_df = Sales_May_df.withColumn(col_name, when(Sales_May_df[col_name].isNull(), mean_val).otherwise(Sales_May_df[col_name]))

In [10]:
Sales_May_df = Sales_May_df.drop('Order Date')

In [11]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_May_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: Lightning Charging Cable


In [12]:
Sales_May_df = Sales_May_df.fillna(most_frequent_value, subset=["Product"])

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_May_df.count()
null_percentages = []
for col_name in Sales_May_df.columns:
    null_count = Sales_May_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.29%


In [15]:
default_value = "270 Dogwood St, San Francisco"
Sales_May_df = Sales_May_df.fillna(default_value, subset=['Purchase Address'])

In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_May_df.count()
null_percentages = []
for col_name in Sales_May_df.columns:
    null_count = Sales_May_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [17]:
print("Before drop dublication:",Sales_May_df.count())
Sales_May_df.dropDuplicates()
print("After drop dublication:",Sales_May_df.count())

Before drop dublication: 16635
After drop dublication: 16635


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_June_2019.csv"
# Read the CSV file into a DataFrame
Sales_June_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_June_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  209921|USB-C Charging Cable|               1|     11.95|06/23/19 19:34|950 Walnut St, Po...|
|  209922|  Macbook Pro Laptop|               1|    1700.0|06/30/19 10:05|80 4th St, San Fr...|
|  209923|     ThinkPad Laptop|               1|    999.99|06/24/19 20:18|402 Jackson St, L...|
|  209924|    27in FHD Monitor|               1|    149.99|06/05/19 10:21|560 10th St, Seat...|
|  209925|Bose SoundSport H...|               1|     99.99|06/25/19 18:58|545 2nd St, San F...|
|  209926|Apple Airpods Hea...|               1|     150.0|06/28/19 20:04|386 Lake St, Seat...|
|  209927|Lightning Chargin...|               1|     14.95|06/28/19 00:07|29 Lincoln St, Lo...|
|  209928|Apple Airpods Hea...|         

In [4]:
Sales_June_df.printSchema()
print("Count of dataframe:",Sales_June_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 13622


In [5]:
Sales_June_df.describe().show()

+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|  count|             13556|       13579|             13556|             13556|         13579|               13579|
|   mean|216411.62769253468|        null|1.1253319563293007|189.03043523164672|          null|                null|
| stddev| 3753.481272755102|        null|0.4294627910676135| 336.8637764725025|          null|                null|
|    min|            209921|20in Monitor|                 1|              2.99|06/01/19 04:52|1 1st St, Dallas,...|
|    max|            222909|      iPhone|                 9|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_June_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_June_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_June_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 66
Product: 43
Quantity Ordered: 66
Price Each: 66
Order Date: 43
Purchase Address: 43


In [7]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_June_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': Lightning Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 14.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [8]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_June_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_June_df = Sales_June_df.withColumn(col_name, when(Sales_June_df[col_name].isNull(), mean_val).otherwise(Sales_June_df[col_name]))

In [9]:
Sales_June_df = Sales_June_df.drop('Order Date')

In [10]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_June_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: Lightning Charging Cable


In [11]:
Sales_June_df = Sales_June_df.fillna(most_frequent_value, subset=["Product"])

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_June_df.count()
null_percentages = []
for col_name in Sales_June_df.columns:
    null_count = Sales_June_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.32%


In [13]:
default_value = "820 Washington St, New York City"
Sales_June_df = Sales_June_df.fillna(default_value, subset=['Purchase Address'])

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_June_df.count()
null_percentages = []
for col_name in Sales_June_df.columns:
    null_count = Sales_June_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [15]:
print("Before drop dublication:",Sales_June_df.count())
Sales_June_df.dropDuplicates()
print("After drop dublication:",Sales_June_df.count())

Before drop dublication: 13622
After drop dublication: 13622


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_July_2019.csv"
# Read the CSV file into a DataFrame
Sales_July_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_July_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  222910|Apple Airpods Hea...|               1|     150.0|07/26/19 16:51|389 South St, Atl...|
|  222911|       Flatscreen TV|               1|     300.0|07/05/19 08:55|590 4th St, Seatt...|
|  222912|AA Batteries (4-p...|               1|      3.84|07/29/19 12:41|861 Hill St, Atla...|
|  222913|AA Batteries (4-p...|               1|      3.84|07/28/19 10:15|190 Ridge St, Atl...|
|  222914|AAA Batteries (4-...|               5|      2.99|07/31/19 02:13|824 Forest St, Se...|
|  222915|Bose SoundSport H...|               1|     99.99|07/03/19 18:30|899 Elm St, San F...|
|  222916|        Google Phone|               1|     600.0|07/21/19 22:39|745 Chestnut St, ...|
|  222917|            LG Dryer|         

In [4]:
Sales_July_df.printSchema()
print("Count of dataframe:",Sales_July_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 14371


In [5]:
Sales_July_df.describe().show()

+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|  count|             14291|       14326|             14291|             14291|         14326|               14326|
|   mean| 229788.5162689805|        null|1.1244139668322721|184.14992302849797|          null|                null|
| stddev|3970.6631212859775|        null|0.4608380693586461|332.95449992591784|          null|                null|
|    min|            222910|20in Monitor|                 1|              2.99|07/01/19 06:08|1 4th St, Atlanta...|
|    max|            236669|      iPhone|                 9|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_July_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_July_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_July_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 80
Product: 45
Quantity Ordered: 80
Price Each: 80
Order Date: 45
Purchase Address: 45


In [7]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_July_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': Lightning Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 14.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [8]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_July_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_July_df = Sales_July_df.withColumn(col_name, when(Sales_July_df[col_name].isNull(), mean_val).otherwise(Sales_July_df[col_name]))

In [10]:
Sales_July_df = Sales_July_df.drop('Order Date')

In [11]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_July_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: Lightning Charging Cable


In [12]:
Sales_July_df = Sales_July_df.fillna(most_frequent_value, subset=["Product"])

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_July_df.count()
null_percentages = []
for col_name in Sales_July_df.columns:
    null_count = Sales_July_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.31%


In [14]:
default_value = "511 12th St, Austin"
Sales_July_df = Sales_July_df.fillna(default_value, subset=['Purchase Address'])

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_July_df.count()
null_percentages = []
for col_name in Sales_July_df.columns:
    null_count = Sales_July_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [16]:
print("Before drop dublication:",Sales_July_df.count())
Sales_July_df.dropDuplicates()
print("After drop dublication:",Sales_July_df.count())

Before drop dublication: 14371
After drop dublication: 14371


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_August_2019.csv"
# Read the CSV file into a DataFrame
Sales_August_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_August_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  236670|    Wired Headphones|               2|     11.99|08/31/19 22:21|359 Spruce St, Se...|
|  236671|Bose SoundSport H...|               1|     99.99|08/15/19 15:11|492 Ridge St, Dal...|
|  236672|              iPhone|               1|     700.0|08/06/19 14:40|149 7th St, Portl...|
|  236673|AA Batteries (4-p...|               2|      3.84|08/29/19 20:59|631 2nd St, Los A...|
|  236674|AA Batteries (4-p...|               2|      3.84|08/15/19 19:53|736 14th St, New ...|
|  236675|    Wired Headphones|               1|     11.99|08/02/19 23:54|470 Hill St, San ...|
|  236676|34in Ultrawide Mo...|               1|    379.99|08/04/19 19:52|470 Cherry St, Lo...|
|  236677|        20in Monitor|         

In [4]:
Sales_August_df.printSchema()
print("Count of dataframe:",Sales_August_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 12011


In [5]:
Sales_August_df.describe().show()

+-------+-----------------+------------+------------------+------------------+--------------+--------------------+
|summary|         Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+-----------------+------------+------------------+------------------+--------------+--------------------+
|  count|            11957|       11983|             11957|             11957|         11983|               11983|
|   mean|242420.3392991553|        null| 1.124195032198712|186.52644308773156|          null|                null|
| stddev|3313.683368107521|        null|0.4495797801138998| 332.3019347560669|          null|                null|
|    min|           236670|20in Monitor|                 1|              2.99|08/01/19 04:50|1 2nd St, New Yor...|
|    max|           248150|      iPhone|                 8|            1700.0|    Order Date|    Purchase Address|
+-------+-----------------+------------+------------------+------------------+--

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_August_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_August_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_August_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 54
Product: 28
Quantity Ordered: 54
Price Each: 54
Order Date: 28
Purchase Address: 28


In [7]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_August_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': AA Batteries (4-pack)
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 3.84
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [8]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_August_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_August_df = Sales_August_df.withColumn(col_name, when(Sales_August_df[col_name].isNull(), mean_val).otherwise(Sales_August_df[col_name]))

In [9]:
Sales_August_df = Sales_August_df.drop('Order Date')

In [10]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_August_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: AA Batteries (4-pack)


In [11]:
Sales_August_df = Sales_August_df.fillna(most_frequent_value, subset=["Product"])

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_August_df.count()
null_percentages = []
for col_name in Sales_August_df.columns:
    null_count = Sales_August_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.23%


In [None]:
default_value = "672 West St, Seattle"
Sales_February_df = Sales_February_df.fillna(default_value, subset=['Purchase Address'])

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_August_df.count()
null_percentages = []
for col_name in Sales_August_df.columns:
    null_count = Sales_August_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.23%


In [14]:
print("Before drop dublication:",Sales_August_df.count())
Sales_August_df.dropDuplicates()
print("After drop dublication:",Sales_August_df.count())

Before drop dublication: 12011
After drop dublication: 12011


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_September_2019.csv"
# Read the CSV file into a DataFrame
Sales_September_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_September_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  248151|AA Batteries (4-p...|               4|      3.84|09/17/19 14:44|380 North St, Los...|
|  248152|USB-C Charging Cable|               2|     11.95|09/29/19 10:19|511 8th St, Austi...|
|  248153|USB-C Charging Cable|               1|     11.95|09/16/19 17:48|151 Johnson St, L...|
|  248154|    27in FHD Monitor|               1|    149.99|09/27/19 07:52|355 Hickory St, S...|
|  248155|USB-C Charging Cable|               1|     11.95|09/01/19 19:03|125 5th St, Atlan...|
|  248156|34in Ultrawide Mo...|               1|    379.99|09/13/19 14:59|469 12th St, Los ...|
|  248157|Lightning Chargin...|               1|     14.95|09/07/19 09:59|773 Johnson St, P...|
|  248158|Lightning Chargin...|         

In [4]:
Sales_September_df.printSchema()
print("Count of dataframe:",Sales_September_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 11686


In [5]:
Sales_September_df.describe().show()

+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|   Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+------------------+--------------+--------------------+
|  count|             11629|       11646|              11629|             11629|         11646|               11646|
|   mean|253751.81442944362|        null| 1.1281279559721387|179.40000687934585|          null|                null|
| stddev| 3235.175358525277|        null|0.43507719933866423|328.59504155699716|          null|                null|
|    min|            248151|20in Monitor|                  1|              2.99|09/01/19 05:10|1 11th St, San Fr...|
|    max|            259357|      iPhone|                  6|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+-------------------+---

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_September_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_September_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_September_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 57
Product: 40
Quantity Ordered: 57
Price Each: 57
Order Date: 40
Purchase Address: 40


In [7]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_September_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': USB-C Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 11.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [8]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_September_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_September_df = Sales_September_df.withColumn(col_name, when(Sales_September_df[col_name].isNull(), mean_val).otherwise(Sales_September_df[col_name]))

In [9]:
Sales_September_df = Sales_September_df.drop('Order Date')

In [10]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_September_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: USB-C Charging Cable


In [11]:
Sales_September_df = Sales_September_df.fillna(most_frequent_value, subset=["Product"])

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_September_df.count()
null_percentages = []
for col_name in Sales_September_df.columns:
    null_count = Sales_September_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.34%


In [13]:
default_value = "170 Park St, Boston"
Sales_September_df = Sales_September_df.fillna(default_value, subset=['Purchase Address'])

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_September_df.count()
null_percentages = []
for col_name in Sales_September_df.columns:
    null_count = Sales_September_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [15]:
print("Before drop dublication:",Sales_September_df.count())
Sales_September_df.dropDuplicates()
print("After drop dublication:",Sales_September_df.count())

Before drop dublication: 11686
After drop dublication: 11686


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_October_2019.csv"
# Read the CSV file into a DataFrame
Sales_October_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_October_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  259358|34in Ultrawide Mo...|               1|    379.99|10/28/19 10:56|609 Cherry St, Da...|
|  259359|27in 4K Gaming Mo...|               1|    389.99|10/28/19 17:26|225 5th St, Los A...|
|  259360|AAA Batteries (4-...|               2|      2.99|10/24/19 17:20|967 12th St, New ...|
|  259361|    27in FHD Monitor|               1|    149.99|10/14/19 22:26|628 Jefferson St,...|
|  259362|    Wired Headphones|               1|     11.99|10/07/19 16:10|534 14th St, Los ...|
|  259363|AAA Batteries (4-...|               1|      2.99|10/01/19 18:55|976 Lake St, New ...|
|  259364|    Wired Headphones|               1|     11.99|10/29/19 11:02|874 North St, Los...|
|  259365|Lightning Chargin...|         

In [4]:
Sales_October_df.printSchema()
print("Count of dataframe:",Sales_October_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 20379


In [5]:
Sales_October_df.describe().show()

+-------+-----------------+------------+------------------+------------------+--------------+--------------------+
|summary|         Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+-----------------+------------+------------------+------------------+--------------+--------------------+
|  count|            20284|       20317|             20284|             20284|         20317|               20317|
|   mean|269078.5231216723|        null|1.1193551567738118| 183.1839400513232|          null|                null|
| stddev|5612.651508571799|        null|0.4369219744825846|334.00512329766707|          null|                null|
|    min|           259358|20in Monitor|                 1|              2.99|10/01/19 03:12|1 11th St, Los An...|
|    max|           278796|      iPhone|                 8|            1700.0|    Order Date|    Purchase Address|
+-------+-----------------+------------+------------------+------------------+--

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_October_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_October_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_October_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 95
Product: 62
Quantity Ordered: 95
Price Each: 95
Order Date: 62
Purchase Address: 62


In [8]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_October_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': USB-C Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 11.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [9]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_October_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_October_df = Sales_October_df.withColumn(col_name, when(Sales_October_df[col_name].isNull(), mean_val).otherwise(Sales_October_df[col_name]))

In [10]:
Sales_October_df = Sales_October_df.drop('Order Date')

In [11]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_October_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: USB-C Charging Cable


In [13]:
Sales_October_df = Sales_October_df.fillna(most_frequent_value, subset=["Product"])

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_October_df.count()
null_percentages = []
for col_name in Sales_October_df.columns:
    null_count = Sales_October_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.30%


In [15]:
default_value = "13 Hill St, Atlanta"
Sales_October_df = Sales_October_df.fillna(default_value, subset=['Purchase Address'])

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_October_df.count()
null_percentages = []
for col_name in Sales_October_df.columns:
    null_count =Sales_October_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [18]:
print("Before drop dublication:",Sales_October_df.count())
Sales_October_df.dropDuplicates()
print("After drop dublication:",Sales_October_df.count())

Before drop dublication: 20379
After drop dublication: 20379


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_November_2019.csv"
# Read the CSV file into a DataFrame
Sales_November_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_November_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  278797|    Wired Headphones|               1|     11.99|11/21/19 09:54|46 Park St, New Y...|
|  278798|USB-C Charging Cable|               2|     11.95|11/17/19 10:03|962 Hickory St, A...|
|  278799|Apple Airpods Hea...|               1|     150.0|11/19/19 14:56|464 Cherry St, Lo...|
|  278800|    27in FHD Monitor|               1|    149.99|11/25/19 22:24|649 10th St, Seat...|
|  278801|Bose SoundSport H...|               1|     99.99|11/09/19 13:56|522 Hill St, Bost...|
|  278802|USB-C Charging Cable|               1|     11.95|11/14/19 20:34|154 2nd St, San F...|
|  278803|Lightning Chargin...|               1|     14.95|11/11/19 08:05|724 5th St, San F...|
|  278804|Bose SoundSport H...|         

In [4]:
Sales_November_df.printSchema()
print("Count of dataframe:",Sales_November_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 17661


In [5]:
Sales_November_df.describe().show()

+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|summary|          Order ID|     Product|  Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------------+--------------+--------------------+
|  count|             17580|       17616|             17580|             17580|         17616|               17616|
|   mean|287235.96279863484|        null|1.1267349260523323|180.88196814565705|          null|                null|
| stddev| 4866.884258246139|        null|0.4520109849640929| 330.1758948269367|          null|                null|
|    min|            278797|20in Monitor|                 1|              2.99|11/01/19 03:18|1 12th St, San Fr...|
|    max|            295664|      iPhone|                 8|            1700.0|    Order Date|    Purchase Address|
+-------+------------------+------------+------------------+------------

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_November_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_November_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_November_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 81
Product: 45
Quantity Ordered: 81
Price Each: 81
Order Date: 45
Purchase Address: 45


In [7]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_November_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': USB-C Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 11.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [8]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_November_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_November_df = Sales_November_df.withColumn(col_name, when(Sales_November_df[col_name].isNull(), mean_val).otherwise(Sales_November_df[col_name]))

In [9]:
Sales_November_df = Sales_November_df.drop('Order Date')

In [10]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_November_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: USB-C Charging Cable


In [11]:
Sales_November_df = Sales_November_df.fillna(most_frequent_value, subset=["Product"])

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_November_df.count()
null_percentages = []
for col_name in Sales_November_df.columns:
    null_count = Sales_November_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.25%


In [13]:
default_value = "802 Jefferson St, San Francisco"
Sales_November_df = Sales_November_df.fillna(default_value, subset=['Purchase Address'])

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_November_df.count()
null_percentages = []
for col_name in Sales_November_df.columns:
    null_count = Sales_November_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [15]:
print("Before drop dublication:",Sales_November_df.count())
Sales_November_df.dropDuplicates()
print("After drop dublication:",Sales_November_df.count())

Before drop dublication: 17661
After drop dublication: 17661


In [1]:
from  pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import countDistinct
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName("My_Project")\
.config("spark.memory.offHeap. enabled", "true") . config("spark.memory.offHeap. size", "20g") \
.getOrCreate()

In [3]:
file_path = r"/C:\Users\Reem\Downloads\archive\Sales_Data\Sales_December_2019.csv"
# Read the CSV file into a DataFrame
Sales_December_df = spark.read.csv(file_path, header=True, inferSchema=True)
Sales_December_df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|    1700.0|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|
|  295670|AA Batteries (4-p...|               1|      3.84|12/31/19 22:58|200 Jefferson St,...|
|  295671|USB-C Charging Cable|               1|     11.95|12/16/19 15:10|928 12th St, Port...|
|  295672|USB-C Charging Cable|         

In [4]:
Sales_December_df.printSchema()
print("Count of dataframe:",Sales_December_df.count())

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)

Count of dataframe: 25117


In [5]:
Sales_December_df.describe().show()

+-------+-----------------+------------+-------------------+------------------+--------------+--------------------+
|summary|         Order ID|     Product|   Quantity Ordered|        Price Each|    Order Date|    Purchase Address|
+-------+-----------------+------------+-------------------+------------------+--------------+--------------------+
|  count|            24989|       25037|              24989|             24989|         25037|               25037|
|   mean|307655.0231701949|        null| 1.1253351474648845|183.84565008610213|          null|                null|
| stddev|6932.795455986216|        null|0.44541356230082607| 333.0770368580182|          null|                null|
|    min|           295665|20in Monitor|                  1|              2.99|01/01/20 00:10|1 12th St, San Fr...|
|    max|           319670|      iPhone|                  7|            1700.0|    Order Date|    Purchase Address|
+-------+-----------------+------------+-------------------+------------

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

# Count null values in each column
null_counts = Sales_December_df.agg(*[spark_sum(col(c).isNull().cast("int")).alias(c + '_null_count') for c in Sales_December_df.columns])

# Collect the result as a single row
null_counts_single_row = null_counts.collect()[0]

# Show the result
print("Number of null values in each column:")
for col_name in Sales_December_df.columns:
    print(f"{col_name}: {null_counts_single_row[col_name + '_null_count']}")

Number of null values in each column:
Order ID: 128
Product: 80
Quantity Ordered: 128
Price Each: 128
Order Date: 80
Purchase Address: 80


In [7]:
# List of columns to find most frequent values
cols_to_check = ["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Find the most frequent value in each column
most_frequent_values = []
for col_name in cols_to_check:
    mode_value = Sales_December_df.groupBy(col_name).count().orderBy(col("count").desc()).select(col_name).first()[0]
    most_frequent_values.append((col_name, mode_value))

# Print the most frequent value in each column
for col_name, value in most_frequent_values:
    print(f"Most frequent value in column '{col_name}': {value}")

Most frequent value in column 'Order ID': None
Most frequent value in column 'Product': USB-C Charging Cable
Most frequent value in column 'Quantity Ordered': 1
Most frequent value in column 'Price Each': 11.95
Most frequent value in column 'Order Date': None
Most frequent value in column 'Purchase Address': None


In [8]:
from pyspark.sql.functions import mean, when

# Example: Mode imputation for categorical columns
categorical_cols = ["Order ID", "Quantity Ordered", "Price Each"]


# Example: Mean imputation for numerical columns
numerical_cols =["Order ID", "Product", "Quantity Ordered", "Price Each", "Order Date", "Purchase Address"]

# Calculate mean for each numerical column
mean_values = {}
for col_name in numerical_cols:
    mean_val = Sales_December_df.agg({col_name: 'mean'}).collect()[0][f'avg({col_name})']
    mean_values[col_name] = mean_val

# Fill null values with mean
for col_name, mean_val in mean_values.items():
    Sales_December_df = Sales_December_df.withColumn(col_name, when(Sales_December_df[col_name].isNull(), mean_val).otherwise(Sales_December_df[col_name]))

In [9]:
Sales_December_df = Sales_December_df.drop('Order Date')

In [10]:
from pyspark.sql.functions import desc

# Group by the column and count occurrences, then order by count
most_frequent_value = Sales_December_df.groupBy("Product") \
                        .count() \
                        .orderBy(desc("count")) \
                        .select("Product") \
                        .first()[0]

print("Most frequent value:", most_frequent_value)


Most frequent value: USB-C Charging Cable


In [11]:
Sales_December_df = Sales_December_df.fillna(most_frequent_value, subset=["Product"])

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_December_df.count()
null_percentages = []
for col_name in Sales_December_df.columns:
    null_count = Sales_December_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.32%


In [13]:
default_value = "893 Elm St, Los Angeles"
Sales_December_df = Sales_December_df.fillna(default_value, subset=['Purchase Address'])

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
total_rows = Sales_December_df.count()
null_percentages = []
for col_name in Sales_December_df.columns:
    null_count = Sales_December_df.where(col(col_name).isNull()).count()
    null_percentage = (null_count / total_rows) * 100
    null_percentages.append((col_name, null_percentage))

# Print null percentages
for col_name, percentage in null_percentages:
    print(f"Null percentage in column '{col_name}': {percentage:.2f}%")

Null percentage in column 'Order ID': 0.00%
Null percentage in column 'Product': 0.00%
Null percentage in column 'Quantity Ordered': 0.00%
Null percentage in column 'Price Each': 0.00%
Null percentage in column 'Purchase Address': 0.00%


In [15]:
print("Before drop dublication:",Sales_December_df.count())
Sales_December_df.dropDuplicates()
print("After drop dublication:",Sales_December_df.count())

Before drop dublication: 25117
After drop dublication: 25117
