# a

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, col, desc, asc, round, avg

spark = SparkSession.builder.appName("ReadCSV").getOrCreate()

In [None]:
file_path = 'cars.xlsx'
csv_file_path = 'cars.csv'
cars_data = pd.read_excel(file_path)
cars_data.to_csv(csv_file_path, index=False)

In [None]:
file_path = 'cars.csv'  # Updated to point to the CSV file
df = spark.read.option("header", "true").option("inferSchema", "true").csv(file_path)

In [None]:
# Show the schema of the DataFrame
df.printSchema()

root
 |-- Огноо: string (nullable = true)
 |-- Горимын код: string (nullable = true)
 |-- Гадаад компани: string (nullable = true)
 |-- Гарал улс: string (nullable = true)
 |-- Илгээгч улс: string (nullable = true)
 |-- Барааны код: string (nullable = true)
 |-- Барааны оноосон нэр: string (nullable = true)
 |-- Марк: string (nullable = true)
 |-- Үзүүлэлт: string (nullable = true)
 |-- Үйлдвэрлэсэн он: string (nullable = true)
 |-- Тоо хэмжээ: string (nullable = true)
 |-- Мотор: string (nullable = true)
 |-- Өнгө: string (nullable = true)
 |-- Төрөл: string (nullable = true)
 |-- Хөтлөгч: string (nullable = true)
 |-- Түлш: string (nullable = true)
 |-- Даац: string (nullable = true)
 |-- Ангилал: string (nullable = true)
 |-- Зориулалт: string (nullable = true)
 |-- Бренд: string (nullable = true)
 |-- Модель: string (nullable = true)



In [None]:
# Basic Data Analysis
print("Number of columns:", len(df.columns))
print("Number of rows:", df.count())
df.select([count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]).show()
df.describe().show()

Number of columns: 21
Number of rows: 65872
+-----+-----------+--------------+---------+-----------+-----------+-------------------+----+--------+---------------+----------+-----+----+-----+-------+----+-----+-------+---------+-----+------+
|Огноо|Горимын код|Гадаад компани|Гарал улс|Илгээгч улс|Барааны код|Барааны оноосон нэр|Марк|Үзүүлэлт|Үйлдвэрлэсэн он|Тоо хэмжээ|Мотор|Өнгө|Төрөл|Хөтлөгч|Түлш| Даац|Ангилал|Зориулалт|Бренд|Модель|
+-----+-----------+--------------+---------+-----------+-----------+-------------------+----+--------+---------------+----------+-----+----+-----+-------+----+-----+-------+---------+-----+------+
|    0|          0|             0|        0|          0|          0|                  0|   0|       1|              7|         8|  265| 207|  191|    337| 196|65797|     10|       13|  962|   496|
+-----+-----------+--------------+---------+-----------+-----------+-------------------+----+--------+---------------+----------+-----+----+-----+-------+----+-----+---

In [None]:
# Select specific columns to analyze
df1 = df.select(col("Бренд"), col("Гарал улс"), col("Үйлдвэрлэсэн он"), col("Мотор"))
df1.show(10)

+------------+---------+---------------+-------+
|       Бренд|Гарал улс|Үйлдвэрлэсэн он|  Мотор|
+------------+---------+---------------+-------+
|         KIA|     БНСУ|           2002| 3298cc|
|     Hyundai|     БНСУ|           2003| 1500cc|
|Beiben Truck|    БНХАУ|           2011| 9726cc|
|     Hyundai|     БНСУ|           2007| 3907cc|
|     Hyundai|     БНСУ|           2006| 3907cc|
|       Dodge|      АHУ|           2003| 4700cc|
|       Volvo|     Швед|           2002|12130cc|
|         MAN|    ХБHГУ|           2007|10518cc|
|       Volvo|     Швед|           2005|12777cc|
|      Nissan|     Япон|           2012| 1790сс|
+------------+---------+---------------+-------+
only showing top 10 rows



In [None]:
# Show specific columns without truncation
df.select(df["Бренд"], df["Гарал улс"]).show(10, truncate=False)

+------------+---------+
|Бренд       |Гарал улс|
+------------+---------+
|KIA         |БНСУ     |
|Hyundai     |БНСУ     |
|Beiben Truck|БНХАУ    |
|Hyundai     |БНСУ     |
|Hyundai     |БНСУ     |
|Dodge       |АHУ      |
|Volvo       |Швед     |
|MAN         |ХБHГУ    |
|Volvo       |Швед     |
|Nissan      |Япон     |
+------------+---------+
only showing top 10 rows



In [None]:
# Show the first 3 columns
df.select(df.columns[:3]).show(10)

+-------------------+-----------+--------------------+
|              Огноо|Горимын код|      Гадаад компани|
+-------------------+-----------+--------------------+
|2021-01-02 00:00:00|        400|             LKHAMAA|
|2021-01-02 00:00:00|        400|             LKHAMAA|
|2021-01-02 00:00:00|        400|Erenhot City Huan...|
|2021-01-02 00:00:00|        400|          MK Shoring|
|2021-01-02 00:00:00|        400|          MK trading|
|2021-01-02 00:00:00|        400|          MK Shoring|
|2021-01-02 00:00:00|        400|         Sky Trading|
|2021-01-02 00:00:00|        400|         Sky Trading|
|2021-01-02 00:00:00|        400| Truck import export|
|2021-01-02 00:00:00|        400| Be forward Co.,Ltd.|
+-------------------+-----------+--------------------+
only showing top 10 rows



In [None]:
# Filter examples
df1.filter(col("Үйлдвэрлэсэн он") == 2011).show(10)
df1.filter(col("Үйлдвэрлэсэн он") == 2006).show(10) # Changed this line to use col() function

+------------+---------+---------------+-------+
|       Бренд|Гарал улс|Үйлдвэрлэсэн он|  Мотор|
+------------+---------+---------------+-------+
|Beiben Truck|    БНХАУ|           2011| 9726cc|
|         KIA|     БНСУ|           2011| 2476cc|
|      Nissan|     Япон|           2011| 1790сс|
|      Nissan|     Япон|           2011| 1590cc|
|      Nissan|     Япон|           2011| 2950cc|
|     Hyundai|     БНСУ|           2011| 2476cc|
|     Hyundai|     БНСУ|           2011| 2476сс|
|         KIA|     БНСУ|           2011| 2700cc|
|      Nissan|     Япон|           2011| 1790сс|
|       Volvo|     Швед|           2011|12777cc|
+------------+---------+---------------+-------+
only showing top 10 rows

+---------+---------+---------------+------+
|    Бренд|Гарал улс|Үйлдвэрлэсэн он| Мотор|
+---------+---------+---------------+------+
|  Hyundai|     БНСУ|           2006|3907cc|
|  Hyundai|     БНСУ|           2006|2450cc|
|  Hyundai|     БНСУ|           2006|2476cc|
|  Hyundai|     БН

In [None]:
# Sorting examples
df1.sort(col("Бренд").desc()).show(10)

+--------------+--------------------+--------------------+--------------+
|         Бренд|           Гарал улс|     Үйлдвэрлэсэн он|         Мотор|
+--------------+--------------------+--------------------+--------------+
|өөрөө буулгагч|               БНХАУ|                2021|Хөд.баг:6000сс|
|   даац: 26750|  2ш акумлятор та...| хурдны хайрцаг з...|             "|
| даац: 15670кг| их гэрэл хойд гэ...| хөдөлгүүр засвар...|             "|
|          Урал|                 ОХУ|                1982|        7000cc|
|          Урал|                 ОХУ|                1987|        7000cc|
|          Урал|                 ОХУ|                1987|        7000cc|
|          Урал|                 ОХУ|                1982|        7000cc|
|           УАЗ|                 ОХУ|                2022|        2693cc|
|           УАЗ|                 ОХУ|                2022|        2693cc|
|           УАЗ|                 ОХУ|                2022|        2693cc|
+--------------+--------------------+-

In [None]:
# Average motor size by brand
avg_motor_by_brand = df.groupBy(col("Бренд")).agg({"Мотор": "avg"}).withColumnRenamed("avg(Мотор)", "avg_motor")
avg_motor_by_brand.orderBy(desc("avg_motor")).limit(10).show(truncate=False)

+-----------------------------------+---------+
|Бренд                              |avg_motor|
+-----------------------------------+---------+
|Hino                               |16740.0  |
|Автомашин                          |2010.0   |
|Volkswagen                         |NULL     |
|International                      |NULL     |
|CAT 773E                           |NULL     |
|Forland                            |NULL     |
|Senta dump truck                   |NULL     |
|Sinotruck Styer                    |NULL     |
|Baotou Bei Ben Heavy-duty truck Co.|NULL     |
|ISUZU D-Max                        |NULL     |
+-----------------------------------+---------+



In [None]:
# Show the brand with the highest average motor size
avg_motor_by_brand.select("Бренд").orderBy(desc("avg_motor")).limit(1).show()

+-----+
|Бренд|
+-----+
| Hino|
+-----+



In [None]:
# Show the brand with the lowest average motor size
avg_motor_by_brand.dropna().orderBy(asc("avg_motor")).limit(10).show(truncate=False)
avg_motor_by_brand.dropna().select("Бренд").orderBy(asc("avg_motor")).limit(1).show(truncate=False)

+---------+---------+
|Бренд    |avg_motor|
+---------+---------+
|Автомашин|2010.0   |
|Hino     |16740.0  |
+---------+---------+

+---------+
|Бренд    |
+---------+
|Автомашин|
+---------+



In [None]:
# Average motor size by country of origin
motor_avg_by_origin = df.groupBy("Гарал улс").agg(round(avg("Мотор"), 1).alias("avg_motor_size"))
motor_avg_by_origin.dropna().show()

+--------------------+--------------+
|           Гарал улс|avg_motor_size|
+--------------------+--------------+
|                Япон|       16740.0|
|       түлшний насос|        2010.0|
|  2ш акумлятор та...|        2010.0|
+--------------------+--------------+



In [None]:
# Filter cars by specific feature
df.select(df.columns[:3]).filter(col("Үзүүлэлт").like("%Дизель%")) .show(10, truncate=False)

+-------------------+-----------+----------------------------------------+
|Огноо              |Горимын код|Гадаад компани                          |
+-------------------+-----------+----------------------------------------+
|2021-01-02 00:00:00|400        |LKHAMAA                                 |
|2021-01-02 00:00:00|400        |LKHAMAA                                 |
|2021-01-02 00:00:00|400        |Erenhot City Huan Tong Logistics Co.,ltd|
|2021-01-02 00:00:00|400        |MK Shoring                              |
|2021-01-02 00:00:00|400        |MK trading                              |
|2021-01-02 00:00:00|400        |MK Shoring                              |
|2021-01-02 00:00:00|400        |Sky Trading                             |
|2021-01-02 00:00:00|400        |Sky Trading                             |
|2021-01-02 00:00:00|400        |Truck import export                     |
|2021-01-02 00:00:00|400        |Jinseong Shoring                        |
+-------------------+----

In [None]:
# Count cars with a specific feature
diesel_cars = df.filter(col("Түлш").like("%Дизель%"))
print("Number of diesel cars:", diesel_cars.count())

Number of diesel cars: 57866


In [None]:
# Count by type of car
df.groupBy("Ангилал").count().show(truncate=False)

+-----------------------+-----+
|Ангилал                |count|
+-----------------------+-----+
|Автомашин              |65850|
|Автофургон             |8    |
|NULL                   |10   |
|ND4251                 |1    |
|чирэгч                 |1    |
|Цахилгаан гурван дугуйт|2    |
+-----------------------+-----+



In [None]:
# Average motor size by brand and country of origin
avg_motor_by_origin_brand = df.groupBy("Гарал улс", "Бренд").agg(round(avg("Мотор"), 1).alias("average_motor_size")).orderBy(asc("Гарал улс"))
avg_motor_by_origin_brand.dropna().show(truncate=False)

+----------------------+---------+------------------+
|Гарал улс             |Бренд    |average_motor_size|
+----------------------+---------+------------------+
|  2ш акумлятор тависан|Автомашин|2010.0            |
| түлшний насос        |Автомашин|2010.0            |
|Япон                  |Hino     |16740.0           |
+----------------------+---------+------------------+



In [None]:
# Count the number of cars produced each year
cars_per_year = df.groupBy("Үйлдвэрлэсэн он").agg(count("Бренд").alias("No of Cars")).orderBy(desc("Үйлдвэрлэсэн он"))
cars_per_year.show()

+---------------+----------+
|Үйлдвэрлэсэн он|No of Cars|
+---------------+----------+
|         202312|       189|
|         202311|       819|
|         202310|       395|
|         202309|       186|
|         202308|       180|
|         202307|       141|
|         202306|       470|
|         202305|       697|
|         202304|       513|
|         202303|       478|
|         202302|       392|
|         202301|       244|
|           2023|        23|
|         202212|       295|
|         202211|       354|
|         202210|        99|
|         202209|       161|
|         202208|       206|
|         202207|       179|
|         202206|        35|
+---------------+----------+
only showing top 20 rows



In [None]:
# Change data type of motor size to integer
changed_datatype = df.select(col("Мотор").cast("int"), col("Бренд"), col("Ангилал"), col("Гарал улс"))
top_motor_size = changed_datatype.filter(col("Ангилал") == "Автомашин").orderBy(desc("Мотор")).limit(3)
top_motor_size.show(truncate=False)

+-----+-------+---------+---------+
|Мотор|Бренд  |Ангилал  |Гарал улс|
+-----+-------+---------+---------+
|16740|Hino   |Автомашин|Япон     |
|16740|Hino   |Автомашин|Япон     |
|NULL |Shaanxi|Автомашин|БНХАУ    |
+-----+-------+---------+---------+



In [None]:
# Add a column indicating the presence of a specific feature
car_with_feature = df.withColumn("has_specific_feature", when(col("Түлш") == "Дизель", "yes").otherwise("no"))
avg_motor_by_feature = car_with_feature.groupBy("has_specific_feature").agg(round(avg("Мотор"), 1).alias("average_motor"))
avg_motor_by_feature.show()


+--------------------+-------------+
|has_specific_feature|average_motor|
+--------------------+-------------+
|                  no|       2010.0|
|                 yes|      16740.0|
+--------------------+-------------+



In [None]:
# Most common country of origin
most_common_origin = df.groupBy("Гарал улс").agg(count("Бренд").alias("no of cars")).orderBy(desc("no of cars")).limit(1)
most_common_origin.select(df["Гарал улс"].alias("most common origin")).show()



+------------------+
|most common origin|
+------------------+
|              БНСУ|
+------------------+



In [None]:
# Average motor size and production year by category
avg_motor_by_category = df.groupBy("Ангилал").agg(round(avg("Мотор"), 1).alias("average_motor_size"), round(avg("Үйлдвэрлэсэн он"), 1).alias("average_year"))
avg_motor_by_category.dropna().show(truncate=False)


+---------+------------------+------------+
|Ангилал  |average_motor_size|average_year|
+---------+------------------+------------+
|Автомашин|16740.0           |66231.1     |
+---------+------------------+------------+

