In [0]:
#Listing the files in the databricks file system 
dbutils.fs.ls('dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/')


Out[1]: [FileInfo(path='dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/BigMart_Sales.csv', name='BigMart_Sales.csv', size=869537, modificationTime=1732670813000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/emp-1.csv', name='emp-1.csv', size=97, modificationTime=1732586587000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/emp-1.json', name='emp-1.json', size=169, modificationTime=1732592296000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/emp.csv', name='emp.csv', size=97, modificationTime=1732586434000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/timilsina.ra@northeastern.edu/emp.json', name='emp.json', size=169, modificationTime=1732591818000)]

In [0]:
#Importing spark sessions
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Creating a spark session
spark = SparkSession.builder.appName("Read").getOrCreate()



Products

In [0]:
# Define the data 
data_products = [(0, "Y", "N"),
        (1, "Y", "Y"),
        (2, "N", "Y"),
        (3, "Y", "Y"),
        (4, "N", "N")]

#Define the schema 
schema_products = StructType([StructField("Product_Id", IntegerType(), True), 
                              StructField("Low_Fats", StringType(), True),
                              StructField("Recyclable", StringType(),True) ])

#Create dataframe 
df_products = spark.createDataFrame(data_products, schema_products)

#Display the dataframe 
df_products.display()

Product_Id,Low_Fats,Recyclable
0,Y,N
1,Y,Y
2,N,Y
3,Y,Y
4,N,N


In [0]:
#Write a solution to find the ids of products that are both low fat and recyclable.

df_products_filtered = df_products.filter(
    (col("low_fats") == "Y") & (col("recyclable") == "Y")
)

df_products_filtered_final= df_products_filtered.select("Product_Id")
df_products_filtered_final.display()

Product_Id
1
3


In [0]:
#Creating the temp view 
df_products.createOrReplaceTempView("table_products")

In [0]:
#Using Sql queries

result_sql_products= spark.sql("""select Product_Id from table_products where Low_Fats = 'Y' and Recyclable = 'Y'
          """)
result_sql_products.display()


Product_Id
1
3


Customers

In [0]:
# Create a dataframe 
data_customers= [(1, "Will", None),
                 (2, "Jane", None),
                 (3, "Alex", 2),
                 (4, "Bill", None),
                 (5, "Zack", 1),
                 (6, "Mark", 2)]
# Create a schema 
schema_customers = StructType([
    StructField("id", IntegerType(), True), 
    StructField("name", StringType(), True),  
    StructField("referee_id", IntegerType(), True) 
])
#Create a dataframe 
df_customers = spark.createDataFrame(data_customers, schema_customers)
df_customers.display()


id,name,referee_id
1,Will,
2,Jane,
3,Alex,2.0
4,Bill,
5,Zack,1.0
6,Mark,2.0


In [0]:
#Find the names of the customer that are not referred by the customer with id = 2.

df_customers_query = df_customers.filter((col("referee_id") != 2) | col("referee_id").isNull())
df_customers_query_result = df_customers_query.select("name")
df_customers_query_result.display()



name
Will
Jane
Bill
Zack


In [0]:
#Create temp table 
df_customers.createOrReplaceTempView("table_customers")

In [0]:
sql_customers_result= spark.sql(""" select name from table_customers where referee_id !=2 or referee_id is null
                                """)
sql_customers_result.display()

name
Will
Jane
Bill
Zack


Countries 

In [0]:
#Creating dataframe 
data_countries = [("Afghanistan", "Asia", 652230, 25500100, 20343000000),
    ("Albania", "Europe", 28748, 2831741, 12960000000),
    ("Algeria", "Africa", 2381741, 37100000, 188681000000),
    ("Andorra", "Europe", 468, 78115, 3712000000),
    ("Angola", "Africa", 1246700, 20609294, 100990000000)
]

#Creating Schema 
schema_countries = StructType([StructField("name", StringType(), True),     
    StructField("continent", StringType(), True),
    StructField("area", IntegerType(), True),      
    StructField("population", IntegerType(), True),
    StructField("gdp", LongType(), True)  
                              ])

# Creating Dataframe 
df_countries = spark.createDataFrame(data_countries, schema_countries)
df_countries.display()


name,continent,area,population,gdp
Afghanistan,Asia,652230,25500100,20343000000
Albania,Europe,28748,2831741,12960000000
Algeria,Africa,2381741,37100000,188681000000
Andorra,Europe,468,78115,3712000000
Angola,Africa,1246700,20609294,100990000000


In [0]:
#A country is big if:
#it has an area of at least three million (i.e., 3000000 km2), or
#it has a population of at least twenty-five million (i.e., 25000000).
#Write a solution to find the name, population, and area of the big countries.
df_countries_result = df_countries.filter(
    (col("population")>=25000000)| (col("area")>= 3000000)
    )

df_countries_result_final = df_countries_result.select("name", "population", "area")
df_countries_result_final.display()





name,population,area
Afghanistan,25500100,652230
Algeria,37100000,2381741


In [0]:
#Creatting Temp Table 
df_countries.createOrReplaceTempView("table_countries")

In [0]:
#A country is big if:
#it has an area of at least three million (i.e., 3000000 km2), or
#it has a population of at least twenty-five million (i.e., 25000000).
#Write a solution to find the name, population, and area of the big countries.

sql_countries_result = spark.sql("""select name , population, area from table_countries where area >= 3000000 or population >= 25000000
                                   """)
sql_countries_result.display()

name,population,area
Afghanistan,25500100,652230
Algeria,37100000,2381741


Articles

In [0]:
schema_articles = StructType([
    StructField('article_id', IntegerType(), True),
    StructField('author_id', IntegerType(), True),
    StructField('viewer_id', IntegerType(), True),
    StructField('view_date', DateType(), True)
])

data_articles  = [
        (1, 3, 5, '2019-08-01'),
        (1, 3, 6, '2019-08-02'),
        (2, 7, 7, '2019-08-01'),
        (2, 7, 6, '2019-08-02'),
        (4, 7, 1, '2019-07-22'),
        (3, 4, 4, '2019-07-21'),
        (3, 4, 4, '2019-07-21')
]

columns_articles= ['article_id', 'author_id', 'viewer_id', 'view_data']

df_articles= spark.createDataFrame(data= data_articles, schema= columns_articles)

df_articles.display()

article_id,author_id,viewer_id,view_data
1,3,5,2019-08-01
1,3,6,2019-08-02
2,7,7,2019-08-01
2,7,6,2019-08-02
4,7,1,2019-07-22
3,4,4,2019-07-21
3,4,4,2019-07-21


In [0]:
#Write a solution to find all the authors that viewed at least one of their own articles.

# Return the result table sorted by id in ascending order.

df_articles_result = df_articles.filter(
(col("author_id")== col("viewer_id"))
)


df_articles_result_final = df_articles_result.select(
    col("author_id").alias("id")
).distinct().orderBy(col("id").asc())


df_articles_result_final.display()


id
4
7


In [0]:
#Create Temp View Table 
df_articles.createOrReplaceTempView("table_articles")


In [0]:
#Write a solution to find all the authors that viewed at least one of their own articles.

# Return the result table sorted by id in ascending order.


articles_result= spark.sql(""" select distinct author_id   from table_articles where author_id = viewer_id order by author_id asc
                           """)
articles_result.display()

author_id
4
7


In [0]:
data_tweets = [(1, "Let us code"),
               (2, "More than fifteen char here")]

schema_tweets = StructType([StructField("tweet_id", IntegerType(), True),
                            StructField("content", StringType(), True)])

df_tweets = spark.createDataFrame(data_tweets, schema_tweets)
df_tweets.display()

tweet_id,content
1,Let us code
2,More than fifteen char here


In [0]:
# Write a solution to find the IDs of the invalid tweets. The tweet is invalid if the number of characters used in the content of the tweet is strictly greater than 15.


df_tweets_result = df_tweets.filter(
    length(col("content")) > 15
)

df_tweets_final= df_tweets_result.select("tweet_id")
df_tweets_final.display()

tweet_id
2


In [0]:
#create temp view 
df_tweets.createOrReplaceTempView("table_tweets")



In [0]:
sql_tweets_result = spark.sql("""
    SELECT tweet_id 
    FROM table_tweets 
    WHERE LENGTH(content) > 15
""")
sql_tweets_result.display()

tweet_id
2
