In [0]:
# Listing the upload directory; written with a leading slash like every other
# /FileStore/tables/ path used in this notebook.
dbutils.fs.ls("/FileStore/tables/")

In [0]:
# Base name shared by the uploaded archive (<fileroot>.zip) and the CSV it contains (<fileroot>.csv)

fileroot = "clinicaltrial_2021"

In [0]:
# Importing the os module and publishing fileroot as the environment variable 'fileroot',
# which the %sh cell below reads as $fileroot.

import os
os.environ.update({'fileroot': fileroot})

In [0]:
# Copying <fileroot>.zip from /FileStore/tables/ to file:/tmp/

dbutils.fs.cp(f"/FileStore/tables/{fileroot}.zip", "file:/tmp/")

Executing the %sh magic command to unzip the archive and extract the CSV file into the /tmp/ directory

In [0]:
%sh
# -o overwrites files that already exist in /tmp/ without prompting, so re-running this
# cell cannot stall on unzip's interactive "replace?" question. The path is quoted so an
# unusual value of $fileroot is not word-split by the shell.
unzip -o -d /tmp/ "/tmp/${fileroot}.zip"

In [0]:
# Moving the extracted CSV file from file:/tmp/ back to /FileStore/tables/.
# NOTE(review): the third positional argument of dbutils.fs.mv appears to be `recurse`,
# not an overwrite switch - confirm with dbutils.fs.help("mv") before relying on it to
# replace an existing file.

dbutils.fs.mv(f"file:/tmp/{fileroot}.csv", "/FileStore/tables/", True)

In [0]:
# Listing file:/tmp/ to inspect its contents after the move

dbutils.fs.ls('file:/tmp/')

In [0]:
# Confirming that <fileroot>.csv is now present under /FileStore/tables/

dbutils.fs.ls(f"/FileStore/tables/{fileroot}.csv")

##### Creating User-Defined Functions

In [0]:
# Defining a function for removing null values

def removing_nulls(rdd):
    """Return ``rdd`` without rows that are ``None`` or contain a ``None`` field.

    Args:
        rdd: Any object exposing ``filter(predicate)``; each row must be ``None``
            or an iterable of fields.

    Returns:
        Whatever ``rdd.filter`` returns for the keep-predicate.

    Note:
        A plain string row is iterated character by character, and a character
        is never ``None``, so every string row is kept.
    """
    def _has_no_nulls(row):
        # A row that is itself None cannot be iterated; treat it as null and drop it.
        if row is None:
            return False
        # Identity test: `is not None` cannot be fooled by a custom __ne__.
        return all(field is not None for field in row)

    return rdd.filter(_has_no_nulls)

In [0]:
# Defining a function for splitting datasets

def splitting_data(dataset, delimiter):
    """Split every record of ``dataset`` on ``delimiter``.

    Args:
        dataset: Object exposing ``map(func)`` whose records have a ``split`` method.
        delimiter: Separator passed to each record's ``split``.

    Returns:
        The result of ``dataset.map`` with each record replaced by its list of pieces.
    """
    def _split_record(record):
        return record.split(delimiter)

    return dataset.map(_split_record)

In [0]:
# Defining a function for removing headers 

def removing_header(rdd):
    """Drop the header from ``rdd``.

    The header is whatever ``rdd.first()`` returns. Every row equal to it is
    removed, not only the first occurrence.

    Args:
        rdd: Object exposing ``first()`` and ``filter(predicate)``.

    Returns:
        The result of ``rdd.filter`` keeping the rows that differ from the header.
    """
    first_row = rdd.first()

    def _differs_from_header(candidate):
        return candidate != first_row

    return rdd.filter(_differs_from_header)

In [0]:
# Defining a function for counting rows
def count_rdd(rdd):
    """Return the number of rows reported by ``rdd.count()``."""
    total_rows = rdd.count()
    return total_rows

##### Loading the clinicaltrial_2021 data

In [0]:
# Loading the clinicaltrial_2021 file as an RDD
clinicaltrial_2021_rdd = sc.textFile("/FileStore/tables/clinicaltrial_2021.csv")

In [0]:
# Filtering out possible null values
clinicaltrial_2021_rdd = removing_nulls(clinicaltrial_2021_rdd)

In [0]:
# Removing the delimiter
clinicaltrial_2021_rdd = splitting_data(clinicaltrial_2021_rdd, "|")

In [0]:
# Removing the header
clinicaltrial_2021_rdd = removing_header(clinicaltrial_2021_rdd)

In [0]:
clinicaltrial_2021_rdd.take(5)

##### Loading the pharma data

In [0]:
# Pointing fileroot at the pharma archive (pharma.zip / pharma.csv) for the following cells

fileroot = "pharma"

In [0]:
# Importing the os module and publishing the new fileroot value as the environment
# variable 'fileroot', which the %sh cell below reads as $fileroot.

import os
os.environ.update({'fileroot': fileroot})

In [0]:
# Copying <fileroot>.zip from /FileStore/tables/ to file:/tmp/

dbutils.fs.cp(f"/FileStore/tables/{fileroot}.zip", "file:/tmp/")

In [0]:
%sh
# -o overwrites files that already exist in /tmp/ without prompting, so re-running this
# cell cannot stall on unzip's interactive "replace?" question. The path is quoted so an
# unusual value of $fileroot is not word-split by the shell.
unzip -o -d /tmp/ "/tmp/${fileroot}.zip"

In [0]:
# Moving the extracted CSV file from file:/tmp/ back to /FileStore/tables/.
# NOTE(review): the third positional argument of dbutils.fs.mv appears to be `recurse`,
# not an overwrite switch - confirm with dbutils.fs.help("mv").
dbutils.fs.mv(f"file:/tmp/{fileroot}.csv", "/FileStore/tables/", True)

In [0]:
# Confirming that <fileroot>.csv is now present under /FileStore/tables/

dbutils.fs.ls(f"/FileStore/tables/{fileroot}.csv")

In [0]:
# Loading the pharma data
pharma_rdd = sc.textFile("/FileStore/tables/pharma.csv")

In [0]:
# Filtering out possible null values
pharma_rdd = removing_nulls(pharma_rdd)

In [0]:
# Removing the header
pharma_rdd = removing_header(pharma_rdd)

In [0]:
pharma_rdd.take(5)

#### Question 1

In [0]:
# Counting distinct studies with the user-defined count_rdd helper.
# The result is reported as a distinct count, so duplicate records are collapsed first.
# Rows are lists, which are not hashable, so each row is converted to a tuple before
# distinct() is applied.
# NOTE(review): this de-duplicates whole records; if one field is the study identifier,
# de-duplicating on that field alone may be the intended definition - confirm.
Studies_Count = count_rdd(clinicaltrial_2021_rdd.map(tuple).distinct())

In [0]:
# Reporting the study count computed above
studies_message = "The distinct number of studies conducted were:"
print(studies_message, Studies_Count)

#### Question 2

In [0]:
# Creating a new RDD and mapping the 6th element of each row to it
type_rdd = clinicaltrial_2021_rdd.map(lambda x: x[5])

In [0]:
# Counting the occurrences of each 'type' in the new RDD
count_of_types = type_rdd.countByValue()

In [0]:
# Sorting the values in descending order
result = sorted(count_of_types.items(), key=lambda x: x[1], reverse=True)

In [0]:
# Iterating through and printing each study type with its corresponding frequency
print("Study Types and Frequencies:")
for t, count in result:
    print(t, count)

#### Question 3

In [0]:
# Exploding every row with flatMap: the fields at index 6 and index 7 are each split on
# commas, and one 8-field tuple is emitted per (index-6 piece, index-7 piece) combination,
# with both pieces stripped of surrounding whitespace.

conditions_rdd = clinicaltrial_2021_rdd.flatMap(
    lambda row: [
        tuple(row[:6]) + (left.strip(), right.strip())
        for left in row[6].split(",")
        for right in row[7].split(",")
    ]
)

In [0]:
# Counting how often each comma-separated value of the field at index 7 occurs and keeping
# the five most frequent as a list of (value, count) pairs (takeOrdered returns a list).
# The values are read straight from clinicaltrial_2021_rdd rather than from conditions_rdd:
# conditions_rdd repeats every index-7 value once per comma-separated piece of the field at
# index 6, which would inflate the frequencies whenever that field contains a comma.
topConditions_rdd = clinicaltrial_2021_rdd \
                   .flatMap(lambda cols: cols[7].split(",")) \
                   .map(lambda condition: condition.strip()) \
                   .filter(lambda condition: condition != "") \
                   .map(lambda condition: (condition, 1)) \
                   .reduceByKey(lambda a, b: a + b) \
                   .takeOrdered(5, key=lambda x: -x[1])

In [0]:
# Printing each of the top 5 conditions next to its frequency

for entry in topConditions_rdd:
    condition, frequency = entry
    print(condition, frequency)

#### Question 4

In [0]:
# Creating a set of all pharmaceutical companies.
# For each pharma row, the text after the first '",' is taken and the value between its
# first pair of double quotes is extracted. The collected names are stored in a frozenset
# (previously a list) so the per-row `not in` membership test in the sponsor filter is a
# hash lookup instead of a scan over every company name.
pharmaceutical_companies = frozenset(
    pharma_rdd
    .map(lambda row: row.split('",')[1].split('"')[1])
    .collect()
)

In [0]:
# Counting the frequencies of sponsors who are not pharmaceutical companies
non_pharmaceutical_rdd = clinicaltrial_2021_rdd\
                          .filter(lambda row: row[1] not in pharmaceutical_companies) \
                            .map(lambda row: (row[1], 1)) \
                                .reduceByKey(lambda a, b: a + b)

In [0]:
# Obtaining the top 10 non-pharmaceutical companies with the highest frequency.
top_sponsors = non_pharmaceutical_rdd \
                .takeOrdered(10, key=lambda x: -x[1])

In [0]:
# Iterating to return the desired result
for sponsor, frequency in top_sponsors:
    print(sponsor, frequency)

#### Question 5

In [0]:
from datetime import datetime

# Creating a new RDD with selected columns from clinical trial data in 2021.
myRDD = clinicaltrial_2021_rdd \
            .map(lambda cols: (cols[0], cols[1], cols[2], cols[3], cols[4], cols[5], cols[6], cols[7]))

In [0]:
# Filtering out any entries without a completion date
myRDD = myRDD.filter(lambda cols: cols[4])    

In [0]:
# Creating a new RDD with completed trials in 2021.
completedRDD = myRDD \
                .filter(lambda cols: cols[2] == "Completed" and datetime.strptime(cols[4], "%b %Y").year == 2021)

In [0]:
# Mapping completion dates to month abbreviations.
monthRDD = completedRDD \
              .map(lambda cols: (datetime.strptime(cols[4], "%b %Y").strftime("%b"), 1))

In [0]:
# Reducing the RDD by key to count the trials per month.
count_rdd = monthRDD \
              .reduceByKey(lambda x, y: x + y)

In [0]:
# Defining a list of month abbreviations
months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

In [0]:
# Sorting by index of the months
results = count_rdd.sortBy(lambda x: months.index(x[0])).collect()

In [0]:
# Printing one line per month: the abbreviation left-aligned in 3 characters, then the count
for entry in results:
    month, count = entry
    print("{:<3} {}".format(month, count))

#### Question 5 - Visualization

In [0]:
import matplotlib.pyplot as plt

# Extracting the month names and counts into separate lists.
# The labels get their own name (month_labels) so the `months` list used to sort the
# results is not overwritten by this cell.
month_labels = [x[0] for x in results]
counts = [x[1] for x in results]

# Setting the color for each bar
colors = ["#3182bd", "#6baed6", "#9ecae1", "#c6dbef", "#e6550d", "#fd8d3c",
          "#fdae6b", "#fdd0a2", "#31a354", "#74c476", "#a1d99b", "#c7e9c0"]

# Plotting the data as a bar chart with colors, on an explicit Axes
fig, ax = plt.subplots()
ax.bar(month_labels, counts, color=colors)

# Setting the x-label and y-label
ax.set_xlabel("Month")
ax.set_ylabel("Number of Completed Trials")

# Setting the title
ax.set_title("Completed Clinical Trials in 2021")

# Displaying the plot
plt.show()

#### Further Analysis 1 (RDD)

Determine the 10 companies with the highest number of violations.

In [0]:
# Mapping each row to a tuple 
companyCounts = pharma_rdd.map(lambda row: (row.split('",')[1].replace('"', ''), 1))

In [0]:
# Reducing by key to count the number of violations for each company
companyViolations = companyCounts.reduceByKey(lambda x, y: x + y)

In [0]:
# Retrieving the top 10 companies with the highest number of violations
Top10_violators = companyViolations.takeOrdered(10, key=lambda x: -x[1])

In [0]:
# Printing the results
for company, count in Top10_violators:
    print(company + ": " + str(count))

#### Visualization

In [0]:
import matplotlib.pyplot as plt

# Converting the output into two lists
Parent_Company = [company for company, count in Top10_violators]
violationCounts = [count for company, count in Top10_violators]

# Creating a pie chart; the first element returned by ax.pie is the list of wedges
fig, ax = plt.subplots(figsize=(8, 6))
pie_parts = ax.pie(violationCounts, labels=Parent_Company, autopct='%1.1f%%', startangle=90)
wedges = pie_parts[0]

# Creating a legend showing the actual violation counts.
# The wedges are passed explicitly together with string labels, so each count is tied to
# its wedge instead of relying on the implicit ordering of a labels-only legend call.
ax.legend(wedges, [str(count) for count in violationCounts],
          title='Violation Counts', loc='upper left', bbox_to_anchor=(1.25, 1))

# Setting the title of the chart
ax.set_title('Top 10 Pharmaceutical Companies by Violation Counts')

# Displaying the plot
plt.show()