In [37]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,sum, mean, max, min
from pyspark.ml.feature import StringIndexer
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [38]:
# Reuse the already-running Spark session if there is one; otherwise start one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [39]:
#Task 1
def clean_data(df):
    """Drop duplicate rows and rows with nulls, printing the row count before and after.

    Args:
        df: a PySpark DataFrame.

    Returns:
        The DataFrame after ``dropDuplicates()`` followed by ``dropna()``.
    """
    rows_before = df.count()
    print(f'Original count {rows_before}')

    # Exact duplicates go first, then any row that still holds a null.
    cleaned = df.dropDuplicates().dropna()

    rows_after = cleaned.count()
    print(f'New count {rows_after}')
    return cleaned

# Load the sensor readings (header row supplies column names, types are
# inferred) and run them straight through the cleaning step.
df = clean_data(
    spark.read.csv(
        'nuclear_plants_small_dataset.csv',
        header=True,
        inferSchema=True,
    )
)

Original count 996
New count 996


In [75]:
#Task 2

# Sensor feature columns: every loaded column except the 'Status' label.
# They are read from the DataFrame rather than typed by hand because the CSV
# header spells one column with a trailing space ('Power_range_sensor_3 ');
# a literal list without that space does not match the real column names and
# fails as soon as it is used to select columns.
features = [name for name in df.columns if name != 'Status']

#Filtering column Status based on a condition it is normal or abnormal.
normal_df = df.filter(df['Status'] == 'Normal')
abnormal_df = df.filter(df['Status'] == 'Abnormal')

def summary_stats(dataframe):
    """Build a mean/min/max/median summary table for the sensor columns.

    Args:
        dataframe: a PySpark DataFrame; its 'Status' column is dropped before
            the statistics are computed.

    Returns:
        A PySpark DataFrame (not pandas) with one row per statistic. The first
        column is named 'Statistic' and holds 'mean', 'min', 'max' and
        'median'; the remaining columns are the summarised input columns.
    """
    dfSummary = dataframe.drop('Status').summary('mean', 'min', 'max', '50%')
    # .summary() returns the statistic names as *values* of its 'summary'
    # column, so '50%' is a cell, not a column name. Renaming a column called
    # '50%' silently does nothing; the label has to be replaced in the column.
    dfSummary = dfSummary.replace('50%', 'median', subset=['summary'])
    dfSummary = dfSummary.withColumnRenamed('summary', 'Statistic')
    return dfSummary

# Summarise each status group, then print the normal table followed by the
# abnormal one.
normal_summary_stats, abnormal_summary_stats = (
    summary_stats(normal_df),
    summary_stats(abnormal_df),
)

normal_summary_stats.show()
abnormal_summary_stats.show()


['Status', 'Power_range_sensor_1', 'Power_range_sensor_2', 'Power_range_sensor_3 ', 'Power_range_sensor_4', 'Pressure_sensor_1', 'Pressure_sensor_2', 'Pressure_sensor_3', 'Pressure_sensor_4', 'Vibration_sensor_1', 'Vibration_sensor_2', 'Vibration_sensor_3', 'Vibration_sensor_4']
+---------+--------------------+--------------------+---------------------+--------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+
|Statistic|Power_range_sensor_1|Power_range_sensor_2|Power_range_sensor_3 |Power_range_sensor_4| Pressure_sensor_1| Pressure_sensor_2|Pressure_sensor_3|Pressure_sensor_4|Vibration_sensor_1|Vibration_sensor_2|Vibration_sensor_3|Vibration_sensor_4|
+---------+--------------------+--------------------+---------------------+--------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------