In [6]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import date, timedelta, datetime
import time

pd.pandas.set_option("display.max_columns", None)

In [7]:
sc = SparkSession.builder.appName("Oganesson")\
    .config('spark.sql.shuffle.partitions', '50')\
    .config('spark.driver.maxResultSize', '5g')\
    .config('spark.sql.execution.arrow.enabled', 'true')\
    .getOrCreate()

In [8]:
df = sc.read.csv('Documents/Credit Card fraud/creditcard.csv', header=True)

In [9]:
#df.show(10)
df.select('V19', 'V2', 'V3', 'V5', 'V11').show(10)

+-------------------+-------------------+------------------+-------------------+------------------+
|                V19|                 V2|                V3|                 V5|               V11|
+-------------------+-------------------+------------------+-------------------+------------------+
|  0.403992960255733|-0.0727811733098497|  2.53634673796914| -0.338320769942518|-0.551599533260813|
| -0.145783041325259|   0.26615071205963|  0.16648011335321| 0.0600176492822243|  1.61272666105479|
|  -2.26185709530414|  -1.34016307473609|  1.77320934263119| -0.503198133318193| 0.624501459424895|
|   -1.2326219700892| -0.185226008082898|  1.79299333957872|-0.0103088796030823|-0.226487263835401|
|  0.803486924960175|  0.877736754848451|    1.548717846511| -0.407193377311653|-0.822842877946363|
|-0.0331937877876282|  0.960523044882985|  1.14110934232219|   0.42098688077219|  1.34126198001957|
|-0.0455750446637976|  0.141003507049326|0.0453707735899449|  0.191880988597645| -1.41690724314928|


We can also make "When" calls for conditional statements for example: select ('something').when('some condition eg dataframe.')
Like in normal SQL there is a like clause which filters based on some string.
Startswith and Endswith scans for beginig or ending of a string respectively. eg df.name.startswith('Mr')

Substring clause extracts the strings in a specified range eg. df.surname.substring(1, 6).

withColumnRenamed changes the name that appears eg. df.withColumnRenamed('name', 'full_name').

to drop column you just go for df.drop('name', 'id', 'receipt')



## Inspecting the data

We can run basic exploratory data analysis with a few calls:
types, head, first, describe, columns, count(row count), distinct.count() for distinct count and explain() for plans including physical and logical

In [10]:
df.describe().show()

+-------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|summary|              Time|                  V1|                  V2|                  V3|                  V4|                  V5|                  V6|                  V7|                  V8|                  V9|                 V10|                 V11|                 V12|                 V13|                 V14|                 V15| 

There is also the groupby clause eg. df.groupby('department')

and the filter clause. eg df.filter(df['title']=='manager')

## Handling missing values

# 1.) replacing missing values
df.na.fill()    
df.fillna()

# 2.) Returning new df
df.na.drop()   
df.dropna()

# 3.) Replacing certain values with specific ones
df.na.replace(3, 5)    
df.replace()


## Repartitioning

you can repartition data either incresing the number of partitions with repartition(self, numPartitions) or decreasing the number of partitions using coalesce(self, numPartitions, shuffle=False)


## Running raw SQL operations

This can be done using the df.registerTempTable('table_name')

In [11]:
sql_table = df.registerTempTable('TempView')

In [12]:
sc.sql("SELECT V3, V2, V5, V11 FROM TempView LIMIT 15").show()

+------------------+-------------------+-------------------+------------------+
|                V3|                 V2|                 V5|               V11|
+------------------+-------------------+-------------------+------------------+
|  2.53634673796914|-0.0727811733098497| -0.338320769942518|-0.551599533260813|
|  0.16648011335321|   0.26615071205963| 0.0600176492822243|  1.61272666105479|
|  1.77320934263119|  -1.34016307473609| -0.503198133318193| 0.624501459424895|
|  1.79299333957872| -0.185226008082898|-0.0103088796030823|-0.226487263835401|
|    1.548717846511|  0.877736754848451| -0.407193377311653|-0.822842877946363|
|  1.14110934232219|  0.960523044882985|   0.42098688077219|  1.34126198001957|
|0.0453707735899449|  0.141003507049326|  0.191880988597645| -1.41690724314928|
|   1.0743803763556|   1.41796354547385|  0.948934094764157|-0.619467796121913|
|-0.113192212729871|  0.286157196276544|    2.6695986595986|-0.705116586646536|
|  1.04436655157316|   1.11959337641566|

## Output Formats 
rdd_convert = dataframe.rdd       
df.toJSON().first()        
df.toPandas()

In [13]:
df_2 = sc.sql("SELECT V3, V2, V5, V11 FROM TempView LIMIT 15")
print(f'You can see the first type is:  {type(df_2)}')
df_2_converted = df_2.rdd
print(f'When converted you now get: {type(df_2_converted)}')

You can see the first type is:  <class 'pyspark.sql.dataframe.DataFrame'>
When converted you now get: <class 'pyspark.rdd.RDD'>


In [14]:
json_next = df_2.toJSON().take(10)
print(type(json_next))
print(json_next)
pandas_next = df_2.toPandas()
print(pandas_next.head())
 print(type(pandas_next))

<class 'list'>
['{"V3":"2.53634673796914","V2":"-0.0727811733098497","V5":"-0.338320769942518","V11":"-0.551599533260813"}', '{"V3":"0.16648011335321","V2":"0.26615071205963","V5":"0.0600176492822243","V11":"1.61272666105479"}', '{"V3":"1.77320934263119","V2":"-1.34016307473609","V5":"-0.503198133318193","V11":"0.624501459424895"}', '{"V3":"1.79299333957872","V2":"-0.185226008082898","V5":"-0.0103088796030823","V11":"-0.226487263835401"}', '{"V3":"1.548717846511","V2":"0.877736754848451","V5":"-0.407193377311653","V11":"-0.822842877946363"}', '{"V3":"1.14110934232219","V2":"0.960523044882985","V5":"0.42098688077219","V11":"1.34126198001957"}', '{"V3":"0.0453707735899449","V2":"0.141003507049326","V5":"0.191880988597645","V11":"-1.41690724314928"}', '{"V3":"1.0743803763556","V2":"1.41796354547385","V5":"0.948934094764157","V11":"-0.619467796121913"}', '{"V3":"-0.113192212729871","V2":"0.286157196276544","V5":"2.6695986595986","V11":"-0.705116586646536"}', '{"V3":"1.04436655157316","V2":

You can drop duplicates using the dropDuplicates() function

In [15]:
# we can also use the select function directly on the dataframe
df.select('V3', 'V2','V5', 'V7').show(10)

+------------------+-------------------+-------------------+--------------------+
|                V3|                 V2|                 V5|                  V7|
+------------------+-------------------+-------------------+--------------------+
|  2.53634673796914|-0.0727811733098497| -0.338320769942518|   0.239598554061257|
|  0.16648011335321|   0.26615071205963| 0.0600176492822243| -0.0788029833323113|
|  1.77320934263119|  -1.34016307473609| -0.503198133318193|   0.791460956450422|
|  1.79299333957872| -0.185226008082898|-0.0103088796030823|    0.23760893977178|
|    1.548717846511|  0.877736754848451| -0.407193377311653|   0.592940745385545|
|  1.14110934232219|  0.960523044882985|   0.42098688077219|   0.476200948720027|
|0.0453707735899449|  0.141003507049326|  0.191880988597645|-0.00515900288250983|
|   1.0743803763556|   1.41796354547385|  0.948934094764157|    1.12063135838353|
|-0.113192212729871|  0.286157196276544|    2.6695986595986|   0.370145127676916|
|  1.04436655157

In [16]:
# Using the when function
df.select('V5', when(df.V5 <= 0, 1).otherwise(0)).show(10)

+-------------------+-------------------------------------+
|                 V5|CASE WHEN (V5 <= 0) THEN 1 ELSE 0 END|
+-------------------+-------------------------------------+
| -0.338320769942518|                                    1|
| 0.0600176492822243|                                    1|
| -0.503198133318193|                                    1|
|-0.0103088796030823|                                    1|
| -0.407193377311653|                                    1|
|   0.42098688077219|                                    1|
|  0.191880988597645|                                    1|
|  0.948934094764157|                                    1|
|    2.6695986595986|                                    0|
|   0.49936080649727|                                    1|
+-------------------+-------------------------------------+
only showing top 10 rows



We can also use the isin() function when searching for certain conditions and the like() function:

# 1.) df.select('author', 'title', df.title.like("% THE%"))
The isin:

# 2.)df[df.author.isin("Jon Snow", "Daenerias Taegerian" )]


# 3.) df.select('author', 'title', df.title.startswith(" Sa"))

# 4.) df.select('author', 'title', df.title.endswith("nt"))



In [17]:
# to assess the datatypes of each column
df.dtypes

[('Time', 'string'),
 ('V1', 'string'),
 ('V2', 'string'),
 ('V3', 'string'),
 ('V4', 'string'),
 ('V5', 'string'),
 ('V6', 'string'),
 ('V7', 'string'),
 ('V8', 'string'),
 ('V9', 'string'),
 ('V10', 'string'),
 ('V11', 'string'),
 ('V12', 'string'),
 ('V13', 'string'),
 ('V14', 'string'),
 ('V15', 'string'),
 ('V16', 'string'),
 ('V17', 'string'),
 ('V18', 'string'),
 ('V19', 'string'),
 ('V20', 'string'),
 ('V21', 'string'),
 ('V22', 'string'),
 ('V23', 'string'),
 ('V24', 'string'),
 ('V25', 'string'),
 ('V26', 'string'),
 ('V27', 'string'),
 ('V28', 'string'),
 ('Amount', 'string'),
 ('Class', 'string')]

## Writting and saving files when done

In [18]:
df_7 = df.select('Time', 'V7', 'Amount', 'Class')
print(type(df_7))
df_11 = df_7.toPandas()
print(type(df_11))
df_11.to_csv('Documents/Credit Card fraud/cred_csv.csv')
df_11.to_parquet('Documents/Credit Card fraud/cred_parq.parquet')
df_11.to_json('Documents/Credit Card fraud/cred_parq.json')

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Now to close the spark session

In [19]:
sc.stop()