#  <center><b><i> Basic Snippets related to Data Engineering </i></b></center>

In [None]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the session for this notebook: an already-running one is reused,
# otherwise a new one is started under the application name 'Practise'.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

# Switch on the Arrow option for PySpark on this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [2]:
# Sanity check: display the class of the object bound to `spark`.
type(spark)

pyspark.sql.session.SparkSession

In [4]:
# List every attribute and method name exposed by the session object
# (includes dunder and private names, so the output is long).
dir(spark)

['Builder',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activeSession',
 '_conf',
 '_convert_from_pandas',
 '_createFromLocal',
 '_createFromRDD',
 '_create_dataframe',
 '_create_from_pandas_with_arrow',
 '_create_shell_session',
 '_getActiveSessionOrCreate',
 '_get_numpy_record_dtype',
 '_inferSchema',
 '_inferSchemaFromList',
 '_instantiatedSession',
 '_jconf',
 '_jsc',
 '_jsparkSession',
 '_jvm',
 '_repr_html_',
 '_sc',
 'active',
 'addArtifact',
 'addArtifacts',
 'addTag',
 'builder',
 'catalog',
 'clearTags',
 'client',
 'conf',
 'copyFromLocalToFs',
 'createDataFrame',
 'getActiveSession',
 'ge

### 1. Creating a DataFrame

In [5]:
# Print the built-in documentation (signature and parameter descriptions)
# for the method used in the following cells.
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike'), ForwardRef('ArrayLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
    or a :class:`numpy.ndarray`.
    
    .. versionadded:: 2.0.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    data : :class:`RDD` or iterable
        an RDD of any kind of SQL data representation (:class:`Row`,
        :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`,
        :class:`pandas.DataFrame` or :class:`numpy.ndarray`.
    schema : :class:`pyspark.sql.types.DataType`, str or list, op

In [7]:
# Same Arrow setting as in the session-setup cell at the top of the notebook;
# re-applied here so this section also works when run on its own.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [9]:
# Import only the type classes this cell needs instead of a wildcard import,
# so it is clear where each name comes from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Each tuple is one row: (Id, Name).
data = [(1, 'Adarsh'), (2, 'Amruth')]

# Explicit schema: fixes both the column names and their data types,
# so Id is stored as an integer rather than an inferred type.
cols = StructType([
    StructField(name='Id', dataType=IntegerType()),
    StructField(name='Name', dataType=StringType()),
])

df = spark.createDataFrame(data=data, schema=cols)

# truncate=False prints full cell values instead of shortening long strings.
df.show(truncate=False)

df.printSchema()

+---+------+
|Id |Name  |
+---+------+
|1  |Adarsh|
|2  |Amruth|
+---+------+

root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)



In [10]:
# Shortcut: build the DataFrame from a list of dictionaries.
# The dictionary keys become the column names and the column types are
# inferred from the values (no explicit schema is passed).
data = [
    dict(Id=1, Name='Adarsh'),
    dict(Id=2, Name='Amruth'),
]

df = spark.createDataFrame(data)

df.show(truncate=False)

df.printSchema()

+---+------+
|Id |Name  |
+---+------+
|1  |Adarsh|
|2  |Amruth|
+---+------+

root
 |-- Id: long (nullable = true)
 |-- Name: string (nullable = true)



### 2. Reading a CSV file into a DataFrame

In [2]:
# Read a single CSV file, treating its first line as the header.
# No schema is supplied and schema inference is off, so every column
# is loaded as a string (see the printed schema).
df = spark.read.option('header', True).csv('Data source/emp1.csv')
df.show()
df.printSchema()

+---+-------+----------+
| Id|   Name|      City|
+---+-------+----------+
|  1| Adarsh| Bengaluru|
|  2| Amruth|    Mysuru|
+---+-------+----------+

root
 |-- Id: string (nullable = true)
 |--  Name: string (nullable = true)
 |--  City: string (nullable = true)



In [3]:
# Same file again, this time with inferSchema=True: Spark derives the column
# types from the data, so Id is now read as an integer instead of a string.
df = spark.read.csv(
    'Data source/emp1.csv',
    sep=',',
    header=True,
    inferSchema=True,
)
df.show()
df.printSchema()

+---+-------+----------+
| Id|   Name|      City|
+---+-------+----------+
|  1| Adarsh| Bengaluru|
|  2| Amruth|    Mysuru|
+---+-------+----------+

root
 |-- Id: integer (nullable = true)
 |--  Name: string (nullable = true)
 |--  City: string (nullable = true)



In [4]:
# Alternative spelling of a CSV read: name the format explicitly, pass reader
# options, then load the path. Schema inference is not enabled here, so all
# columns are strings.
df = (
    spark.read
    .format('csv')
    .options(header=True)
    .load('Data source/emp2.csv')
)
df.show()
df.printSchema()

+---+----------+---------+
| Id|      Name|     City|
+---+----------+---------+
|  3|   Krishna|   Mandya|
|  4| Shrinidhi| Virajpet|
+---+----------+---------+

root
 |-- Id: string (nullable = true)
 |--  Name: string (nullable = true)
 |--  City: string (nullable = true)



In [12]:
# Reading multiple CSV files in one call.
# Passing a list of paths works when the files share the same schema,
# whether they live in the same folder or in different folders.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(['Data source/emp1.csv', 'Data source/emp2.csv'])
)
df.show()
df.printSchema()

+---+----------+----------+
| Id|      Name|      City|
+---+----------+----------+
|  3|   Krishna|    Mandya|
|  4| Shrinidhi|  Virajpet|
|  1|    Adarsh| Bengaluru|
|  2|    Amruth|    Mysuru|
+---+----------+----------+

root
 |-- Id: integer (nullable = true)
 |--  Name: string (nullable = true)
 |--  City: string (nullable = true)



In [15]:
# Import only the type classes this cell needs instead of a wildcard import.
from pyspark.sql.types import IntegerType, StringType, StructType

# Build the schema field by field. With an explicit schema the column names
# and types come from here rather than from the file header or inference.
schema = (
    StructType()
    .add(field='Id', data_type=IntegerType())
    .add(field='Name', data_type=StringType())
    .add(field='City', data_type=StringType())
)

# header=True still tells the reader that the first line of each file is a
# header row and not data.
df = spark.read.csv(
    path=['Data source/emp1.csv', 'Data source/emp2.csv'],
    header=True,
    schema=schema,
)
df.show()
df.printSchema()

+---+----------+----------+
| Id|      Name|      City|
+---+----------+----------+
|  3|   Krishna|    Mandya|
|  4| Shrinidhi|  Virajpet|
|  1|    Adarsh| Bengaluru|
|  2|    Amruth|    Mysuru|
+---+----------+----------+

root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- City: string (nullable = true)



### 3. Writing a DataFrame to a CSV file

In [2]:
from pyspark.sql import *

In [3]:
# Same Arrow setting as in the session-setup cell at the top of the notebook.
# NOTE(review): this needs `spark` to exist already — run the session-setup cell first.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [4]:
# Print the class documentation for DataFrameWriter, the interface reached
# through `DataFrame.write` and used below to save a DataFrame.
help(DataFrameWriter)

Help on class DataFrameWriter in module pyspark.sql.readwriter:

class DataFrameWriter(OptionUtils)
 |  DataFrameWriter(df: 'DataFrame')
 |  
 |  Interface used to write a :class:`DataFrame` to external storage systems
 |  (e.g. file systems, key-value stores, etc). Use :attr:`DataFrame.write`
 |  to access this.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  .. versionchanged:: 3.4.0
 |      Supports Spark Connect.
 |  
 |  Method resolution order:
 |      DataFrameWriter
 |      OptionUtils
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, df: 'DataFrame')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  bucketBy(self, numBuckets: int, col: Union[str, List[str], Tuple[str, ...]], *cols: Optional[str]) -> 'DataFrameWriter'
 |      Buckets the output by the given columns. If specified,
 |      the output is laid out on the file system similar to Hive's bucketing scheme,
 |      but with a different bucket hash function and is not c

In [6]:
# NOTE(review): nothing in this cell uses a name from this wildcard import;
# it is kept in case later cells depend on it.
from pyspark.sql.types import *

# Column names only; the column types are left for Spark to infer from the rows.
cols = ['Id', 'Name']
# Each tuple is one row: (Id, Name).
data = [(1, 'Adarsh'), (2, 'Amruth')]

df = spark.createDataFrame(data, cols)

In [7]:
# Preview the DataFrame that will be written out in the next cell.
df.show()

+---+------+
| Id|  Name|
+---+------+
|  1|Adarsh|
|  2|Amruth|
+---+------+



In [1]:
# Write `df` as CSV into the 'Data source/NewEmp/' directory, including a
# header row and replacing any output already present there.
# This cell depends on `df` from the createDataFrame cell above: on a fresh
# kernel, run that cell first or this fails with a NameError.
df.write.mode("overwrite").option("header", True).csv('Data source/NewEmp/')

NameError: name 'df' is not defined