### Starting Spark Session

In [13]:
import findspark
# Runs before the first pyspark import in this notebook.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation.
findspark.init()

In [14]:
from pyspark.sql import SparkSession

# Session object used by every cell below; the application is named "MyApp".
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

In [15]:
# Confirm that `spark` is a SparkSession instance.
type(spark)

pyspark.sql.session.SparkSession

In [16]:
# List the attributes and methods exposed by the session object.
dir(spark)

['Builder',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activeSession',
 '_convert_from_pandas',
 '_createFromLocal',
 '_createFromRDD',
 '_create_dataframe',
 '_create_from_pandas_with_arrow',
 '_create_shell_session',
 '_getActiveSessionOrCreate',
 '_get_numpy_record_dtype',
 '_inferSchema',
 '_inferSchemaFromList',
 '_instantiatedSession',
 '_jconf',
 '_jsc',
 '_jsparkSession',
 '_jvm',
 '_repr_html_',
 '_sc',
 'builder',
 'catalog',
 'conf',
 'createDataFrame',
 'getActiveSession',
 'newSession',
 'range',
 'read',
 'readStream',
 'sparkContext',
 'sql',
 'stop',
 'streams',
 'table',
 'udf',
 'version']

In [17]:
# Show the signature and docstring of createDataFrame before using it.
help(spark.createDataFrame)

Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike'), ForwardRef('ArrayLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
    or a :class:`numpy.ndarray`.
    
    .. versionadded:: 2.0.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    data : :class:`RDD` or iterable
        an RDD of any kind of SQL data representation (:class:`Row`,
        :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`,
        :class:`pandas.DataFrame` or :class:`numpy.ndarray`.
    schema : :class:`pyspark.sql.types.DataType`, str or list, op

### Creating a Spark DataFrame from a list of tuples

In [18]:
# One tuple per row: (id, name).
data = [
    (1, "sk"),
    (2, "messi"),
    (3, "virat"),
]

In [22]:
# Only column names are given here; the column types are inferred from the tuples.
df = spark.createDataFrame(data, schema=["Id", "Name"])

In [23]:
# Print the rows as a text table.
df.show()

+---+-----+
| Id| Name|
+---+-----+
|  1|   sk|
|  2|messi|
|  3|virat|
+---+-----+



In [24]:
# Inspect the inferred schema: `Id` comes out as long and `Name` as string.
df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- Name: string (nullable = true)



### Importing types to create schema

In [26]:
# Import only the type classes this notebook uses rather than `*`, so the origin
# of each name is explicit and nothing else leaks into the notebook namespace.
# Extend this list if further cells need additional types.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

In [27]:
# Index by the column's exact name ("Id"). The lowercase 'id' resolved only
# because Spark matches column names case-insensitively by default.
type(df['Id'])

pyspark.sql.column.Column

In [28]:
# Check the Python type of the DataFrame object itself.
type(df)

pyspark.sql.dataframe.DataFrame

In [29]:
# Rows for the explicit-schema example: (id, name).
data2 = [
    (1, 'sk'),
    (2, 'srk'),
]

### Creating Schema using StructType and StructField

In [31]:
# Explicit schema: one StructField (name, data type) per column, wrapped in a StructType.
schema = StructType(
    [
        StructField('id', IntegerType()),
        StructField('name', StringType()),
    ]
)

In [32]:
# Confirm that the schema object is a StructType.
type(schema)

pyspark.sql.types.StructType

In [36]:
# Build the DataFrame with the explicit schema instead of relying on type inference.
df2 = spark.createDataFrame(data2, schema=schema)

In [34]:
# Show the class documentation for StructType.
help(StructType)

Help on class StructType in module pyspark.sql.types:

class StructType(DataType)
 |  StructType(fields: Optional[List[pyspark.sql.types.StructField]] = None)
 |  
 |  Struct type, consisting of a list of :class:`StructField`.
 |  
 |  This is the data type representing a :class:`Row`.
 |  
 |  Iterating a :class:`StructType` will iterate over its :class:`StructField`\s.
 |  A contained :class:`StructField` can be accessed by its name or position.
 |  
 |  Examples
 |  --------
 |  >>> from pyspark.sql.types import *
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1["f1"]
 |  StructField('f1', StringType(), True)
 |  >>> struct1[0]
 |  StructField('f1', StringType(), True)
 |  
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct2 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1 == struct2
 |  True
 |  >>> struct1 = StructType([StructField("f1", CharType(10), True)])
 |  >>> struct2 = StructType([S

In [35]:
# Show the class documentation for StructField (name, dataType, nullable, metadata).
help(StructField)

Help on class StructField in module pyspark.sql.types:

class StructField(DataType)
 |  StructField(name: str, dataType: pyspark.sql.types.DataType, nullable: bool = True, metadata: Optional[Dict[str, Any]] = None)
 |  
 |  A field in :class:`StructType`.
 |  
 |  Parameters
 |  ----------
 |  name : str
 |      name of the field.
 |  dataType : :class:`DataType`
 |      :class:`DataType` of the field.
 |  nullable : bool, optional
 |      whether the field can be null (None) or not.
 |  metadata : dict, optional
 |      a dict from string to simple type that can be toInternald to JSON automatically
 |  
 |  Examples
 |  --------
 |  >>> from pyspark.sql.types import StringType, StructField
 |  >>> (StructField("f1", StringType(), True)
 |  ...      == StructField("f1", StringType(), True))
 |  True
 |  >>> (StructField("f1", StringType(), True)
 |  ...      == StructField("f2", StringType(), True))
 |  False
 |  
 |  Method resolution order:
 |      StructField
 |      DataType
 |    

In [37]:
# Display the rows of the DataFrame built with the explicit schema.
df2.show()

+---+----+
| id|name|
+---+----+
|  1|  sk|
|  2| srk|
+---+----+



In [38]:
# Verify that `id` is an integer column, as declared in the schema.
df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)



### Creating a Spark DataFrame from a list of dictionaries

In [39]:
# One dictionary per row; the keys act as column names.
data3 = [
    {'id': 1, 'name': 'sk'},
    {'id': 2, 'name': 'srk'},
]

In [40]:
# No schema is passed: column names and types are both taken from the dictionaries.
df3 = spark.createDataFrame(data=data3)

In [41]:
# Display the rows of the DataFrame built from the dictionaries.
df3.show()

+---+----+
| id|name|
+---+----+
|  1|  sk|
|  2| srk|
+---+----+



In [42]:
# Inspect the inferred schema: with no explicit schema, `id` is inferred as long.
df3.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)

