# SparkSession

In [None]:
!pip install pyspark py4j

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('demo').getOrCreate()
spark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=389290149e608ff9517eecd695b328418dbebf0b6047eb3552febadca5b306a6
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


# import Functions and types

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# read without header

In [None]:
# No header or schema is supplied, so the single unnamed column is called `_c0`.
file = spark.read.csv('/content/address.txt')
file.show(truncate=False)

+-------------------------+
|_c0                      |
+-------------------------+
|Hyderabad-Telengana---123|
|Bangalore-Karnataka---245|
+-------------------------+



# Splitting a column and dropping columns

In [None]:
# Keep the '-'-separated pieces of each raw line in an array column.
df1 = file.withColumn('split', split('_c0', '-'))

# The '---' run produces empty strings at positions 2 and 3, so the id is item 4.
df2 = (
    df1
    .withColumn('city', col('split').getItem(0))
    .withColumn('state', col('split').getItem(1))
    .withColumn('Id', col('split').getItem(4))
    .drop('_c0', 'split')
)

df2.show()

+---------+---------+---+
|     city|    state| Id|
+---------+---------+---+
|Hyderabad|Telengana|123|
|Bangalore|Karnataka|245|
+---------+---------+---+



# reading csv file

In [None]:
# Employee CSV: first line is the header, types are inferred, the text 'null' means NULL.
df = (
    spark.read.format('csv')
    .options(header=True, inferSchema=True, nullValue='null')
    .load('/content/employee.csv')
)
df.show(5)
df.printSchema()
df.count()

+-----+------+--------+----+----------+----+----+------+------------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+--------+----+----------+----+----+------+------------+
| 7369| SMITH|   CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN|SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD|SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES| MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN|SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|
+-----+------+--------+----+----------+----+----+------+------------+
only showing top 5 rows

root
 |-- EMPNO: integer (nullable = true)
 |-- ENAME: string (nullable = true)
 |-- JOB: string (nullable = true)
 |-- MGR: integer (nullable = true)
 |-- HIREDATE: string (nullable = true)
 |-- SAL: integer (nullable = true)
 |-- COMM: integer (nullable = true)
 |-- DEPTNO: integer (nullable = true)
 |-- UPDATED_DATE: string (nullable = true

33

# Writing schema for dataFrame

In [None]:
from pyspark.sql.types import *

# Explicit employee schema: every column is nullable; only name and type vary.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("EMPNO", IntegerType()),
        ("ENAME", StringType()),
        ("JOB", StringType()),
        ("MGR", IntegerType()),
        ("HIREDATE", DateType()),
        ("SAL", IntegerType()),
        ("COMM", IntegerType()),
        ("DEPTNO", IntegerType()),
        ("UPDATED_DATE", DateType()),
    ]
])

# How to handle the two-digit year (yy) date format in PySpark for pre-2000 data

In [None]:
# Pipe-delimited employee file whose dates carry two-digit years.
yy_df = (
    spark.read.format('csv')
    .options(header=True, sep='|', inferSchema=True, nullValue='null')
    .load('/content/emp_pipe_yy.txt')
)

yy_df.show()
yy_df.printSchema()

In [None]:

# By default Spark resolves a two-digit year to a year after 2000.

# Setting spark.sql.legacy.timeParserPolicy to LEGACY changes that. This is only
# practical when few columns are affected; for larger data sets, ask the source
# system to deliver dates in a proper four-digit-year format instead.

spark.conf.set('spark.sql.legacy.timeParserPolicy', 'LEGACY')

# 'MM' is month-of-year. Lowercase 'mm' means minute-of-hour and would leave the
# month unparsed, so the pattern must use uppercase for the month part.
yy_df.withColumn('DATE', to_date('UPDATED_DATE', 'dd-MM-yy')).show()

# How to get the number of nulls in all columns

In [None]:

# Null count per column of `df`, the employee DataFrame loaded from employee.csv.
# when() yields a value only where the cell is null and count() skips nulls, so each
# aggregate counts exactly the null rows of its column.
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

# Word count program, step by step

In [None]:
# RDD with one element per line of the text file.
rd = spark.sparkContext.textFile("/content/word.txt")

print(type(rd))
print(rd.collect())  # the lines themselves
print(rd.count())    # number of lines

In [None]:
# Same lines, each encoded from str to UTF-8 bytes.
rd1 = rd.map(lambda line: line.encode('utf-8'))

print(rd1.collect())

In [None]:
# map: one output element per line, so the result is a list of word lists
rd2 = rd.map(lambda line: line.split(' '))

print(type(rd2))
print(rd2.collect())

In [None]:
# flatMap: the per-line word lists are flattened into a single list of words
rd3 = rd.flatMap(lambda line: line.split(' '))

print(rd3.collect())

In [None]:
# Pair every word with an initial count of 1.
rd4 = rd3.map(lambda word: (word, 1))

print(rd4.collect())

In [None]:
# reduceByKey: group the (word, 1) pairs from rd4 by word and sum their counts
rd5 = rd4.reduceByKey(lambda left, right: left + right)

print(rd5.collect())

# Word count in a single line - find the number of occurrences of each word

In [None]:
# The four word-count steps (read, flatten, pair, reduce) as one chained pipeline.
RDD = (
    spark.sparkContext.textFile('/content/word.txt')
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)

RDD.collect()

# How to Skip first few rows? ( Step By Step detailed)



In [None]:
# zipWithIndex turns each line into a (line, index) pair so rows can be dropped by position.
Rdd = (
    spark.sparkContext
    .textFile('/content/emp_pipe_skip.txt')
    .zipWithIndex()
)

Rdd.collect()

[('line 1', 0),
 ('line 2', 1),
 ('line 3', 2),
 ('EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|UPDATED_DATE', 3),
 ('7369|SMITH|CLERK|7902|17-12-1980|800|null|20|01-01-2022', 4),
 ('7499|ALLEN|SALESMAN|7698|20-02-1981|1600|300|30|02-01-2022', 5),
 ('7521|WARD|SALESMAN|7698|22-02-1981|1250|500|30|03-01-2022', 6),
 ('7566|JONES|MANAGER|7839|04-02-1981|2975|null|20|04-01-2022', 7),
 ('7654|MARTIN|SALESMAN|7698|21-09-1981|1250|1400|30|05-01-2022', 8),
 ('7698|SGR|MANAGER|7839|05-01-1981|2850|null|30|06-01-2022', 9),
 ('7782|RAVI|MANAGER|7839|06-09-1981|2450|null|10|07-01-2022', 10),
 ('7788|SCOTT|ANALYST|7566|19-04-1987|3000|null|20|08-01-2022', 11),
 ('7839|KING|PRESIDENT|null|null|5000|null|10|null', 12),
 ('7844|TURNER|SALESMAN|7698|09-08-1981|1500|0|30|01-02-2022', 13),
 ('7876|ADAMS|CLERK|7788|23-05-1987|1100|null|20|02-02-2022', 14),
 ('7900|JAMES|CLERK|7698|12-03-1981|950|null|30|03-02-2022', 15),
 ('7902|FORD|ANALYST|7566|12-03-1981|3000|null|20|04-02-2022', 16),
 ('7934|MILLER|CL

In [None]:
# Indexes 0-2 are the junk lines above the header; keep everything after them.
Rdd1 = Rdd.filter(lambda pair: pair[1] > 2)
Rdd1.collect()

[('EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|UPDATED_DATE', 3),
 ('7369|SMITH|CLERK|7902|17-12-1980|800|null|20|01-01-2022', 4),
 ('7499|ALLEN|SALESMAN|7698|20-02-1981|1600|300|30|02-01-2022', 5),
 ('7521|WARD|SALESMAN|7698|22-02-1981|1250|500|30|03-01-2022', 6),
 ('7566|JONES|MANAGER|7839|04-02-1981|2975|null|20|04-01-2022', 7),
 ('7654|MARTIN|SALESMAN|7698|21-09-1981|1250|1400|30|05-01-2022', 8),
 ('7698|SGR|MANAGER|7839|05-01-1981|2850|null|30|06-01-2022', 9),
 ('7782|RAVI|MANAGER|7839|06-09-1981|2450|null|10|07-01-2022', 10),
 ('7788|SCOTT|ANALYST|7566|19-04-1987|3000|null|20|08-01-2022', 11),
 ('7839|KING|PRESIDENT|null|null|5000|null|10|null', 12),
 ('7844|TURNER|SALESMAN|7698|09-08-1981|1500|0|30|01-02-2022', 13),
 ('7876|ADAMS|CLERK|7788|23-05-1987|1100|null|20|02-02-2022', 14),
 ('7900|JAMES|CLERK|7698|12-03-1981|950|null|30|03-02-2022', 15),
 ('7902|FORD|ANALYST|7566|12-03-1981|3000|null|20|04-02-2022', 16),
 ('7934|MILLER|CLERK|7782|01-03-1982|1300|null|10|05-02-2022', 17

In [None]:
# Discard the index again and split the line text on the '|' separator.
Rdd2 = Rdd1.map(lambda pair: pair[0].split('|'))

Rdd2.collect()


[['EMPNO',
  'ENAME',
  'JOB',
  'MGR',
  'HIREDATE',
  'SAL',
  'COMM',
  'DEPTNO',
  'UPDATED_DATE'],
 ['7369',
  'SMITH',
  'CLERK',
  '7902',
  '17-12-1980',
  '800',
  'null',
  '20',
  '01-01-2022'],
 ['7499',
  'ALLEN',
  'SALESMAN',
  '7698',
  '20-02-1981',
  '1600',
  '300',
  '30',
  '02-01-2022'],
 ['7521',
  'WARD',
  'SALESMAN',
  '7698',
  '22-02-1981',
  '1250',
  '500',
  '30',
  '03-01-2022'],
 ['7566',
  'JONES',
  'MANAGER',
  '7839',
  '04-02-1981',
  '2975',
  'null',
  '20',
  '04-01-2022'],
 ['7654',
  'MARTIN',
  'SALESMAN',
  '7698',
  '21-09-1981',
  '1250',
  '1400',
  '30',
  '05-01-2022'],
 ['7698',
  'SGR',
  'MANAGER',
  '7839',
  '05-01-1981',
  '2850',
  'null',
  '30',
  '06-01-2022'],
 ['7782',
  'RAVI',
  'MANAGER',
  '7839',
  '06-09-1981',
  '2450',
  'null',
  '10',
  '07-01-2022'],
 ['7788',
  'SCOTT',
  'ANALYST',
  '7566',
  '19-04-1987',
  '3000',
  'null',
  '20',
  '08-01-2022'],
 ['7839', 'KING', 'PRESIDENT', 'null', 'null', '5000', 'null'

In [None]:
# Column headers are the first remaining row. first() fetches just that row instead
# of materialising the whole RDD on the driver only to take element 0.
columns = Rdd2.first()

print(columns)

['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']


In [None]:
# The header row, kept so it can be filtered out of the data rows later.
# (The assignment must use the same name that is printed and filtered on: `skipline`.)
skipline = Rdd2.first()

print(skipline)

['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']


In [None]:
# Build a DataFrame from the RDD: drop the header row, then name the columns after it.
DF = Rdd2.filter(lambda row: row != skipline).toDF(columns)

DF.show()

DF.printSchema()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  02-02-2022|
| 7900

# Skip rows in given file ?

In [None]:
# Read, index, drop the first three lines and split on '|' in one pipeline.
r1 = (
    spark.sparkContext.textFile('/content/emp_pipe_skip.txt')
    .zipWithIndex()
    .filter(lambda pair: pair[1] > 2)
    .map(lambda pair: pair[0].split('|'))
)

r1.collect()

[['EMPNO',
  'ENAME',
  'JOB',
  'MGR',
  'HIREDATE',
  'SAL',
  'COMM',
  'DEPTNO',
  'UPDATED_DATE'],
 ['7369',
  'SMITH',
  'CLERK',
  '7902',
  '17-12-1980',
  '800',
  'null',
  '20',
  '01-01-2022'],
 ['7499',
  'ALLEN',
  'SALESMAN',
  '7698',
  '20-02-1981',
  '1600',
  '300',
  '30',
  '02-01-2022'],
 ['7521',
  'WARD',
  'SALESMAN',
  '7698',
  '22-02-1981',
  '1250',
  '500',
  '30',
  '03-01-2022'],
 ['7566',
  'JONES',
  'MANAGER',
  '7839',
  '04-02-1981',
  '2975',
  'null',
  '20',
  '04-01-2022'],
 ['7654',
  'MARTIN',
  'SALESMAN',
  '7698',
  '21-09-1981',
  '1250',
  '1400',
  '30',
  '05-01-2022'],
 ['7698',
  'SGR',
  'MANAGER',
  '7839',
  '05-01-1981',
  '2850',
  'null',
  '30',
  '06-01-2022'],
 ['7782',
  'RAVI',
  'MANAGER',
  '7839',
  '06-09-1981',
  '2450',
  'null',
  '10',
  '07-01-2022'],
 ['7788',
  'SCOTT',
  'ANALYST',
  '7566',
  '19-04-1987',
  '3000',
  'null',
  '20',
  '08-01-2022'],
 ['7839', 'KING', 'PRESIDENT', 'null', 'null', '5000', 'null'

In [None]:
# The header row serves both as the column names and as the row to filter out.
# Fetch it once with first() rather than collecting the full RDD twice.
columns = skipline = r1.first()
print(columns)
print(skipline)

['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']
['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']


In [None]:
# Data rows only (header removed), with the header values as column names.
sk_df = r1.filter(lambda row: row != skipline).toDF(columns)

sk_df.show(n=10)
print(sk_df.count())
sk_df.printSchema()
print(type(sk_df))


+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
+-----+------+---------+----+----------+----+----+------+------------+
only s

# How to get the 53rd-week years from the last 50 years

In [None]:
# One row per year with the text date "01/01/<year>", parsed to a real date and
# reduced to the years whose 1 January falls in week 53.
# 'MM' is month-of-year. The previous lowercase 'mm' means minute-of-hour, which
# never parsed the month at all.
we_df = (
    spark.createDataFrame(
        [(year, f"01/01/{year}") for year in range(1990, 2050)],
        ['id', 'year'],
    )
    .withColumn('date', to_date('year', 'dd/MM/yyyy'))
    .withColumn('week', weekofyear('date'))
    .filter('week = 53')
)
we_df.show()

+----+----------+----------+----+
|  id|      year|      date|week|
+----+----------+----------+----+
|1993|01/01/1993|1993-01-01|  53|
|1999|01/01/1999|1999-01-01|  53|
|2005|01/01/2005|2005-01-01|  53|
|2010|01/01/2010|2010-01-01|  53|
|2016|01/01/2016|2016-01-01|  53|
|2021|01/01/2021|2021-01-01|  53|
|2027|01/01/2027|2027-01-01|  53|
|2033|01/01/2033|2033-01-01|  53|
|2038|01/01/2038|2038-01-01|  53|
|2044|01/01/2044|2044-01-01|  53|
|2049|01/01/2049|2049-01-01|  53|
+----+----------+----------+----+



# Creating a dataframe

In [None]:
# (id, "02/01/<id>") rows for 1999 through 2049, built from a plain Python list.
cr_df = spark.createDataFrame(
    [(year, f"02/01/{year}") for year in range(1999, 2050)],
    ['id', 'year'],
)

cr_df.show()

+----+----------+
|  id|      year|
+----+----------+
|1999|02/01/1999|
|2000|02/01/2000|
|2001|02/01/2001|
|2002|02/01/2002|
|2003|02/01/2003|
|2004|02/01/2004|
|2005|02/01/2005|
|2006|02/01/2006|
|2007|02/01/2007|
|2008|02/01/2008|
|2009|02/01/2009|
|2010|02/01/2010|
|2011|02/01/2011|
|2012|02/01/2012|
|2013|02/01/2013|
|2014|02/01/2014|
|2015|02/01/2015|
|2016|02/01/2016|
|2017|02/01/2017|
|2018|02/01/2018|
+----+----------+
only showing top 20 rows



# How to read a data file with a variable/dynamic number of columns (file without a header)?

In [None]:
# Read each line as one string column `value`, then split it on ',' into array column `New`.
df = (
    spark.read.format('text')
    .load('/content/emp_without_header.txt')
    .withColumn('New', split('value', ','))
)

# Largest number of fields on any line. `max` here is the Spark aggregate brought in
# by the pyspark.sql.functions star import, not Python's builtin.
col_size = df.select(max(size('New')))

df.show(5, truncate=True)
col_size.show()
print(type(df))
df.printSchema()


+--------------------+--------------------+
|               value|                 New|
+--------------------+--------------------+
|7369,SMITH,CLERK,...|[7369, SMITH, CLE...|
|7499,ALLEN,SALESM...|[7499, ALLEN, SAL...|
|7521,WARD,SALESMA...|[7521, WARD, SALE...|
|7566,JONES,MANAGE...|[7566, JONES, MAN...|
|7654,MARTIN,SALES...|[7654, MARTIN, SA...|
+--------------------+--------------------+
only showing top 5 rows

+--------------+
|max(size(New))|
+--------------+
|             9|
+--------------+

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- value: string (nullable = true)
 |-- New: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [None]:
# Add one column per array position: col0 .. col<max-1>, each holding New[position].
for position in range(col_size.first()[0]):
    df = df.withColumn(f'col{position}', df['New'][position])
df.show(truncate=False)

+------------------------------------------------------------+----------------------------------------------------------------------+----+------+---------+----+----------+----+----+----+----------+
|value                                                       |New                                                                   |col0|col1  |col2     |col3|col4      |col5|col6|col7|col8      |
+------------------------------------------------------------+----------------------------------------------------------------------+----+------+---------+----+----------+----+----+----+----------+
|7369,SMITH,CLERK,7902,17-12-1980,800,null,20,01-01-2022     |[7369, SMITH, CLERK, 7902, 17-12-1980, 800, null, 20, 01-01-2022]     |7369|SMITH |CLERK    |7902|17-12-1980|800 |null|20  |01-01-2022|
|7499,ALLEN,SALESMAN,7698,20-02-1981,1600,300,30,02-01-2022  |[7499, ALLEN, SALESMAN, 7698, 20-02-1981, 1600, 300, 30, 02-01-2022]  |7499|ALLEN |SALESMAN |7698|20-02-1981|1600|300 |30  |02-01-2022|
|7521,WARD

In [None]:
# Keep only the expanded colN columns; the raw line and the array are no longer needed.
df1 = df.drop('value', 'New')
df1.show(truncate=False)
df1.printSchema()
print(type(df1))


+----+------+---------+----+----------+----+----+----+----------+
|col0|col1  |col2     |col3|col4      |col5|col6|col7|col8      |
+----+------+---------+----+----------+----+----+----+----------+
|7369|SMITH |CLERK    |7902|17-12-1980|800 |null|20  |01-01-2022|
|7499|ALLEN |SALESMAN |7698|20-02-1981|1600|300 |30  |02-01-2022|
|7521|WARD  |SALESMAN |7698|22-02-1981|1250|500 |30  |03-01-2022|
|7566|JONES |MANAGER  |7839|04-02-1981|2975|null|20  |04-01-2022|
|7654|MARTIN|SALESMAN |7698|21-09-1981|1250|1400|30  |05-01-2022|
|7698|SGR   |MANAGER  |7839|05-01-1981|2850|null|30  |06-01-2022|
|7782|RAVI  |MANAGER  |7839|06-09-1981|2450|null|10  |07-01-2022|
|7788|SCOTT |ANALYST  |7566|19-04-1987|3000|null|20  |08-01-2022|
|7839|KING  |PRESIDENT|null|null      |5000|null|10  |null      |
|7844|TURNER|SALESMAN |7698|09-08-1981|1500|0   |30  |01-02-2022|
|7876|ADAMS |CLERK    |7788|23-05-1987|1100|null|20  |02-02-2022|
|7900|JAMES |CLERK    |7698|12-03-1981|950 |null|30  |03-02-2022|
|7902|FORD

# udf register

In [None]:
from pyspark.sql.types import *

# UDF

def f1(x):
    """Return x squared minus x.

    Spark hands SQL NULL to a Python UDF as ``None``. Arithmetic on ``None``
    would raise ``TypeError`` and fail the whole job, so ``None`` is passed
    through unchanged and the resulting column value is NULL.
    """
    if x is None:
        return None
    return (x * x) - x

# Make f1 available twice: under the SQL name 'fun', and as a Python callable
# `fun` for DataFrame expressions. Both declare an integer result.
spark.udf.register(name='fun', f=f1, returnType=IntegerType())
fun = udf(f1, returnType=IntegerType())

print(f1(5))

In [None]:
# `df5` was never defined in this notebook, so this cell failed on a fresh kernel.
# NOTE(review): assumes the UDF demo is meant to run on the employee CSV (the file
# with ENAME and an integer SAL column) - confirm.
df5 = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .option('nullValue', 'null')
    .load('/content/employee.csv')
)
df5.select('ENAME', sqrt('SAL'), fun('SAL')).show()

# how to handle bad data

Spark read modes:

1. PERMISSIVE - allows bad data; this is Spark's default mode

2. FAILFAST - does not allow bad data; it raises an exception and does not process any further

3. DROPMALFORMED - drops bad records based on the schema; the bad records are not saved

4. badRecordsPath - loads the good data into the table and saves the bad records to another path

In [None]:
# First look at the channels file with an inferred schema.
bad = (
    spark.read.format('csv')
    .options(header=True, inferSchema=True, nullValue='null')
    .load('/content/channels.csv')
)

bad.show()

bad.schema

In [None]:
from pyspark.sql.types import *

# A custom schema must include a string column for the corrupt-record text
# (Spark's default name is _corrupt_record; here it is called 'BadData').
schema = (
    StructType()
    .add('CHANNEL_ID', IntegerType(), True)
    .add('CHANNEL_DESC', StringType(), True)
    .add('CHANNEL_CLASS', StringType(), True)
    .add('CHANNEL_CLASS_ID', IntegerType(), True)
    .add('CHANNEL_TOTAL', StringType(), True)
    .add('CHANNEL_TOTAL_ID', IntegerType(), True)
    .add("BadData", StringType(), True)
)

In [None]:
# Save bad records using mode PERMISSIVE together with columnNameOfCorruptRecord:
# rows that do not fit the schema are kept, with their raw text stored in 'BadData'.
bad1 = (
    spark.read.format('csv')
    .schema(schema)
    .option('mode', 'PERMISSIVE')
    .option('columnNameOfCorruptRecord', 'BadData')
    .option('header', True)
    .option('nullValue', 'null')
    .load('/content/channels.csv')
)
bad1.show()

# Good records: nothing was captured in 'BadData'. The helper column is then dropped.
# drop() silently ignores names it cannot resolve, so the spelling matches the schema
# exactly instead of relying on case-insensitive resolution.
goodData = bad1.filter('BadData is Null').drop('BadData')
goodData.show()

# Corrupt records: the raw line was captured in 'BadData'.
bad3 = bad1.filter('BadData is Not Null')
bad3.show()

In [None]:
# mode - FAILFAST: the read aborts with an exception on the first malformed row.

# `schema` ends with the extra 'BadData' field, which exists only to capture corrupt
# rows in PERMISSIVE mode and is not a column of the file. Strict modes check every
# row against the schema, so read with the real data columns only.
failfast_schema = StructType([f for f in schema.fields if f.name != 'BadData'])

# The exception is the point of this demonstration: show it, but let later cells run.
try:
    bad = (
        spark.read.format('csv')
        .schema(failfast_schema)
        .option('mode', 'FAILFAST')
        .option('header', True)
        .option('nullValue', 'null')
        .load('/content/channels.csv')
    )
    bad.show()
except Exception as err:
    print(f"FAILFAST aborted the read: {type(err).__name__}: {str(err).strip()[:500]}")

In [None]:
# mode - DROPMALFORMED: rows that do not fit the schema are silently discarded.

# `schema` ends with the extra 'BadData' field, which exists only to capture corrupt
# rows in PERMISSIVE mode and is not a column of the file. Leaving it in would make
# well-formed rows look short, so read with the real data columns only.
dropmalformed_schema = StructType([f for f in schema.fields if f.name != 'BadData'])

bad = (
    spark.read.format('csv')
    .schema(dropmalformed_schema)
    .option('mode', 'DROPMALFORMED')
    .option('header', True)
    .option('nullValue', 'null')
    .load('/content/channels.csv')
)
bad.show()

# Creating Data Frame from REST API

In [None]:
#REST API -Accessing the data over internet through Urls

import requests
import json


# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() turns an HTTP error (e.g. rate limiting) into an exception
# instead of writing an error payload to disk.
api = requests.request('GET', 'https://api.github.com/users/hadley/orgs', timeout=30)
api.raise_for_status()

data = api.json()

# Write one JSON document per line. json.dumps emits real JSON (double quotes,
# null/true/false), whereas formatting the dict with %s wrote a Python repr.
# Mode 'w' makes the cell idempotent (append mode duplicated every record on re-run),
# and the with-block closes and flushes the file before Spark reads it.
with open('/content/sample_data/apidata.json', 'w') as out_file:
    for record in data:
        out_file.write(json.dumps(record) + "\n")

api_df = spark.read.format('json').load('/content/sample_data/apidata.json')

In [None]:
# Container type and number of records in the decoded response body.
payload = api.json()
print(type(payload))
print(len(payload))

In [None]:
# Preview, schema and row count of the DataFrame built from the API response.
api_df.show(n=10)
api_df.printSchema()
api_df.count()