In [5]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pyspark
from pyspark.sql import SparkSession
from delta import *
from pyspark.sql.types import ShortType

# Configure a Spark session builder with the Delta Lake SQL extension and the
# Delta catalog, then let the delta helper finish the configuration and create
# (or reuse) the session.
builder = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()






 Notebook for translating Python scikit-learn code to PySpark

In [30]:
# Load data: read the CSV using its first row as the header, let Spark infer
# the column types, and drop every row that contains a null.
df = spark.read.csv(
    "dataset_credit_risk.csv",
    header=True,
    inferSchema=True,
).dropna()


In [31]:
# Show schema: print each column's name, type and nullability.
df.printSchema()

root
 |-- loan_id: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- code_gender: string (nullable = true)
 |-- flag_own_car: string (nullable = true)
 |-- flag_own_realty: string (nullable = true)
 |-- cnt_children: integer (nullable = true)
 |-- amt_income_total: double (nullable = true)
 |-- name_income_type: string (nullable = true)
 |-- name_education_type: string (nullable = true)
 |-- name_family_status: string (nullable = true)
 |-- name_housing_type: string (nullable = true)
 |-- days_birth: integer (nullable = true)
 |-- days_employed: integer (nullable = true)
 |-- flag_mobil: integer (nullable = true)
 |-- flag_work_phone: integer (nullable = true)
 |-- flag_phone: integer (nullable = true)
 |-- flag_email: integer (nullable = true)
 |-- occupation_type: string (nullable = true)
 |-- cnt_fam_members: double (nullable = true)
 |-- status: integer (nullable = true)
 |-- birthday: string (nullable = true)
 |-- job_start_date: string (nullable = true)
 |-- loan_

In [32]:
# Sorted: order the rows by id, then by loan_date (both ascending).
df = df.orderBy("id", "loan_date")

In [33]:
# Show the first 3 rows, printing one field per line.
df.show(n=3, vertical=True)

-RECORD 0-----------------------------------
 loan_id             | 1042                 
 id                  | 5008806              
 code_gender         | M                    
 flag_own_car        | Y                    
 flag_own_realty     | Y                    
 cnt_children        | 0                    
 amt_income_total    | 112500.0             
 name_income_type    | Working              
 name_education_type | Secondary / secon... 
 name_family_status  | Married              
 name_housing_type   | House / apartment    
 days_birth          | -21474               
 days_employed       | -1134                
 flag_mobil          | 1                    
 flag_work_phone     | 0                    
 flag_phone          | 0                    
 flag_email          | 0                    
 occupation_type     | Security staff       
 cnt_fam_members     | 2.0                  
 status              | 0                    
 birthday            | 1962-12-02           
 job_start

In [10]:
# Preview the first 3 values of loan_date.
df.select("loan_date").show(n=3)

+----------+
| loan_date|
+----------+
|2019-02-07|
|2019-02-18|
|2019-02-24|
+----------+
only showing top 3 rows



In [11]:
# Replace loan_date with its value parsed as a date using the yyyy-MM-dd pattern.
df = df.withColumn(
    "loan_date",
    F.to_date("loan_date", format="yyyy-MM-dd"),
)

In [12]:
# Print the schema again to inspect the column types after the loan_date conversion.
df.printSchema()

root
 |-- loan_id: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- code_gender: string (nullable = true)
 |-- flag_own_car: string (nullable = true)
 |-- flag_own_realty: string (nullable = true)
 |-- cnt_children: integer (nullable = true)
 |-- amt_income_total: double (nullable = true)
 |-- name_income_type: string (nullable = true)
 |-- name_education_type: string (nullable = true)
 |-- name_family_status: string (nullable = true)
 |-- name_housing_type: string (nullable = true)
 |-- days_birth: integer (nullable = true)
 |-- days_employed: integer (nullable = true)
 |-- flag_mobil: integer (nullable = true)
 |-- flag_work_phone: integer (nullable = true)
 |-- flag_phone: integer (nullable = true)
 |-- flag_email: integer (nullable = true)
 |-- occupation_type: string (nullable = true)
 |-- cnt_fam_members: double (nullable = true)
 |-- status: integer (nullable = true)
 |-- birthday: string (nullable = true)
 |-- job_start_date: string (nullable = true)
 |-- loan_

In [13]:
#row_number()



In [14]:
# nb_previous_loans: how many loans the same id has before the current one.
# row_number() is used rather than dense_rank(): dense_rank() assigns the same
# rank to loans that share a loan_date and leaves no gap afterwards, so every
# later loan of that id would be under-counted. loan_id is a secondary sort key
# so that loans on the same date are numbered deterministically across runs.
df = df.withColumn(
    "nb_previous_loans",
    F.row_number().over(
        Window.partitionBy("id").orderBy(F.col("loan_date"), F.col("loan_id"))
    ) - 1,
)

In [15]:
# Order the rows by id, then by loan_date (both ascending).
df = df.orderBy("id", "loan_date")

In [16]:
# Show output: the loan counter next to the id and loan_date it is computed from.
df.select("id", "nb_previous_loans", "loan_date").show(n=3, vertical=True)

-RECORD 0-----------------------
 id                | 5008806    
 nb_previous_loans | 0          
 loan_date         | 2019-02-07 
-RECORD 1-----------------------
 id                | 5008806    
 nb_previous_loans | 1          
 loan_date         | 2019-02-18 
-RECORD 2-----------------------
 id                | 5008806    
 nb_previous_loans | 2          
 loan_date         | 2019-02-24 
only showing top 3 rows



In [17]:
#Window.currentRow

In [18]:
# avg_amount_loans_previous: mean loan_amount over the rows of the same id that
# come before the current row. The frame ends one row before the current one,
# so the current loan is excluded and the first loan of each id yields null.
# loan_id is added as a secondary sort key: with a row-based frame, loans that
# share a loan_date would otherwise be ordered arbitrarily and the average
# could differ between runs.
w = (
    Window.partitionBy("id")
    .orderBy(F.col("loan_date"), F.col("loan_id"))
    .rowsBetween(Window.unboundedPreceding, -1)
)
df = df.withColumn("avg_amount_loans_previous", F.avg("loan_amount").over(w))

In [19]:
# Show output: first 10 rows of the window features with their source columns.
df.select(
    "id",
    "nb_previous_loans",
    "loan_date",
    "avg_amount_loans_previous",
    "loan_amount",
).show(n=10, vertical=True)

-RECORD 0---------------------------------------
 id                        | 5008806            
 nb_previous_loans         | 0                  
 loan_date                 | 2019-02-07         
 avg_amount_loans_previous | null               
 loan_amount               | 107.07612254863264 
-RECORD 1---------------------------------------
 id                        | 5008806            
 nb_previous_loans         | 1                  
 loan_date                 | 2019-02-18         
 avg_amount_loans_previous | 107.07612254863264 
 loan_amount               | 102.58733003715705 
-RECORD 2---------------------------------------
 id                        | 5008806            
 nb_previous_loans         | 2                  
 loan_date                 | 2019-02-24         
 avg_amount_loans_previous | 104.83172629289484 
 loan_amount               | 154.74866156248083 
-RECORD 3---------------------------------------
 id                        | 5008806            
 nb_previous_loans  

In [35]:
# age: completed years between the birthday and today. The months elapsed are
# divided by 12 and floored. floor() replaces round(), which reported a person
# as one year older once more than six months had passed since their last
# birthday.
df = df.withColumn(
    "age",
    F.floor(
        F.months_between(
            F.current_date(),
            F.to_date(F.col("birthday"), "yyyy-MM-dd"),
            True,
        ) / 12
    ).cast("int"),
)

In [21]:
# Show output: the computed age next to the birthday it comes from.
df.select("age", "birthday").show()

+---+----------+
|age|  birthday|
+---+----------+
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
| 59|1962-12-02|
+---+----------+
only showing top 20 rows



In [22]:
# years_on_the_job: completed years between job_start_date and today. The
# months elapsed are divided by 12 and floored. floor() replaces round(), which
# counted a year as served once more than six months of it had passed.
df = df.withColumn(
    "years_on_the_job",
    F.floor(
        F.months_between(
            F.current_date(),
            F.to_date(F.col("job_start_date"), "yyyy-MM-dd"),
            True,
        ) / 12
    ).cast("int"),
)

In [23]:
# Show output: the computed tenure next to the job_start_date it comes from.
df.select("years_on_the_job", "job_start_date").show()

+----------------+--------------+
|years_on_the_job|job_start_date|
+----------------+--------------+
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
|               4|    2018-08-10|
+----------------+--------------+
only showing top 20 rows



In [34]:
# Encode flag_own_car as a short integer: 'N' -> 0, any other value -> 1.
# A native column expression replaces the Python UDF so the mapping is
# evaluated by Spark itself instead of sending every row through a Python
# worker. Null values fall through to otherwise() and map to 1, exactly as the
# UDF's `x == 'N'` test did.
df = df.withColumn(
    "flag_own_car",
    F.when(F.col("flag_own_car") == "N", 0).otherwise(1).cast("short"),
)

In [25]:
df.select(F.col('flag_own_car')).show()

+------------+
|flag_own_car|
+------------+
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
|           1|
+------------+
only showing top 20 rows



In [26]:
# Parquet format
#df.select('id', 'age', 'years_on_the_job', 'nb_previous_loans', 'avg_amount_loans_previous', 'flag_own_car', 'status').write.format("parquet").save("train_model.parquet")


In [27]:
#Delta Format
#df.select('id', 'age', 'years_on_the_job', 'nb_previous_loans', 'avg_amount_loans_previous', 'flag_own_car', 'status').write.format("delta").save("train_model.delta")

In [28]:
# CSV format
#df.select('id', 'age', 'years_on_the_job', 'nb_previous_loans', 'avg_amount_loans_previous', 'flag_own_car', 'status')\
#.repartition(1).write.format("csv").option("header", "true").save("train.csv")

#format("csv").save("train_model.csv")


In [29]:
# List the contents of the current working directory.
!ls

-					     train_model
dataset_credit_risk.csv			     train_model.csv
Features_engineering-Notebook_1-Spark.ipynb  train_model.delta
PipLine_ETL.ipynb			     train_model_.delta
spark-warehouse				     train_model.parquet
train.csv				     Untitled.ipynb
