In [57]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, StructType,DoubleType
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [58]:
spark = SparkSession.builder.appName("credit").getOrCreate()

In [59]:
spark

In [60]:
df = spark.read.csv("data.csv")

In [61]:
df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string, _c23: string, _c24: string, _c25: string, _c26: string, _c27: string, _c28: string, _c29: string, _c30: string, _c31: string, _c32: string, _c33: string, _c34: string, _c35: string, _c36: string]

In [62]:
data_schema = [StructField("TARGET", IntegerType(), True),
               StructField("NAME_CONTRACT_TYPE",StringType(),True),
               StructField("CODE_GENDER", StringType(), True),
               StructField("FLAG_OWN_CAR", StringType(), True),
               StructField("FLAG_OWN_REALTY", StringType(), True),
               StructField("CNT_CHILDREN", DoubleType(), True),
               StructField("AMT_INCOME_TOTAL", DoubleType(), True),
               StructField("AMT_CREDIT", DoubleType(), True),
               StructField("AMT_ANNUITY", DoubleType(), True),
               StructField("NAME_INCOME_TYPE", StringType(), True),
               StructField("NAME_EDUCATION_TYPE", StringType(), True),
               StructField("NAME_FAMILY_STATUS", StringType(), True),
               StructField("NAME_HOUSING_TYPE", StringType(), True),
               StructField("DAYS_BIRTH", IntegerType(), True),
               StructField("DAYS_EMPLOYED", IntegerType(), True),
               StructField("FLAG_MOBIL", IntegerType(), True),
               StructField("FLAG_EMP_PHONE", IntegerType(), True),
               StructField("FLAG_WORK_PHONE", IntegerType(), True),
               StructField("FLAG_CONT_MOBILE", IntegerType(), True),
               StructField("FLAG_PHONE", DoubleType(), True),
               StructField("CNT_FAM_MEMBERS", DoubleType(), True),
               StructField("REGION_RATING_CLIENT", IntegerType(), True),
               StructField("REGION_RATING_CLIENT_W_CITY", IntegerType(), True),
               StructField("REG_REGION_NOT_LIVE_REGION", IntegerType(), True),
               StructField("REG_REGION_NOT_WORK_REGION", StringType(), True),
               StructField("ORGANIZATION_TYPE", StringType(), True),
               StructField("FLAG_DOCUMENT_2", DoubleType(), True),
               StructField("FLAG_DOCUMENT_3", DoubleType(), True),
               StructField("FLAG_DOCUMENT_4", DoubleType(), True),
               StructField("FLAG_DOCUMENT_5", DoubleType(), True),
               StructField("FLAG_DOCUMENT_6", DoubleType(), True),
               StructField("FLAG_DOCUMENT_7", DoubleType(), True),
               StructField("FLAG_DOCUMENT_8", DoubleType(), True),
               StructField("FLAG_DOCUMENT_9", DoubleType(), True),
               StructField("FLAG_DOCUMENT_10", DoubleType(), True),
               StructField("FLAG_DOCUMENT_11", DoubleType(), True),
               StructField("FLAG_DOCUMENT_12", DoubleType(), True)]

In [63]:
final_schema = StructType(fields=data_schema)

In [64]:
StructType

pyspark.sql.types.StructType

In [65]:
df = spark.read.csv("data.csv", schema=final_schema)

In [66]:
df.printSchema()

root
 |-- TARGET: integer (nullable = true)
 |-- NAME_CONTRACT_TYPE: string (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: double (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- AMT_CREDIT: double (nullable = true)
 |-- AMT_ANNUITY: double (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- DAYS_BIRTH: integer (nullable = true)
 |-- DAYS_EMPLOYED: integer (nullable = true)
 |-- FLAG_MOBIL: integer (nullable = true)
 |-- FLAG_EMP_PHONE: integer (nullable = true)
 |-- FLAG_WORK_PHONE: integer (nullable = true)
 |-- FLAG_CONT_MOBILE: integer (nullable = true)
 |-- FLAG_PHONE: double (nullable = true)
 |-- CNT_FAM_MEMBERS: double (nullable = true)
 |-- REGION_RATING_CLIENT: integ

In [67]:
df.describe().select("AMT_CREDIT","AMT_CREDIT").show()



+------------------+------------------+
|        AMT_CREDIT|        AMT_CREDIT|
+------------------+------------------+
|            307497|            307497|
| 599027.0918984575| 599027.0918984575|
|402493.94600611855|402493.94600611855|
|           45000.0|           45000.0|
|         4050000.0|         4050000.0|
+------------------+------------------+



                                                                                

In [68]:
print((df.count(), len(df.columns)))

(307497, 37)


In [70]:
df.toPandas().head()

                                                                                

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,...,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12
0,1,Cash loans,M,N,Y,0.0,202500.0,406597.5,24700.5,Working,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,Cash loans,F,N,N,0.0,270000.0,1293502.5,35698.5,State servant,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Revolving loans,M,Y,Y,0.0,67500.0,135000.0,6750.0,Working,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Cash loans,F,N,Y,0.0,135000.0,312682.5,29686.5,Working,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,Cash loans,M,N,Y,0.0,121500.0,513000.0,21865.5,Working,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [71]:
categorical_cols = ["NAME_CONTRACT_TYPE", "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "NAME_INCOME_TYPE", "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "ORGANIZATION_TYPE"]
num_cols = ["CNT_CHILDREN", "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_ANNUITY", "DAYS_BIRTH", "DAYS_EMPLOYED", "FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_WORK_PHONE", "FLAG_CONT_MOBILE", "FLAG_PHONE", "CNT_FAM_MEMBERS", "REGION_RATING_CLIENT", "REGION_RATING_CLIENT_W_CITY", "REG_REGION_NOT_LIVE_REGION", "REG_REGION_NOT_WORK_REGION", "FLAG_DOCUMENT_2",'FLAG_DOCUMENT_3','FLAG_DOCUMENT_4','FLAG_DOCUMENT_5','FLAG_DOCUMENT_6','FLAG_DOCUMENT_7','FLAG_DOCUMENT_8','FLAG_DOCUMENT_9','FLAG_DOCUMENT_10','FLAG_DOCUMENT_11','FLAG_DOCUMENT_12']

In [72]:
for x in num_cols:
    df = df.withColumn(x,df[x].cast(DoubleType()))

In [73]:
stages = []
for col in categorical_cols:
    indexer = StringIndexer(inputCol=col,outputCol=col+'Ind')
    stages.append(indexer)

In [74]:
stages

[StringIndexer_6daf480359f5,
 StringIndexer_34e88d4ed380,
 StringIndexer_d99db9c4406c,
 StringIndexer_d3d0d2324371,
 StringIndexer_e11602a6ca7b,
 StringIndexer_067b73eca9a0,
 StringIndexer_63c47571533c,
 StringIndexer_7f5bef723101,
 StringIndexer_eb37e7006b34]

In [75]:
label_indexer = StringIndexer(inputCol = "TARGET", outputCol = "TARGETIND")
stages.append(label_indexer)

In [76]:
categorical_ind_cols = []
for i in categorical_cols:
    categorical_ind_cols.append(i+'Ind')

In [77]:
assembler = VectorAssembler(inputCols=categorical_ind_cols + num_cols, outputCol='features')
stages.append(assembler)

In [78]:
stages

[StringIndexer_6daf480359f5,
 StringIndexer_34e88d4ed380,
 StringIndexer_d99db9c4406c,
 StringIndexer_d3d0d2324371,
 StringIndexer_e11602a6ca7b,
 StringIndexer_067b73eca9a0,
 StringIndexer_63c47571533c,
 StringIndexer_7f5bef723101,
 StringIndexer_eb37e7006b34,
 StringIndexer_b8c0d670b9c8,
 VectorAssembler_59ec68525464]

In [79]:
df.dtypes

[('TARGET', 'int'),
 ('NAME_CONTRACT_TYPE', 'string'),
 ('CODE_GENDER', 'string'),
 ('FLAG_OWN_CAR', 'string'),
 ('FLAG_OWN_REALTY', 'string'),
 ('CNT_CHILDREN', 'double'),
 ('AMT_INCOME_TOTAL', 'double'),
 ('AMT_CREDIT', 'double'),
 ('AMT_ANNUITY', 'double'),
 ('NAME_INCOME_TYPE', 'string'),
 ('NAME_EDUCATION_TYPE', 'string'),
 ('NAME_FAMILY_STATUS', 'string'),
 ('NAME_HOUSING_TYPE', 'string'),
 ('DAYS_BIRTH', 'double'),
 ('DAYS_EMPLOYED', 'double'),
 ('FLAG_MOBIL', 'double'),
 ('FLAG_EMP_PHONE', 'double'),
 ('FLAG_WORK_PHONE', 'double'),
 ('FLAG_CONT_MOBILE', 'double'),
 ('FLAG_PHONE', 'double'),
 ('CNT_FAM_MEMBERS', 'double'),
 ('REGION_RATING_CLIENT', 'double'),
 ('REGION_RATING_CLIENT_W_CITY', 'double'),
 ('REG_REGION_NOT_LIVE_REGION', 'double'),
 ('REG_REGION_NOT_WORK_REGION', 'double'),
 ('ORGANIZATION_TYPE', 'string'),
 ('FLAG_DOCUMENT_2', 'double'),
 ('FLAG_DOCUMENT_3', 'double'),
 ('FLAG_DOCUMENT_4', 'double'),
 ('FLAG_DOCUMENT_5', 'double'),
 ('FLAG_DOCUMENT_6', 'double'),
 

In [80]:
pipeline = Pipeline(stages=stages)

In [81]:
pipelineModel = pipeline.fit(df)
dataset = pipelineModel.transform(df)

In [82]:
dataset.head()

Row(TARGET=1, NAME_CONTRACT_TYPE='Cash loans', CODE_GENDER='M', FLAG_OWN_CAR='N', FLAG_OWN_REALTY='Y', CNT_CHILDREN=0.0, AMT_INCOME_TOTAL=202500.0, AMT_CREDIT=406597.5, AMT_ANNUITY=24700.5, NAME_INCOME_TYPE='Working', NAME_EDUCATION_TYPE='Secondary / secondary special', NAME_FAMILY_STATUS='Single / not married', NAME_HOUSING_TYPE='House / apartment', DAYS_BIRTH=-9461.0, DAYS_EMPLOYED=-637.0, FLAG_MOBIL=1.0, FLAG_EMP_PHONE=1.0, FLAG_WORK_PHONE=0.0, FLAG_CONT_MOBILE=1.0, FLAG_PHONE=1.0, CNT_FAM_MEMBERS=1.0, REGION_RATING_CLIENT=2.0, REGION_RATING_CLIENT_W_CITY=2.0, REG_REGION_NOT_LIVE_REGION=0.0, REG_REGION_NOT_WORK_REGION=0.0, ORGANIZATION_TYPE='Business Entity Type 3', FLAG_DOCUMENT_2=0.0, FLAG_DOCUMENT_3=1.0, FLAG_DOCUMENT_4=0.0, FLAG_DOCUMENT_5=0.0, FLAG_DOCUMENT_6=0.0, FLAG_DOCUMENT_7=0.0, FLAG_DOCUMENT_8=0.0, FLAG_DOCUMENT_9=0.0, FLAG_DOCUMENT_10=0.0, FLAG_DOCUMENT_11=0.0, FLAG_DOCUMENT_12=0.0, NAME_CONTRACT_TYPEInd=0.0, CODE_GENDERInd=1.0, FLAG_OWN_CARInd=0.0, FLAG_OWN_REALTYInd=0

In [83]:
dataset = dataset.select(['features','TARGETIND'])

In [84]:
(trainData, testData) = dataset.randomSplit([0.8, 0.2])

In [101]:
lr = LogisticRegression(labelCol="TARGETIND", featuresCol="features",maxIter=10)

In [102]:
model=lr.fit(trainData)

                                                                                

In [103]:
predict_train=model.transform(trainData)

In [104]:
predict_test=model.transform(testData)

In [105]:
predict_test.select("TARGETIND","prediction").show(10)

+---------+----------+
|TARGETIND|prediction|
+---------+----------+
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
|      0.0|       0.0|
+---------+----------+
only showing top 10 rows



                                                                                

In [106]:
evaluator = MulticlassClassificationEvaluator(labelCol="TARGETIND", predictionCol="prediction", metricName="accuracy")

In [107]:
accuracy = evaluator.evaluate(predict_test)

                                                                                

In [108]:
(print('Accuracy =',accuracy * 100,'%'))

Accuracy = 91.66936005171299 %


In [111]:
lr

LogisticRegression_95172d794004

In [None]:
import pickle
pickle_out = open("model.pkl", "wb")
pickle.dump(lr, pickle_out)
pickle_out.close()