In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql.types import Row

In [2]:
spark = SparkSession.builder.appName('First App').getOrCreate()
sc = spark.sparkContext

In [3]:
sc.applicationId

u'application_1590214224778_12802'

In [4]:
raw_df = spark.read.text('hdfs:///user/edureka_448212/sqoop_import1/*')

In [5]:
raw_df.show(5)

+--------------------+
|               value|
+--------------------+
|1,Cash loans,M,N,...|
|0,Cash loans,F,N,...|
|0,Revolving loans...|
|0,Cash loans,F,N,...|
|0,Cash loans,M,N,...|
+--------------------+
only showing top 5 rows



In [6]:
df = raw_df.rdd.map(lambda r:r[0].split(',')).map(lambda arr:Row(
TARGET=int(arr[0]),
NAME_CONTRACT_TYPE=arr[1],
CODE_GENDER=arr[2],
FLAG_OWN_CAR=arr[3],
FLAG_OWN_REALTY=arr[4],
CNT_CHILDREN=int(arr[5]),
AMT_INCOME_TOTAL=float(arr[6]),
AMT_CREDIT=float(arr[7]),
AMT_ANNUITY=float(arr[8]),
NAME_EDUCATION_TYPE=arr[9]
)).toDF()

In [7]:
df.show(5)

+-----------+----------+----------------+------------+-----------+------------+---------------+------------------+-------------------+------+
|AMT_ANNUITY|AMT_CREDIT|AMT_INCOME_TOTAL|CNT_CHILDREN|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|NAME_CONTRACT_TYPE|NAME_EDUCATION_TYPE|TARGET|
+-----------+----------+----------------+------------+-----------+------------+---------------+------------------+-------------------+------+
|    24700.5|  406597.5|        202500.0|           0|          M|           N|              Y|        Cash loans|            Working|     1|
|    35698.5| 1293502.5|        270000.0|           0|          F|           N|              N|        Cash loans|      State servant|     0|
|     6750.0|  135000.0|         67500.0|           0|          M|           Y|              Y|   Revolving loans|            Working|     0|
|    29686.5|  312682.5|        135000.0|           0|          F|           N|              Y|        Cash loans|            Working|     0|
|    2

In [8]:
columns = [
'TARGET',
'NAME_CONTRACT_TYPE',
'CODE_GENDER',
'FLAG_OWN_CAR',
'FLAG_OWN_REALTY',
'CNT_CHILDREN',
'AMT_INCOME_TOTAL',
'AMT_CREDIT',
'AMT_ANNUITY',
'NAME_INCOME_TYPE',
'NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS',
'NAME_HOUSING_TYPE',
'DAYS_BIRTH',
'DAYS_EMPLOYED',
'FLAG_MOBIL',
'FLAG_EMP_PHONE',
'FLAG_WORK_PHONE',
'FLAG_CONT_MOBILE',
'FLAG_PHONE',
'CNT_FAM_MEMBERS',
'REGION_RATING_CLIENT',
'REGION_RATING_CLIENT_W_CITY',
'REG_REGION_NOT_LIVE_REGION',
'REG_REGION_NOT_WORK_REGION',
'ORGANIZATION_TYPE',
'FLAG_DOCUMENT_2',
'FLAG_DOCUMENT_3',
'FLAG_DOCUMENT_4',
'FLAG_DOCUMENT_5',
'FLAG_DOCUMENT_6',
'FLAG_DOCUMENT_7',
'FLAG_DOCUMENT_8',
'FLAG_DOCUMENT_9',
'FLAG_DOCUMENT_10',
'FLAG_DOCUMENT_11',
'FLAG_DOCUMENT_12'
]

In [9]:
data = spark.read.option('inferSchema',True).csv('hdfs:///user/edureka_448212/sqoop_import1/*').limit(1000)

In [10]:
data = data.toDF(*columns)

In [11]:
data.show(5)

+------+------------------+-----------+------------+---------------+------------+----------------+----------+-----------+----------------+--------------------+--------------------+-----------------+----------+-------------+----------+--------------+---------------+----------------+----------+---------------+--------------------+---------------------------+--------------------------+--------------------------+--------------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+----------------+
|TARGET|NAME_CONTRACT_TYPE|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|AMT_CREDIT|AMT_ANNUITY|NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|  NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|DAYS_BIRTH|DAYS_EMPLOYED|FLAG_MOBIL|FLAG_EMP_PHONE|FLAG_WORK_PHONE|FLAG_CONT_MOBILE|FLAG_PHONE|CNT_FAM_MEMBERS|REGION_RATING_CLIENT|REGION_RATING_CLIENT_W_CITY|REG_REGION_NOT_LIVE_REGION|RE

In [12]:
data.cache()

DataFrame[TARGET: int, NAME_CONTRACT_TYPE: string, CODE_GENDER: string, FLAG_OWN_CAR: string, FLAG_OWN_REALTY: string, CNT_CHILDREN: int, AMT_INCOME_TOTAL: double, AMT_CREDIT: double, AMT_ANNUITY: double, NAME_INCOME_TYPE: string, NAME_EDUCATION_TYPE: string, NAME_FAMILY_STATUS: string, NAME_HOUSING_TYPE: string, DAYS_BIRTH: int, DAYS_EMPLOYED: int, FLAG_MOBIL: int, FLAG_EMP_PHONE: int, FLAG_WORK_PHONE: int, FLAG_CONT_MOBILE: int, FLAG_PHONE: int, CNT_FAM_MEMBERS: double, REGION_RATING_CLIENT: int, REGION_RATING_CLIENT_W_CITY: int, REG_REGION_NOT_LIVE_REGION: int, REG_REGION_NOT_WORK_REGION: int, ORGANIZATION_TYPE: string, FLAG_DOCUMENT_2: int, FLAG_DOCUMENT_3: int, FLAG_DOCUMENT_4: int, FLAG_DOCUMENT_5: int, FLAG_DOCUMENT_6: int, FLAG_DOCUMENT_7: int, FLAG_DOCUMENT_8: int, FLAG_DOCUMENT_9: int, FLAG_DOCUMENT_10: int, FLAG_DOCUMENT_11: int, FLAG_DOCUMENT_12: int]

### Exploratory Data Analysis

In [13]:
from pyspark.sql.functions import isnan, when, count, col, StringType, countDistinct, avg

#### No of loans falling into each Target with percentage

In [14]:
data.groupBy('TARGET').count().withColumn('Percentage',col('count')*100/data.count()).show()

+------+-----+----------+
|TARGET|count|Percentage|
+------+-----+----------+
|     1|   70|       7.0|
|     0|  930|      93.0|
+------+-----+----------+



#### Number of missing values in each column

In [15]:
null_df = data.select([count(when(isnan(c), c)).alias(c) for c in data.columns]).toPandas()
null_df.head(5)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,...,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### View data types

In [16]:
data.dtypes

[('TARGET', 'int'),
 ('NAME_CONTRACT_TYPE', 'string'),
 ('CODE_GENDER', 'string'),
 ('FLAG_OWN_CAR', 'string'),
 ('FLAG_OWN_REALTY', 'string'),
 ('CNT_CHILDREN', 'int'),
 ('AMT_INCOME_TOTAL', 'double'),
 ('AMT_CREDIT', 'double'),
 ('AMT_ANNUITY', 'double'),
 ('NAME_INCOME_TYPE', 'string'),
 ('NAME_EDUCATION_TYPE', 'string'),
 ('NAME_FAMILY_STATUS', 'string'),
 ('NAME_HOUSING_TYPE', 'string'),
 ('DAYS_BIRTH', 'int'),
 ('DAYS_EMPLOYED', 'int'),
 ('FLAG_MOBIL', 'int'),
 ('FLAG_EMP_PHONE', 'int'),
 ('FLAG_WORK_PHONE', 'int'),
 ('FLAG_CONT_MOBILE', 'int'),
 ('FLAG_PHONE', 'int'),
 ('CNT_FAM_MEMBERS', 'double'),
 ('REGION_RATING_CLIENT', 'int'),
 ('REGION_RATING_CLIENT_W_CITY', 'int'),
 ('REG_REGION_NOT_LIVE_REGION', 'int'),
 ('REG_REGION_NOT_WORK_REGION', 'int'),
 ('ORGANIZATION_TYPE', 'string'),
 ('FLAG_DOCUMENT_2', 'int'),
 ('FLAG_DOCUMENT_3', 'int'),
 ('FLAG_DOCUMENT_4', 'int'),
 ('FLAG_DOCUMENT_5', 'int'),
 ('FLAG_DOCUMENT_6', 'int'),
 ('FLAG_DOCUMENT_7', 'int'),
 ('FLAG_DOCUMENT_8', 'i

#### Number of columns in each data type

In [17]:
from collections import Counter
print(Counter((x[1] for x in data.dtypes)))

Counter({'int': 24, 'string': 9, 'double': 4})


#### View unique values in all string columns

In [18]:
str_col_names = [x.name for x in data.schema.fields if x.dataType == StringType()]
unique_df = data.agg(*((countDistinct(col(c))).alias(c) for c in str_col_names))
unique_df.show()

+------------------+-----------+------------+---------------+----------------+-------------------+------------------+-----------------+-----------------+
|NAME_CONTRACT_TYPE|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|NAME_INCOME_TYPE|NAME_EDUCATION_TYPE|NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|ORGANIZATION_TYPE|
+------------------+-----------+------------+---------------+----------------+-------------------+------------------+-----------------+-----------------+
|                 2|          2|           2|              2|               4|                  4|                 5|                6|               50|
+------------------+-----------+------------+---------------+----------------+-------------------+------------------+-----------------+-----------------+



In [19]:
unique_df.toPandas().head(5)

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,ORGANIZATION_TYPE
0,2,2,2,2,4,4,5,6,50


#### Describe days birth column

In [20]:
data = data.withColumn('AGE', col('DAYS_BIRTH')/-365)
data.toPandas().head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,...,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,AGE
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,Working,...,0,0,0,0,0,0,0,0,0,25.920548
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,State servant,...,0,0,0,0,0,0,0,0,0,45.931507
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,Working,...,0,0,0,0,0,0,0,0,0,52.180822
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,Working,...,0,0,0,0,0,0,0,0,0,52.068493
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,Working,...,0,0,0,0,1,0,0,0,0,54.608219


In [21]:
data.select('DAYS_BIRTH','AGE').describe().show(5)

+-------+-----------------+------------------+
|summary|       DAYS_BIRTH|               AGE|
+-------+-----------------+------------------+
|  count|             1000|              1000|
|   mean|       -15872.748|43.486980821917804|
| stddev|4235.854370105366|11.605080466042105|
|    min|           -25104| 21.10958904109589|
|    max|            -7705| 68.77808219178083|
+-------+-----------------+------------------+



#### Describe days employed

In [22]:
data.select('DAYS_EMPLOYED').describe().show(5)

+-------+------------------+
|summary|     DAYS_EMPLOYED|
+-------+------------------+
|  count|              1000|
|   mean|         55733.906|
| stddev|134159.10852572627|
|    min|            -15632|
|    max|            365243|
+-------+------------------+



#### We will now dig deep into anomalies of DAY_EMPLOYED column

In [23]:
anom = data.filter(col('DAYS_EMPLOYED') == 365243)


In [24]:
anom.toPandas().head(5)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,...,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,AGE
0,0,Cash loans,F,N,Y,0,112500.0,1019610.0,33826.5,Pensioner,...,0,0,0,0,0,0,0,0,0,55.065753
1,0,Cash loans,F,N,Y,0,38419.155,148365.0,10678.5,Pensioner,...,0,0,1,0,0,0,0,0,0,55.936986
2,0,Cash loans,F,N,Y,0,83250.0,239850.0,23850.0,Pensioner,...,0,0,1,0,0,0,0,0,0,68.019178
3,0,Cash loans,F,N,Y,0,99000.0,247275.0,17338.5,Pensioner,...,0,0,0,0,0,0,0,0,0,65.534247
4,0,Cash loans,F,N,Y,0,108000.0,746280.0,42970.5,Pensioner,...,0,0,0,0,0,0,0,0,0,64.515068


In [25]:
non_anom = data.filter(col('DAYS_EMPLOYED') != 365243)

In [26]:
print('The non-anomalies default on %0.2f%% of loans' % (100 * non_anom.select(avg(non_anom.TARGET)).first()[0]))

The non-anomalies default on 7.13% of loans


In [27]:
print('The anomalies default on %0.2f%% of loans' % (100 * anom.select(avg(non_anom.TARGET)).first()[0]))

The anomalies default on 6.33% of loans


In [28]:
print('There are %d anomalous days of employment' % anom.count())

There are 158 anomalous days of employment


#### Create anomaly flag column

In [29]:
data = data.withColumn('DAYS_EMPLOYED_ANOM',col('DAYS_EMPLOYED') == 365243)

In [30]:
#Replace anomaly value with 0
data = data.withColumn('DAYS_EMPLOYED', when(col('DAYS_EMPLOYED') == 365243, 0).otherwise(col('DAYS_EMPLOYED')))

#### Effect of age on repayment by binning the column and the generating pivot table

In [31]:
data.select('AGE').describe().show()

+-------+------------------+
|summary|               AGE|
+-------+------------------+
|  count|              1000|
|   mean|43.486980821917804|
| stddev|11.605080466042105|
|    min| 21.10958904109589|
|    max| 68.77808219178083|
+-------+------------------+



In [14]:
from pyspark.ml.feature import Bucketizer

In [33]:
splits = [0, 25.0, 35.0, 55.0, 100.0]

In [15]:
help(Bucketizer)

Help on class Bucketizer in module pyspark.ml.feature:

class Bucketizer(pyspark.ml.wrapper.JavaTransformer, pyspark.ml.param.shared.HasInputCol, pyspark.ml.param.shared.HasOutputCol, pyspark.ml.util.JavaMLReadable, pyspark.ml.util.JavaMLWritable)
 |  Maps a column of continuous features to a column of feature buckets.
 |  
 |  >>> values = [(0.1,), (0.4,), (1.2,), (1.5,), (float("nan"),), (float("nan"),)]
 |  >>> df = spark.createDataFrame(values, ["values"])
 |  >>> bucketizer = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
 |  ...     inputCol="values", outputCol="buckets")
 |  >>> bucketed = bucketizer.setHandleInvalid("keep").transform(df).collect()
 |  >>> len(bucketed)
 |  6
 |  >>> bucketed[0].buckets
 |  0.0
 |  >>> bucketed[1].buckets
 |  0.0
 |  >>> bucketed[2].buckets
 |  1.0
 |  >>> bucketed[3].buckets
 |  2.0
 |  >>> bucketizer.setParams(outputCol="b").transform(df).head().b
 |  0.0
 |  >>> bucketizerPath = temp_path + "/bucketizer"
 |  >>> bucketizer.save(bu

In [34]:
bucketizer = Bucketizer(splits=splits, inputCol="AGE", outputCol="bucketedData")

In [35]:
bucketedData = bucketizer.transform(data)

In [36]:
bucketedData.toPandas().head(5)

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,...,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,AGE,DAYS_EMPLOYED_ANOM,bucketedData
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,Working,...,0,0,0,0,0,0,0,25.920548,False,1.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,State servant,...,0,0,0,0,0,0,0,45.931507,False,2.0
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,Working,...,0,0,0,0,0,0,0,52.180822,False,2.0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,Working,...,0,0,0,0,0,0,0,52.068493,False,2.0
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,Working,...,0,0,1,0,0,0,0,54.608219,False,2.0


In [37]:
print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))

Bucketizer output with 4 buckets


In [38]:
bucketedData.groupBy("bucketedData").pivot("TARGET").count().show()

+------------+---+---+
|bucketedData|  0|  1|
+------------+---+---+
|         1.0|210| 22|
|         2.0|496| 30|
|         3.0|188| 12|
|         0.0| 36|  6|
+------------+---+---+



#### Create new variables based on domain knowledge

In [39]:
bucketedData = bucketedData.withColumn('CREDIT_INCOME_PERCENT',col('AMT_CREDIT')/col('AMT_INCOME_TOTAL'))

In [40]:
bucketedData = bucketedData.withColumn('ANNUITY_INCOME_PERCENT',col('AMT_ANNUITY')/col('AMT_INCOME_TOTAL'))

In [41]:
bucketedData = bucketedData.withColumn('CREDIT_TERM',col('AMT_ANNUITY')/col('AMT_CREDIT'))

In [42]:
bucketedData = bucketedData.withColumn('DAYS_EMPLOYED_PERCENT',col('DAYS_EMPLOYED')/col('DAYS_BIRTH'))

#### We will now convert string column with only 2 unique values to a column of label indices NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY

In [43]:
from pyspark.ml import Pipeline

In [44]:
from pyspark.ml.feature import StringIndexer

In [45]:
indexer = StringIndexer(inputCol='NAME_CONTRACT_TYPE', outputCol='NAME_CONTRACT_TYPE_Index')

In [46]:
model = indexer.fit(bucketedData)

In [47]:
indexed = model.transform(bucketedData)

In [48]:
indexers = [StringIndexer(inputCol=column, outputCol=column+'_Index') for column in ['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY']]

In [49]:
pipeline = Pipeline(stages=indexers)

In [50]:
df_r = pipeline.fit(bucketedData).transform(bucketedData)

In [51]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [52]:
indexer = StringIndexer().setInputCol("NAME_INCOME_TYPE").setOutputCol("NAME_INCOME_TYPE_Index").fit(df_r)

In [53]:
indexed = indexer.transform(df_r)

In [54]:
encoder = OneHotEncoder().setInputCol("NAME_INCOME_TYPE_Index").setOutputCol("NAME_INCOME_TYPE_Vec")

In [55]:
encoded = encoder.transform(indexed)

In [56]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_Index") for column in ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE','ORGANIZATION_TYPE']]

In [57]:
encoder = [OneHotEncoder().setInputCol(column+"_Index").setOutputCol(column + "_Vec") for column in ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE','ORGANIZATION_TYPE']]

In [58]:
pipeline = Pipeline(stages=indexers + encoder)

In [59]:
encoded = pipeline.fit(df_r).transform(df_r)

In [60]:
feature_cols = [
'CNT_CHILDREN',
'AMT_INCOME_TOTAL',
'AMT_CREDIT',
'AMT_ANNUITY',
'DAYS_EMPLOYED',
'FLAG_MOBIL',
'FLAG_EMP_PHONE',
'FLAG_WORK_PHONE',
'FLAG_CONT_MOBILE',
'FLAG_PHONE',
'CNT_FAM_MEMBERS',
'REGION_RATING_CLIENT',
'REGION_RATING_CLIENT_W_CITY',
'REG_REGION_NOT_LIVE_REGION',
'REG_REGION_NOT_WORK_REGION',
'FLAG_DOCUMENT_2',
'FLAG_DOCUMENT_3',
'FLAG_DOCUMENT_4',
'FLAG_DOCUMENT_5',
'FLAG_DOCUMENT_6',
'FLAG_DOCUMENT_7',
'FLAG_DOCUMENT_8',
'FLAG_DOCUMENT_9',
'FLAG_DOCUMENT_10',
'FLAG_DOCUMENT_11',
'FLAG_DOCUMENT_12',
'NAME_CONTRACT_TYPE_Index',
'CODE_GENDER_Index',
'FLAG_OWN_CAR_Index',
'FLAG_OWN_REALTY_Index',
'NAME_INCOME_TYPE_Vec',
'NAME_EDUCATION_TYPE_Vec',
'ORGANIZATION_TYPE_Vec',
'AGE',
'DAYS_EMPLOYED_ANOM',
'bucketedData',
'CREDIT_INCOME_PERCENT',
'ANNUITY_INCOME_PERCENT',
'CREDIT_TERM',
'DAYS_EMPLOYED_PERCENT']

In [61]:
from pyspark.ml.feature import VectorAssembler

In [62]:
from pyspark.ml.linalg import Vectors

In [63]:
assembler = VectorAssembler().setInputCols(feature_cols).setOutputCol("features")

In [64]:
output = assembler.transform(encoded).withColumn("label",col("TARGET"))

In [65]:
from pyspark.ml.classification import LogisticRegression

In [66]:
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [67]:
lrModel = lr.fit(output)

In [68]:
print("Coefficients: "  + str(lrModel.coefficients))

Coefficients: (92,[],[])


In [69]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [70]:
transformed = lrModel.transform(output)

In [71]:
results = transformed.select(['prediction', 'label'])

In [72]:
results = results.withColumn("label", col("label").cast(DoubleType()))

In [73]:
predictionAndLabels=results.rdd

In [74]:
metrics = MulticlassMetrics(predictionAndLabels)

In [75]:
cm=metrics.confusionMatrix().toArray()

In [76]:
accuracy=(cm[0][0]+cm[1][1])/cm.sum()

In [77]:
print("Logistic Regression: accuracy",accuracy)

('Logistic Regression: accuracy', 0.93)


In [80]:
# before deploying we have to persist the model

In [79]:
pipeline = Pipeline(stages= [bucketizer,index, indexers+encoder, assembler, lr] )

# Train model. This also runs the indexers.
model = pipeline.fit(bucketedData)

model.save("'hdfs:///user/edureka_448212/pymodel")

NameError: name 'index' is not defined