In [49]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('kaggle4396/cs-training.csv')

In [6]:
 # 去除重复值
# df.duplicated()
df.drop_duplicates(inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            150000 non-null  int64  
 1   SeriousDlqin2yrs                      150000 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 3   age                                   150000 non-null  int64  
 4   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 5   DebtRatio                             150000 non-null  float64
 6   MonthlyIncome                         120269 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 8   NumberOfTimes90DaysLate               150000 non-null  int64  
 9   NumberRealEstateLoansOrLines          150000 non-null  int64  
 10  NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 11  

In [8]:
# 缺失值按均值填充
for col in list(df.columns[df.isnull().sum()>0]):
    mean_val = df[col].mean()
    df[col].fillna(mean_val, inplace=True)

In [9]:
# 删除不分析的列
columns = ['RevolvingUtilizationOfUnsecuredLines',"DebtRatio","NumberOfOpenCreditLinesAndLoans","NumberOfTimes90DaysLate"]
df.drop(columns, axis=1, inplace=True)
# 保存到本地
df.to_csv('kaggle4396/data.csv')

 将文件上传至HDFS文件系统
 ```
 hdfs dfs -put kaggle4396/data.csv  ... 
 ```

# 三、使用Spark对数据处理分析

In [10]:
from pyspark.sql import SparkSession, Row
from pyspark import SparkConf

In [11]:
conf = SparkConf().setAppName("信用卡").setMaster("local[4]")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [52]:
filename='kaggle4396/data.csv'
df = spark.read.format('csv').option("header", 'true').option('inferSchema', 'true').load(filename)

In [53]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Unnamed: 0: integer (nullable = true)
 |-- SeriousDlqin2yrs: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- NumberOfTime30-59DaysPastDueNotWorse: integer (nullable = true)
 |-- MonthlyIncome: double (nullable = true)
 |-- NumberRealEstateLoansOrLines: integer (nullable = true)
 |-- NumberOfTime60-89DaysPastDueNotWorse: integer (nullable = true)
 |-- NumberOfDependents: double (nullable = true)



In [54]:
 # 修改列名
df = df.withColumnRenamed('SeriousDlqin2yrs','y')
df = df.withColumnRenamed('NumberOfTime30-59DaysPastDueNotWorse','30-59days')
df = df.withColumnRenamed('NumberOfTime60-89DaysPastDueNotWorse','60-89days')
df = df.withColumnRenamed('NumberRealEstateLoansOrLines','RealEstateLoans')
df = df.withColumnRenamed('NumberOfDependents','families')

In [77]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Unnamed: 0: integer (nullable = true)
 |-- y: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- 30-59days: integer (nullable = true)
 |-- MonthlyIncome: double (nullable = true)
 |-- RealEstateLoans: integer (nullable = true)
 |-- 60-89days: integer (nullable = true)
 |-- families: double (nullable = true)
 |-- age_bucket: double (nullable = true)



In [72]:
# 返回data_web.py的数据列表
all_list = []
total_y = []
for i in range(2):
    total_y.append(df.filter(df['y'] == i).count())
all_list.append(total_y)
total_y  # 共有逾期10026人，139974没有逾期，总人数150000

[139974, 10026]

In [56]:
from pyspark.ml import feature as ft

In [57]:
bins = [0, 30, 45, 60, 75, np.inf]

In [67]:
# # 年龄分析
# df_age  = df.select('age', 'y')
# agenum = []

# # 统计各个年龄段的人口
# for i in range(5):
#     agenum.append(
#         df_age.filter(
#             df['age'].between(bins[i], bins[i+1])
#         ).count()
#     )
# all_list.append(agenum)
# agenum, sum(agenum)  # 左右都是闭的

([10758, 42484, 57137, 38189, 11370], 159938)

In [59]:
bucketizer = ft.Bucketizer(
    splits=bins,
    inputCol='age', outputCol='age_bucket'
)
df = bucketizer.setHandleInvalid("keep").transform(df)
df.show()

+---+----------+---+---+---------+-----------------+---------------+---------+------------------+----------+
|_c0|Unnamed: 0|  y|age|30-59days|    MonthlyIncome|RealEstateLoans|60-89days|          families|age_bucket|
+---+----------+---+---+---------+-----------------+---------------+---------+------------------+----------+
|  0|         1|  1| 45|        2|           9120.0|              6|        0|               2.0|       2.0|
|  1|         2|  0| 40|        0|           2600.0|              0|        0|               1.0|       1.0|
|  2|         3|  0| 38|        1|           3042.0|              0|        0|               0.0|       1.0|
|  3|         4|  0| 30|        0|           3300.0|              0|        0|               0.0|       1.0|
|  4|         5|  0| 49|        1|          63588.0|              1|        0|               0.0|       2.0|
|  5|         6|  0| 74|        0|           3500.0|              1|        0|               1.0|       3.0|
|  6|         7|  0

In [74]:
df_age  = df.select('age', 'age_bucket', 'y')

In [73]:
all_list

[[139974, 10026]]

In [75]:
agenum = []
for i in range(5):
    agenum.append(
        df_age.filter(
            df_age['age_bucket']==float(i)
        ).count()
    )
agenum

[8821, 38982, 53879, 36948, 11370]

In [78]:
# 统计各个年龄段逾期与不逾期的数量
age_y = []
for i in range(5):
    y0 = df_age.filter(df['age_bucket'] == float(i)).filter(df['y'] == 0).count()
    y1 = df_age.filter(df['age_bucket'] == float(i)).filter(df['y'] == 1).count()
    age_y.append((y0, y1))
age_y

[(7786, 1035), (35282, 3700), (50088, 3791), (35677, 1271), (11141, 229)]

In [79]:
all_list.append(age_y)

In [81]:
# 有逾期记录的人的本次信用卡逾期数量
df_pastDue = df.select(df['30-59days'],df['60-89days'],df['y'])
numofpastdue = []
# 逾期30-59
numofpastdue.append(df_pastDue.filter(df_pastDue['30-59days'] > 0).count())
numofpastdue

[23982]