# Setting

In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive
# (such as the CSV loaded later from /content/drive/MyDrive/...) are reachable by path.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!wget -q https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
!tar xf spark-3.3.1-bin-hadoop3.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop3"

import findspark
findspark.init()


from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local")\
        .appName("my_first_pyspark")\
        .config('spark.ui.port','4050')\
        .getOrCreate()

sc = spark.sparkContext

# Data 분석 및 전처리

In [None]:
# Load the income CSV from Google Drive, treating the first row as the header.
# No schema is supplied or inferred here, so every column is read as a string.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load('/content/drive/MyDrive/Colab Notebooks/데이터분석을위한프로그래밍/income_data.csv')
)
df.show()

# Names of the derived columns produced later by the indexing and one-hot stages:
# "<column>_index" and "<column>_ohe" for every column in the file.
cols = df.columns
col_index = [f"{name}_index" for name in cols]
col_ohe = [f"{name}_ohe" for name in cols]

+---+-----------------+------+-------------+---------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|fnlwgt|    education|educational-num|      marital-status|        occupation|  relationship|               race| gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+-----------------+------+-------------+---------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516|    Bachelors|             13|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|        2174|           0|            40| United-States| <=50K|
| 50| Self-emp-not-inc| 83311|    Bachelors|             13|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|           0|           0|   

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler


# Stage 1: index every column (label included) into "<column>_index".
indexer = StringIndexer(inputCols=cols, outputCols=col_index)
# Stage 2: one-hot encode every index column into "<column>_ohe";
# dropLast=False keeps a vector slot for every category.
ohe_encoder = OneHotEncoder(inputCols=col_index, outputCols=col_ohe, dropLast=False)

# Feature inputs are all one-hot columns except the label's.
# Build a NEW list instead of aliasing col_ohe and calling .remove() on it:
# the in-place removal mutated col_ohe, so re-running this cell raised ValueError
# (and left col_ohe one entry shorter than col_index).
colcol = [name for name in col_ohe if name != "income_ohe"]

# Stage 3: concatenate the one-hot vectors into a single "features" vector.
assembler = VectorAssembler(inputCols=colcol, outputCol="features")


pipeline = Pipeline(stages=[indexer, ohe_encoder, assembler])
# NOTE(review): the pipeline is fitted on the full dataset, before the train/test split.
df_indexed = pipeline.fit(df).transform(df)
df_indexed.show()

+---+-----------------+------+-------------+---------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+---------+---------------+------------+---------------+---------------------+--------------------+----------------+------------------+----------+------------+------------------+------------------+--------------------+--------------------+------------+---------------+-------------+--------------------+---------------+-------------------+------------------+---------------+----------------+-------------+-------------+----------------+----------------+------------------+------------------+-------------+--------------------+
|age|        workclass|fnlwgt|    education|educational-num|      marital-status|        occupation|  relationship|               race| gender|capital-gain|capital-loss|hours-per-week|native-country|income|age_index|workclass_index|fnlwgt_index|education_index|educati

# 데이터 전처리
## 중복 데이터 처리

In [None]:
# Preprocessing: duplicate check.
# Compare the total number of rows with the number of distinct rows;
# a difference means the data contains exact duplicate rows.
total_rows = df_indexed.count()
distinct_rows = df_indexed.distinct().count()

print(total_rows)
print(distinct_rows)

32561
32537


In [None]:
# Remove exact duplicate rows, then repeat the check:
# the total and distinct row counts should now be equal.
df_indexed = df_indexed.dropDuplicates()

for row_count in (df_indexed.count(), df_indexed.distinct().count()):
    print(row_count)

32537
32537


### 쓰레기 컬럼제거

In [None]:
import pyspark.sql.functions as fn
# Fraction of missing (null) values per column.
# - Columns whose missing fraction is close to 100% are candidates for removal.

# fn.count(c) counts only the non-null values of column c, so
# 1 - count(c) / count(*) is the null fraction. The previous expression reported the
# non-null fraction under a "<column>_missing" label, which read as the opposite.
df_indexed.select(*[
    (1 - fn.count(c) / fn.count('*')).alias(c + '_missing') for c in df_indexed.columns
]).show()

# Author's conclusion from the original run: there is nothing to drop.
# NOTE(review): only nulls are counted; placeholder strings standing in for missing
# values, if the data has any, would not be detected here — confirm against the raw data.

+-----------+-----------------+--------------+-----------------+-----------------------+----------------------+------------------+--------------------+------------+--------------+--------------------+--------------------+----------------------+----------------------+--------------+-----------------+-----------------------+--------------------+-----------------------+-----------------------------+----------------------------+------------------------+--------------------------+------------------+--------------------+--------------------------+--------------------------+----------------------------+----------------------------+--------------------+---------------+---------------------+------------------+---------------------+---------------------------+--------------------------+----------------------+------------------------+----------------+------------------+------------------------+------------------------+--------------------------+--------------------------+------------------+------

## Feature vector 만들기 

### VectorAssembler

In [None]:
# Display the assembled "features" vectors in full (truncate=False disables cell truncation).
df_indexed.select("features").show(truncate =False)

+-------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                         |
+-------------------------------------------------------------------------------------------------------------------------------------------------+
|(22144,[2,73,2043,21738,21754,21762,21777,21784,21793,21795,21797,21916,22010,22103],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])  |
|(22144,[8,73,4830,21732,21748,21762,21773,21784,21790,21795,21797,21916,22009,22102],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])  |
|(22144,[4,73,3233,21731,21747,21763,21773,21785,21793,21796,21797,21916,22008,22102],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])  |
|(22144,[18,79,11306,21731,21747,21763,21772,21787,21790,21796,21797,21916,22019,22102],[1.0,1.0,1.0,1.0,1.0,1.0

### 정규화 StandardScaler
- vector rows의 각 차원 값을 정규화
- 파라미터 
  - withStd: True by default; 각 차원을 표준편차로 나누어 단위 표준편차(표준편차 1)를 갖도록 정규화. 평균 중심화는 하지 않으므로 결과 분포는 $N(\mu', 1)$ 형태가 됨 (평균은 0이 아닐 수 있음)
  - withMean: False by default; True로 설정하면 스케일링 전에 평균이 0이 되도록 중심화
  


In [None]:
from pyspark.ml.feature import StandardScaler

# Scale every dimension of "features" to unit standard deviation without centering
# (withMean=False), writing the result to "scaled_features".
standardscaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="features",
    outputCol="scaled_features",
)

# NOTE(review): the scaler is fitted on the full dataset; the train/test split happens afterwards.
scaler_model = standardscaler.fit(df_indexed)
df_imputed = scaler_model.transform(df_indexed)

# Compare the first six raw and scaled vectors side by side.
df_imputed.select("features", "scaled_features").show(6)

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|(22144,[2,73,2043...|(22144,[2,73,2043...|
|(22144,[8,73,4830...|(22144,[8,73,4830...|
|(22144,[4,73,3233...|(22144,[4,73,3233...|
|(22144,[18,79,113...|(22144,[18,79,113...|
|(22144,[29,77,511...|(22144,[29,77,511...|
|(22144,[2,77,4716...|(22144,[2,77,4716...|
+--------------------+--------------------+
only showing top 6 rows



## train,test split

In [None]:
# Random 80/20 split with a fixed seed; print the train size and
# leave the test size as the cell's displayed value.
split_frames = df_imputed.randomSplit(weights=[0.8, 0.2], seed=37)
train, test = split_frames
print(train.count())
test.count()

26112


6425

In [None]:
# Preview the scaled DataFrame (show() prints the first 20 rows by default).
df_imputed.show()

+---+------------+------+-------------+---------------+-------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+---------+---------------+------------+---------------+---------------------+--------------------+----------------+------------------+----------+------------+------------------+------------------+--------------------+--------------------+------------+---------------+-------------+--------------------+---------------+-------------------+------------------+---------------+----------------+-------------+-------------+----------------+----------------+------------------+------------------+-------------+--------------------+--------------------+
|age|   workclass|fnlwgt|    education|educational-num|     marital-status|        occupation|  relationship|               race| gender|capital-gain|capital-loss|hours-per-week|native-country|income|age_index|workclass_index|fnlwgt_index|education_inde

# LinearSVC

In [None]:
from pyspark.ml.classification import LinearSVC

# Linear SVM classifier on the scaled features, with the indexed income column as label.
# The variable is named `lr` although the estimator is a LinearSVC; training is
# capped at 3 iterations.
lr = LinearSVC(featuresCol="scaled_features", labelCol="income_index", maxIter=3)

# Fit on the training split, then score both splits.
model = lr.fit(train)
predict_train, predict_test = (model.transform(part) for part in (train, test))

# Show true labels next to predictions for the test split.
predict_test.select("income_index", "prediction").show()

+------------+----------+
|income_index|prediction|
+------------+----------+
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
|         0.0|       0.0|
+------------+----------+
only showing top 20 rows



In [None]:
# Compute accuracy: flag each test row whose prediction matches its label.
# Import only `expr`; a wildcard import from pyspark.sql.functions would shadow
# Python builtins such as sum, min, max and round with Spark column functions
# for every cell that runs afterwards.
from pyspark.sql.functions import expr

accuracy_df = predict_test.select("income_index", "prediction")
# "accuracy" is a boolean column: true where label and prediction are equal.
accuracy_df = accuracy_df.withColumn("accuracy", expr("income_index ==prediction"))
accuracy_df.show()

+------------+----------+--------+
|income_index|prediction|accuracy|
+------------+----------+--------+
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
+------------+----------+--------+
only showing top 20 rows



In [None]:
# Keep only the correctly classified rows, preview them, and count them.
correct_rows = accuracy_df.where("accuracy=='true'")
correct_rows.show()

true_count = correct_rows.count()
print(true_count)

+------------+----------+--------+
|income_index|prediction|accuracy|
+------------+----------+--------+
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
|         0.0|       0.0|    true|
+------------+----------+--------+
only showing top 20 rows

5343


In [None]:
# Report test accuracy as a percentage: correct predictions / all scored test rows.
# (The printed Korean text reads "Accuracy is ... %".)
print("정확도는 ", 100*true_count/accuracy_df.count(),"% 입니다")

정확도는  83.15953307392996 % 입니다
