In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!wget -q https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
!tar xf spark-3.3.1-bin-hadoop3.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop3"

import findspark
findspark.init()


from pyspark.sql import SparkSession

spark = SparkSession.builder\
        .master("local[2]")\
        .appName("my_first_pyspark")\
        .config('spark.ui.port','4050')\
        .getOrCreate()

sc = spark.sparkContext

In [26]:
from pyspark.sql.types import *
schema = StructType([
    StructField('text',StringType(),True),
    StructField('label',IntegerType(),True),
])

df= spark.read.format("csv").option("header","true").option("quote", "\"").option("escape", "\"").schema(schema).load('/content/drive/MyDrive/Colab Notebooks/데이터분석을위한프로그래밍/imdb-review-sentiment (1).csv')
df.show(5)
df.count()

+--------------------+-----+
|                text|label|
+--------------------+-----+
|I grew up (b. 196...|    0|
|When I put this m...|    0|
|Why do people who...|    0|
|Even though I hav...|    0|
|Im a die hard Dad...|    1|
+--------------------+-----+
only showing top 5 rows



40000

In [27]:
df = df.limit(500)
df.count()

500

In [28]:
df.printSchema()

root
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



In [29]:
df.drop_duplicates()

DataFrame[text: string, label: int]

In [30]:
df =df.where(df.label.isNotNull())
df.show()
df.count()

+--------------------+-----+
|                text|label|
+--------------------+-----+
|I grew up (b. 196...|    0|
|When I put this m...|    0|
|Why do people who...|    0|
|Even though I hav...|    0|
|Im a die hard Dad...|    1|
|A terrible movie ...|    0|
|Finally watched t...|    1|
|I caught this fil...|    0|
|It may be the rem...|    1|
|My Super Ex Girlf...|    1|
|I can't believe p...|    1|
|If you haven't se...|    0|
|I have always bee...|    1|
|Greg Davis and Br...|    0|
|A half-hearted at...|    0|
|If you want a fun...|    1|
|I really wanted t...|    1|
|The main problem ...|    0|
|The folks at Disn...|    0|
|A friend told me ...|    0|
+--------------------+-----+
only showing top 20 rows



500

- TrainValidationSplit를 이용하여 영화리뷰 긍/부정 예측 Estimator 

- pipeline을 테스트 (trainRatio=0.8)
- ParamGridBuilder를 사용하여 Word2Vec의 파라미터 vectorSize를 5, 10, 20, 40으로 바꾸며 정확도를 측정하여 출력
- 정확도는 BinaryClassificationEvaluator를 사용할 것
- 매뉴얼 및 웹검색을 통해 문제해결!

In [31]:
from pyspark.ml import *
from pyspark import keyword_only
from pyspark.ml.param.shared import *
from pyspark.sql.types import *
from pyspark.sql.functions import udf
from pyspark.sql.functions import *
class RemoveStopWordsAndSpecialCharacters(Transformer, HasInputCol, HasOutputCol):
  @keyword_only
  def __init__(self, inputCol = None, outputCol =None, stopwords =None):
    super(RemoveStopWordsAndSpecialCharacters, self).__init__()
    self.stopwords = Param(self, "stopwords", "")
    self._setDefault(stopwords=set())
    kwargs = self._input_kwargs
    self._set(**kwargs)
  
  def setStopwords(self, value):
    self._paramMap[self.stopwords] = value
    return self
  
  def getStopwords(self):
    return self.getOrDefault(self.stopwords)
  def _transform(self, dataset):
    stopwords = self.getStopwords()
    def f(s):
      return [ ''.join(e for e in token if e.isalnum()) for token in s if token not in stopwords ]
    t = ArrayType(StringType())
    out_col = self.getOutputCol()
    in_col = dataset[self.getInputCol()]
    return dataset.withColumn(out_col, udf(f, t)(in_col))

In [32]:

train,test= df.randomSplit([0.8,0.2],seed= 37)


In [33]:
train.count()

393

In [34]:
test.count()

107

In [35]:
train.show(5)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|"A Guy Thing" may...|    1|
|"A trio of treasu...|    0|
|"Elvira, Mistress...|    1|
|"Embarassing" is ...|    0|
|"Go Fish" garnere...|    0|
+--------------------+-----+
only showing top 5 rows



In [36]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.classification import LinearSVC
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
tokenizer = Tokenizer(inputCol = "text",outputCol = "words")
cleaning = RemoveStopWordsAndSpecialCharacters(inputCol = "words", outputCol = "clean_words", stopwords = stopwords)
#hashingTF = HashingTF(inputCol = "clean_words",outputCol = "tf")
w2v = Word2Vec(vectorSize = 1, inputCol ="clean_words", outputCol ="w2v", minCount =1, maxIter=1)
#asm = VectorAssembler(inputCols = [hashingTF.getOutputCol(),w2v.getOutputCol()], outputCol = "features") 
asm = VectorAssembler(inputCols = [w2v.getOutputCol()], outputCol = "features") 

svm = LinearSVC(labelCol = "label")

df = tokenizer.transform(df)
df.show(5)

+--------------------+-----+--------------------+
|                text|label|               words|
+--------------------+-----+--------------------+
|I grew up (b. 196...|    0|[i, grew, up, (b....|
|When I put this m...|    0|[when, i, put, th...|
|Why do people who...|    0|[why, do, people,...|
|Even though I hav...|    0|[even, though, i,...|
|Im a die hard Dad...|    1|[im, a, die, hard...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [37]:
df = cleaning.transform(df)
df.show(5)

+--------------------+-----+--------------------+--------------------+
|                text|label|               words|         clean_words|
+--------------------+-----+--------------------+--------------------+
|I grew up (b. 196...|    0|[i, grew, up, (b....|[grew, up, b, 196...|
|When I put this m...|    0|[when, i, put, th...|[when, put, this,...|
|Why do people who...|    0|[why, do, people,...|[why, do, people,...|
|Even though I hav...|    0|[even, though, i,...|[even, though, ha...|
|Im a die hard Dad...|    1|[im, a, die, hard...|[im, a, die, hard...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [38]:
model = w2v.fit(df)
df = model.transform(df)
df.show(5)

+--------------------+-----+--------------------+--------------------+--------------------+
|                text|label|               words|         clean_words|                 w2v|
+--------------------+-----+--------------------+--------------------+--------------------+
|I grew up (b. 196...|    0|[i, grew, up, (b....|[grew, up, b, 196...|[0.21288166276806...|
|When I put this m...|    0|[when, i, put, th...|[when, put, this,...|[0.31739286768647...|
|Why do people who...|    0|[why, do, people,...|[why, do, people,...|[0.28979744896779...|
|Even though I hav...|    0|[even, though, i,...|[even, though, ha...|[0.3246716960046727]|
|Im a die hard Dad...|    1|[im, a, die, hard...|[im, a, die, hard...|[0.23359658324957...|
+--------------------+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [40]:
w2v.getOutputCol()

'w2v'

In [41]:
asm = VectorAssembler(inputCols = [w2v.getOutputCol()], outputCol = "features") 
df = asm.transform(df)


DataFrame[text: string, label: int, words: array<string>, clean_words: array<string>, w2v: vector, features: vector]

In [42]:
df.show(5)

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|                text|label|               words|         clean_words|                 w2v|            features|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|I grew up (b. 196...|    0|[i, grew, up, (b....|[grew, up, b, 196...|[0.21288166276806...|[0.21288166276806...|
|When I put this m...|    0|[when, i, put, th...|[when, put, this,...|[0.31739286768647...|[0.31739286768647...|
|Why do people who...|    0|[why, do, people,...|[why, do, people,...|[0.28979744896779...|[0.28979744896779...|
|Even though I hav...|    0|[even, though, i,...|[even, though, ha...|[0.3246716960046727]|[0.3246716960046727]|
|Im a die hard Dad...|    1|[im, a, die, hard...|[im, a, die, hard...|[0.23359658324957...|[0.23359658324957...|
+--------------------+-----+--------------------+--------------------+--------------------+-----

In [None]:
ㄴㄴㄴ
from pyspark.ml import Pipeline
from pyspark.ml.feature import *
from pyspark.ml.classification import LinearSVC
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
tokenizer = Tokenizer(inputCol = "text",outputCol = "words")
cleaning = RemoveStopWordsAndSpecialCharacters(inputCol = "words", outputCol = "clean_words", stopwords = stopwords)
#hashingTF = HashingTF(inputCol = "clean_words",outputCol = "tf")
w2v = Word2Vec(vectorSize = 1, inputCol ="clean_words", outputCol ="w2v", minCount =1, maxIter=1)
#asm = VectorAssembler(inputCols = [hashingTF.getOutputCol(),w2v.getOutputCol()], outputCol = "features") 
asm = VectorAssembler(inputCols = [w2v.getOutputCol()], outputCol = "features") 

svm = LinearSVC(labelCol = "label")

#pipeline = Pipeline(stages = [tokenizer, cleaning, hashingTF, w2v , asm, svm])
pipeline = Pipeline(stages = [tokenizer, cleaning, w2v , asm, svm])

model = pipeline.fit(train)

predict_train = model.transform(train)
predict_test = model.transform(test)
predict_test.show()


accuracy_df= predict_test.select("label","prediction")
accuracy_df =accuracy_df.withColumn("accuracy",expr("label =prediction"))
true_count = accuracy_df.where("accuracy=='true'").count()
print("정확도는 ", 100*true_count/accuracy_df.count(),"% 입니다")


In [None]:
bdml = [10, 20, 40]
ACC = []
for sssss in bdml:
  stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
  tokenizer = Tokenizer(inputCol = "text",outputCol = "words")
  cleaning = RemoveStopWordsAndSpecialCharacters(inputCol = "words", outputCol = "clean_words", stopwords = stopwords)
  #hashingTF = HashingTF(inputCol = "clean_words",outputCol = "tf")
  w2v = Word2Vec(vectorSize = sssss, inputCol ="clean_words", outputCol ="w2v", minCount =1, maxIter=3)
  #asm = VectorAssembler(inputCols = [hashingTF.getOutputCol(),w2v.getOutputCol()], outputCol = "features") 
  asm = VectorAssembler(inputCols = [w2v.getOutputCol()], outputCol = "features") 

  svm = LinearSVC(labelCol = "label")

  #pipeline = Pipeline(stages = [tokenizer, cleaning, hashingTF, w2v , asm, svm])
  pipeline = Pipeline(stages = [tokenizer, cleaning, w2v , asm, svm])

  model = pipeline.fit(train)

  predict_train = model.transform(train)
  predict_test = model.transform(test)
  predict_test.show()


  accuracy_df= predict_test.select("label","prediction")
  accuracy_df =accuracy_df.withColumn("accuracy",expr("label =prediction"))
  true_count = accuracy_df.where("accuracy=='true'").count()
  zzzzz = 100*true_count/accuracy_df.count()
  ACC.append(zzzzz)

print(ACC)


In [None]:
import matplotlib.pyplot as plt
 
X= [1, 10,  20,  40]
ACC.insert(0,59.87162461266047)
plt.plot(X, ACC, color='red', marker='o', alpha=0.5, linewidth=2)
 
plt.xlabel("vector_size")
plt.ylabel("accuracy")
plt.show()