In [20]:
import pyspark
from pyspark.sql import SparkSession
# Create the notebook's Spark session, or reuse one that is already running
# (getOrCreate returns the existing session if there is one).
spark = (
    SparkSession.builder
    .appName("affair_app")
    .getOrCreate()
)

# Bare trailing expression so the notebook renders the session summary.
spark

In [21]:
# Load the CSV: the first row supplies the column names and Spark infers the
# column types from the data instead of reading everything as strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("affairs.csv")
)

# Preview the first 5 rows without truncating cell contents.
df.show(n=5, truncate=False)

+-------------+----+-----------+--------+---------+-------+
|rate_marriage|age |yrs_married|children|religious|affairs|
+-------------+----+-----------+--------+---------+-------+
|5            |32.0|6.0        |1.0     |3        |0      |
|4            |22.0|2.5        |0.0     |2        |0      |
|3            |32.0|9.0        |3.0     |3        |1      |
|3            |27.0|13.0       |3.0     |1        |1      |
|4            |22.0|2.5        |0.0     |1        |1      |
+-------------+----+-----------+--------+---------+-------+
only showing top 5 rows



In [22]:
from pyspark.sql.functions import col

# Derived column: age minus years married (presumably the respondent's age at
# the time of marriage -- confirm against the dataset description).
# withColumn replaces a same-named column, so re-running this cell is harmless.
df = df.withColumn(
    colName="yrs_before_marriage",
    col=col("age") - col("yrs_married"),
)

# Default show(): prints up to 20 rows.
df.show()

+-------------+----+-----------+--------+---------+-------+-------------------+
|rate_marriage| age|yrs_married|children|religious|affairs|yrs_before_marriage|
+-------------+----+-----------+--------+---------+-------+-------------------+
|            5|32.0|        6.0|     1.0|        3|      0|               26.0|
|            4|22.0|        2.5|     0.0|        2|      0|               19.5|
|            3|32.0|        9.0|     3.0|        3|      1|               23.0|
|            3|27.0|       13.0|     3.0|        1|      1|               14.0|
|            4|22.0|        2.5|     0.0|        1|      1|               19.5|
|            4|37.0|       16.5|     4.0|        3|      1|               20.5|
|            5|27.0|        9.0|     1.0|        1|      1|               18.0|
|            4|27.0|        9.0|     0.0|        2|      1|               18.0|
|            5|37.0|       23.0|     5.5|        2|      1|               14.0|
|            5|37.0|       23.0|     5.5

In [23]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

# Every column except the label ("affairs"). Later cells only read
# `features.columns` from this frame, to get the list of predictor names.
features = df.select([name for name in df.columns if name != "affairs"])

In [24]:
# Pack all predictor columns into a single vector column named "features".
# NOTE(review): yrs_before_marriage is computed as age - yrs_married, so it is
# a linear combination of two other inputs -- confirm that keeping all three
# is intended.
vector_assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol="features",
)

# transform() keeps the original columns and appends the assembled vector.
features_df = vector_assembler.transform(df)
features_df.show(n=5, truncate=False)

+-------------+----+-----------+--------+---------+-------+-------------------+----------------------------+
|rate_marriage|age |yrs_married|children|religious|affairs|yrs_before_marriage|features                    |
+-------------+----+-----------+--------+---------+-------+-------------------+----------------------------+
|5            |32.0|6.0        |1.0     |3        |0      |26.0               |[5.0,32.0,6.0,1.0,3.0,26.0] |
|4            |22.0|2.5        |0.0     |2        |0      |19.5               |[4.0,22.0,2.5,0.0,2.0,19.5] |
|3            |32.0|9.0        |3.0     |3        |1      |23.0               |[3.0,32.0,9.0,3.0,3.0,23.0] |
|3            |27.0|13.0       |3.0     |1        |1      |14.0               |[3.0,27.0,13.0,3.0,1.0,14.0]|
|4            |22.0|2.5        |0.0     |1        |1      |19.5               |[4.0,22.0,2.5,0.0,1.0,19.5] |
+-------------+----+-----------+--------+---------+-------+-------------------+----------------------------+
only showing top 5 

In [27]:
# Correlation between two columns. Despite the variable name, the result is a
# single float, not a matrix.
correlation_matrix = df.corr("rate_marriage", "children")

correlation_matrix

-0.12916120274747694

In [29]:
# Full pairwise correlation table. toPandas() collects the whole Spark frame
# onto the driver first, then pandas computes the Pearson correlations.
correlation_table = (
    df.toPandas()
    .corr(method="pearson")
)
correlation_table

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,affairs,yrs_before_marriage
rate_marriage,1.0,-0.111127,-0.128978,-0.129161,0.078794,-0.331776,0.054294
age,-0.111127,1.0,0.894082,0.673902,0.136598,0.146519,0.103363
yrs_married,-0.128978,0.894082,1.0,0.772806,0.132683,0.203109,-0.35309
children,-0.129161,0.673902,0.772806,1.0,0.141845,0.159833,-0.308483
religious,0.078794,0.136598,0.132683,0.141845,1.0,-0.129299,-0.009315
affairs,-0.331776,0.146519,0.203109,0.159833,-0.129299,1.0,-0.144986
yrs_before_marriage,0.054294,0.103363,-0.35309,-0.308483,-0.009315,-0.144986,1.0


In [15]:
# Print the column names, types, and nullability of the assembled frame.
features_df.printSchema()

root
 |-- rate_marriage: integer (nullable = true)
 |-- age: double (nullable = true)
 |-- yrs_married: double (nullable = true)
 |-- children: double (nullable = true)
 |-- religious: integer (nullable = true)
 |-- affairs: integer (nullable = true)
 |-- yrs_before_marriage: double (nullable = true)
 |-- features: vector (nullable = true)



In [16]:
# Keep only what the model needs: the assembled feature vector and the label.
model_df = features_df.select(["features", "affairs"])

model_df.show(n=5, truncate=False)

+----------------------------+-------+
|features                    |affairs|
+----------------------------+-------+
|[5.0,32.0,6.0,1.0,3.0,26.0] |0      |
|[4.0,22.0,2.5,0.0,2.0,19.5] |0      |
|[3.0,32.0,9.0,3.0,3.0,23.0] |1      |
|[3.0,27.0,13.0,3.0,1.0,14.0]|1      |
|[4.0,22.0,2.5,0.0,1.0,19.5] |1      |
+----------------------------+-------+
only showing top 5 rows



In [17]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Re-assemble the feature vector so this cell does not depend on the earlier
# assembler cell having been run in the current kernel.
vector_assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol="features",
)
features_df = vector_assembler.transform(df)
features_df.show(n=5, truncate=False)

# Split the data into training and test sets (80/20, fixed seed).
# Despite the X_ prefix, both frames still carry the "affairs" label column.
X_train, X_test = features_df.randomSplit(weights=[0.8, 0.2], seed=42)

# Build the logistic regression estimator.
log_reg = LogisticRegression(
    featuresCol="features",
    labelCol="affairs",
)

# Fit the model on the training split.
log_model = log_reg.fit(X_train)

# Score the held-out test split.
predictions = log_model.transform(X_test)

# Evaluate the predictions; areaUnderROC is the evaluator's default metric,
# spelled out here so the printed "AUC" is unambiguous.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="affairs",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)

# Report the AUC value.
print("AUC:", auc)

# Shut down the Spark session; Spark-backed objects are unusable afterwards.
spark.stop()

+-------------+----+-----------+--------+---------+-------+-------------------+----------------------------+
|rate_marriage|age |yrs_married|children|religious|affairs|yrs_before_marriage|features                    |
+-------------+----+-----------+--------+---------+-------+-------------------+----------------------------+
|5            |32.0|6.0        |1.0     |3        |0      |26.0               |[5.0,32.0,6.0,1.0,3.0,26.0] |
|4            |22.0|2.5        |0.0     |2        |0      |19.5               |[4.0,22.0,2.5,0.0,2.0,19.5] |
|3            |32.0|9.0        |3.0     |3        |1      |23.0               |[3.0,32.0,9.0,3.0,3.0,23.0] |
|3            |27.0|13.0       |3.0     |1        |1      |14.0               |[3.0,27.0,13.0,3.0,1.0,14.0]|
|4            |22.0|2.5        |0.0     |1        |1      |19.5               |[4.0,22.0,2.5,0.0,1.0,19.5] |
+-------------+----+-----------+--------+---------+-------+-------------------+----------------------------+
only showing top 5 