* [Index]

* [Simple Regression](#1)
* [Multi-Regression Model](#2)
    * [Multi-Reg Example](#3)

# Simple Regression

In [1]:
import findspark
findspark.init("/Users/resitkadir/spark/spark-3.0.0/")
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("LinearRegression") \
        .master("local[4]") \
        .config("spark.driver.memory","2g") \
        .config("spark.executor.memory","4g") \
        .getOrCreate()



### Verisetini okuma

    // veriyi okuyarak dataframe oluşturma
    // Veri hakkında kısa bilgi: Bir ürünün satış miktarında kullanılan reklam bütçesine ait 200 adet veri
    // Veri kaynağı: https://www.kaggle.com/ishaanv/ISLR-Auto#Advertising.csv

In [2]:
df = spark.read.format("csv") \
    .option("header",True) \
    .option("sep",",") \
    .option("inferSchema",True) \
    .load("datasets/Advertising.csv")


df.toPandas().head()

Unnamed: 0,_c0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [3]:
## TV, Radio, Newspaper sütunlarını toplayıp Advertisement adında yeni nitelik oluşturup bunları düşürelim
## since we will do basic linear regression, we will focusin on only a columnn

df2 = df.withColumn("Advertisement", (df.TV + df.Radio + df.Newspaper)) \
        .withColumnRenamed("Sales","label") \
        .drop("TV","Radio","Newspaper")
df2.toPandas().head()

Unnamed: 0,_c0,label,Advertisement
0,1,22.1,337.1
1,2,10.4,128.9
2,3,9.3,132.4
3,4,18.5,251.3
4,5,12.9,250.0


In [4]:
#Data preprocessing
df2.describe("Advertisement","label").toPandas().head()

Unnamed: 0,summary,Advertisement,label
0,count,200.0,200.0
1,mean,200.86049999999992,14.022500000000004
2,stddev,92.9851805869837,5.217456565710477
3,min,11.7,1.6
4,max,433.6,27.0


*Since we dont have any kind of **categoreical feature** in our dataset
we dont need to use **stringindexer or one hot encoder**
we will just user* **VectorAssembler**

In [5]:
#no missin values at above

#VECTOR ASSEMBLER

from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler() \
                    .setInputCols(["Advertisement"]) \
                    .setOutputCol("features")

In [6]:
#RegressionModel
#we dont need stadart scaler since our data is stable
from pyspark.ml.regression import LinearRegression
linear_reg_obj = LinearRegression() \
                .setLabelCol("label") \
                .setFeaturesCol("features")

**PIPELINE**

In [7]:
#Lets create pipeline object(nesnesi)
from pyspark.ml import Pipeline
pipeline_obj = Pipeline() \
                .setStages([vectorAssembler, linear_reg_obj])

#test train split
train_df, test_df = df2.randomSplit([0.8, 0.2], seed=142)
#train Model
pipeline_model = pipeline_obj.fit(train_df)
#Transform test data
result_df = pipeline_model.transform(test_df)
result_df.toPandas().head()
#label is actual and prediction is our model result

Unnamed: 0,_c0,label,Advertisement,features,prediction
0,4,18.5,251.3,[251.3],16.55387
1,9,4.8,11.7,[11.7],4.929785
2,11,8.6,96.1,[96.1],9.024412
3,15,19.0,283.0,[283.0],18.091781
4,25,9.7,93.2,[93.19999999999999],8.88372


#### LinearRegresyon Modelini Pipeline Stages arasından alma

In [8]:
pipeline_model.stages
#0 is Vector,1 is Linear Regression model

[VectorAssembler_ef7282af8b80,
 LinearRegressionModel: uid=LinearRegression_09b8523148e9, numFeatures=1]

In [9]:
lrModel = pipeline_model.stages[1]

In [10]:
lrModel.coefficients

DenseVector([0.0485])

In [11]:
lrModel.summary.r2

0.7541832816333721

In [12]:
lrModel.intercept #our constant value

4.362164413237526

In [13]:
#bu model anlamlimi
lrModel.summary.pValues

[0.0, 6.661338147750939e-16]

In [14]:
lrModel.summary.tValues

[22.086724008307982, 8.977906065711718]

In [15]:
#root mean square Error
lrModel.summary.rootMeanSquaredError

2.540740128120867

In [16]:
## Regression Model = intercept + coeffe * Advertisement
# y = 4.537119328969264 + 0.0472 * Advertisement
#toplam satista Adverstimenet coeff kadar etkisi oluyor

In [17]:
#Reklam butcesine gore ayarlama
#100K budged
#Lets create data fram with RDD

df_precit_rdd = spark.sparkContext.parallelize([100.0])
df_predict = df_precit_rdd.map(lambda x :(x,)).toDF(["Advertisement"])
df_predict.show()

+-------------+
|Advertisement|
+-------------+
|        100.0|
+-------------+



In [18]:
#100 k yi 9.2 K diye tahmin etti
4.537119328969264 + 0.0472 * 100
#tutarli mi ya bakalim,sayilar cok yakin tutarli

9.257119328969264

In [19]:
df_pred_vec = vectorAssembler.transform(df_predict)
lrModel.transform(df_pred_vec).toPandas().head()

Unnamed: 0,Advertisement,features,prediction
0,100.0,[100.0],9.213619


# Multi Regressions

*hatalarin dogruya degil duzleme olan uzakliklarndan bahsediliyor*

**Varsayimlar**

    1-Esvaryanslik (homoscedasticity)

    2- Cok degiskenli normallik (multivariate normality)
    
    3-Hatalarin bagimsizli (independence of errors)
    
    4-Coklu baglanti yoklugu (multicollineraity)
    
**Model Creation(Model kurma)**

     1-Hepsini birden dahil et(ALL_IN):Guclu bir kuram
     
     2-Geriye dogru eleme (Backward Elimination)
      #hepsini dahil et ve anlamligina goree zayiflari cikar 
     
     3-ileri dogru Secme (Forward Selection)
         #tek tek ekleyerek git
     
     4-iki yonlu eleyerek ecme (Biderectional elimination)

     5-Sonuclari karsilastirma (Score Comparasion)
     

In [20]:
#Read the dataset
df = spark.read.format("csv") \
     .option("header",True) \
     .option("sep",",") \
     .option("inferSchema",True) \
     .load("datasets/Advertising.csv")

df.toPandas().head(2)

Unnamed: 0,_c0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4


In [21]:
#sales label ,c0 id
# Sütun İsimlerini Değiştirme
change = ["id","TV","Radio","Newspaper","label"]
df2 = df.selectExpr("_c0 as id","TV","Radio","Newspaper","Sales as label")
df2.toPandas().head(2)

Unnamed: 0,id,TV,Radio,Newspaper,label
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4


In [22]:
numeric_fea = ["TV","Radio",
               #,"Newspaper"#p values 0.5 buyuk oldugu icin cikardk
              ]
label = ["label"]

In [23]:
df2.describe().toPandas().head()

Unnamed: 0,summary,id,TV,Radio,Newspaper,label
0,count,200.0,200.0,200.0,200.0,200.0
1,mean,100.5,147.0425,23.264000000000024,30.553999999999995,14.022500000000004
2,stddev,57.87918451395112,85.85423631490805,14.846809176168728,21.77862083852283,5.217456565710477
3,min,1.0,0.7,0.0,0.3,1.6
4,max,200.0,296.4,49.6,114.0,27.0


In [24]:
#preparation data
from pyspark.ml.feature import VectorAssembler
vector_assembler =VectorAssembler().setInputCols(numeric_fea).setOutputCol("features")

#Regression Model
from pyspark.ml.regression import LinearRegression
lr_obj = LinearRegression().setFeaturesCol("features").setLabelCol("label")

#pipe-line
from pyspark.ml import Pipeline
pipeline_obj = Pipeline().setStages([vector_assembler,lr_obj])
#Split test train
train_df,test_df = df2.randomSplit([0.8,0.2],seed=142)
#Model train
pipeline_model=pipeline_obj.fit(train_df)
#Model testing
result_df=pipeline_model.transform(test_df)
result_df.toPandas().head(3)



Unnamed: 0,id,TV,Radio,Newspaper,label,features,prediction
0,4,151.5,41.3,58.5,18.5,"[151.5, 41.3]",17.650134
1,9,8.6,2.1,1.0,4.8,"[8.6, 2.1]",3.716108
2,11,66.1,5.8,24.2,8.6,"[66.1, 5.8]",7.043478


### Lineer Modeli Pipeline Model İçinden Almak

In [25]:
#label actual,
#pipeline model icinden linear modeli almak
pipeline_model.stages
lr_model=pipeline_model.stages[1]

lr_model.coefficients

DenseVector([0.0457, 0.1888])

In [26]:
lr_model.intercept #sabit sayimiz

2.926454603092079

In [27]:
lr_model.summary.r2
#Toplam degiskenin %89 bu modelle aciklanabilinir demek

0.9063946363969204

In [28]:
lr_model.summary.rootMeanSquaredError

1.5678514278266518

In [29]:
lr_model.summary.pValues
#p degerleine gore model olusturmamiz lazim

[0.0, 0.0, 0.0]

In [30]:
lr_model.summary.tValues

[31.28678417062139, 22.454684697674782, 9.309569401834192]

In [31]:
#backward ile yapalim
#P degerleinde esik degerin ustunde kalanlari cikartarak yapicaz
#newsapepr ustunde onu cikarmaliyiz 0.5717102604020492 ve ustte cikardik


### Model Seçimi
    
    Geriye doğru eleme yönteminde Newspaper p değeri 0.05 değerinden büyük olduğu (0.5717) için modelden çıkarılır. Bunun için nümerik nitelikler listesinden Newspaper'ı çıkarmamız yeterli. Daha sonra o paragraftan itibaren tekrar çalıştıralım.
    
        İkinci denemede tüm p değerleri eşik değer altında kaldığı için regresyon modeli: 
    y = 2.935593134859488 + (0.0442 * TV) + (0.1978 * Radio)
    
### Prediction

In [32]:
#100k for TV and 10k for Radio
#lets create dataframe
import pandas as pd 
d = {"TV":[100.0],"Radio":[10.0]}
df_df = pd.DataFrame(data=d)
df_df.head(2)

Unnamed: 0,TV,Radio
0,100.0,10.0


In [33]:
#spark data frame cevirelim
predict_df = spark.createDataFrame(df_df)
predict_df.show()

+-----+-----+
|   TV|Radio|
+-----+-----+
|100.0| 10.0|
+-----+-----+



In [34]:
predict_df_vec=vector_assembler.transform(predict_df)
lr_model.transform(predict_df_vec).show()
#satis mikatrin 9.3k olucakdur

+-----+-----+------------+-----------------+
|   TV|Radio|    features|       prediction|
+-----+-----+------------+-----------------+
|100.0| 10.0|[100.0,10.0]|9.386286831310258|
+-----+-----+------------+-----------------+



In [35]:
spark.stop()

## Multi-Regression Example

In [36]:
import findspark
findspark.init("/Users/resitkadir/spark/spark-3.0.0/")
from pyspark.sql import SparkSession


spark = SparkSession.builder \
        .appName("CokluLineerRegresyonOdevCevabi") \
        .master("local[4]") \
        .config("spark.driver.memory","2g") \
        .config("spark.executor.memory","4g") \
        .getOrCreate()

#Read The data
df = spark.read.format("csv") \
        .option("header", True) \
        .option("inferSchema", True) \
        .option("sep",",") \
        .load("datasets/Life_Expectancy_Data.csv")

df.toPandas().head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [37]:
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Life expectancy : double (nullable = true)
 |-- Adult Mortality: integer (nullable = true)
 |-- infant deaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- percentage expenditure: double (nullable = true)
 |-- Hepatitis B: integer (nullable = true)
 |-- Measles : integer (nullable = true)
 |--  BMI : double (nullable = true)
 |-- under-five deaths : integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- Total expenditure: double (nullable = true)
 |-- Diphtheria : integer (nullable = true)
 |--  HIV/AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |--  thinness  1-19 years: double (nullable = true)
 |--  thinness 5-9 years: double (nullable = true)
 |-- Income composition of resources: double (nullable = true)
 |-- Schooling: double (nullable = true)



In [38]:
# Nitelik isimlerini değiştir. Sıralamayı bozmadan.
new_cols = ["Country", "Year", "Status", "label", "AdultMortality",
      "InfantDeaths", "Alcohol", "PercentageExpenditure", "HepatitisB", "Measles", "BMI", "UnderFiveDeaths",
      "Polio", "TotalExpenditure", "Diphtheria", "HIV_AIDS", "GDP", "Population", "Thinness119", "Thinness59",
      "IncomeCompositionOfResources", "Schooling"]


df2 = df.toDF(*new_cols)
df2.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- label: double (nullable = true)
 |-- AdultMortality: integer (nullable = true)
 |-- InfantDeaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- PercentageExpenditure: double (nullable = true)
 |-- HepatitisB: integer (nullable = true)
 |-- Measles: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- UnderFiveDeaths: integer (nullable = true)
 |-- Polio: integer (nullable = true)
 |-- TotalExpenditure: double (nullable = true)
 |-- Diphtheria: integer (nullable = true)
 |-- HIV_AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |-- Thinness119: double (nullable = true)
 |-- Thinness59: double (nullable = true)
 |-- IncomeCompositionOfResources: double (nullable = true)
 |-- Schooling: double (nullable = true)



In [39]:
df2.toPandas().head()

Unnamed: 0,Country,Year,Status,label,AdultMortality,InfantDeaths,Alcohol,PercentageExpenditure,HepatitisB,Measles,...,Polio,TotalExpenditure,Diphtheria,HIV_AIDS,GDP,Population,Thinness119,Thinness59,IncomeCompositionOfResources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [40]:
# Türlerine Göre Nitelikleri Ayırma
categorical_cols = ["Country","Status"]
'''
numerical_cols = ["Year", "AdultMortality",
      "InfantDeaths", "Alcohol", "PercentageExpenditure", "HepatitisB", "Measles", "BMI", "UnderFiveDeaths",
      "Polio", "TotalExpenditure", "Diphtheria", "HIV_AIDS", "GDP", "Population", "Thinness119", "Thinness59",
      "IncomeCompositionOfResources", "Schooling"]
'''
numerical_cols = ["Year", "AdultMortality",
      "InfantDeaths", "Alcohol",   "BMI", "UnderFiveDeaths",
      "TotalExpenditure", "Diphtheria", "HIV_AIDS", "GDP",  
      "IncomeCompositionOfResources", "Schooling"]
label = ["label"]
#Data Cleaning

df3 = df2.na.drop()
df3.count()

1649

In [41]:
#DAta preprocessing
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.regression import LinearRegression, LinearRegressionModel


#StringIndexer
# Country'de kategori sayısı çok fazla olduğundan analize dahil etmedik
status_string_indexer = StringIndexer().setInputCol("Status").setOutputCol("StatusIndexed")

#OneHotEncoder
encoder = OneHotEncoder().setInputCols(["StatusIndexed"]).setOutputCols(["StatusEncoded"])

#VectorAssembler
vector_assembler = VectorAssembler().setInputCols(numerical_cols + encoder.getOutputCols()).setOutputCol("features")

#Lineer Model
linear_regression_object = LinearRegression().setFeaturesCol("features").setLabelCol("label")


#pipeline

pipeline_object = Pipeline().setStages([status_string_indexer, encoder, vector_assembler, linear_regression_object])

#Veri setini train-test ayırma
train_df, test_df = df3.randomSplit([0.8, 0.2], seed=142)
train_df.cache()
test_df.cache()

## Modeli Eğitme
pipeline_model = pipeline_object.fit(train_df)
pipeline_model.transform(train_df).select("label","prediction").toPandas().head(10)


Unnamed: 0,label,prediction
0,54.8,57.477491
1,55.3,57.968683
2,56.2,63.517118
3,57.0,58.917665
4,57.3,60.973816
5,57.3,60.965253
6,57.5,61.271526
7,58.6,62.330411
8,59.2,62.914456
9,59.5,63.417605


### Lineer Modeli Pipeline Model İçinden Almak

In [42]:
pipeline_model.stages

[StringIndexerModel: uid=StringIndexer_43b4d3b86c93, handleInvalid=error,
 OneHotEncoderModel: uid=OneHotEncoder_f7a4b9b8e522, dropLast=true, handleInvalid=error, numInputCols=1, numOutputCols=1,
 VectorAssembler_370ab6fca250,
 LinearRegressionModel: uid=LinearRegression_5bb233257612, numFeatures=13]

In [43]:
lr_model = pipeline_model.stages[-1]

In [44]:
lr_model.coefficients

DenseVector([-0.1288, -0.017, 0.0837, -0.1189, 0.0322, -0.0638, 0.1301, 0.0189, -0.4489, 0.0001, 9.5061, 0.9633, -0.7417])

In [45]:
lr_model.intercept

311.38582512126834

In [46]:
lr_model.summary.r2

0.8376909414684949

In [47]:
lr_model.summary.rootMeanSquaredError

3.548258073692097

In [48]:
lr_model.summary.pValues

[4.906680344252123e-07,
 0.0,
 1.0839773523230178e-11,
 0.0013139767268131042,
 3.488250992500497e-07,
 4.583222690257571e-12,
 0.0032213559086500254,
 0.00010182824188298412,
 0.0,
 2.0167800762749266e-10,
 0.0,
 0.0,
 0.04916901684857211,
 1.388568815485769e-09]

In [49]:
lr_model.summary.tValues

[-5.055419563326975,
 -15.941254538270861,
 6.856859783076918,
 -3.2199032389729627,
 5.1214264046069236,
 -6.983291352567848,
 2.9512647331799053,
 3.89823326824372,
 -22.824045885241958,
 6.41098466173556,
 10.633613335912266,
 14.734853131467316,
 -1.9689627057884276,
 6.100865298620751]

In [50]:
#Niteliklerle pValues bir arada
zipped = zip(lr_model.summary.pValues,numerical_cols+["Status","Sabit"])
sorted_zip = sorted(zipped)

for i in sorted_zip:
    print(i)
#0.05 den buyuk deger kalmiyana kadar gidicez Pvalue

(0.0, 'AdultMortality')
(0.0, 'HIV_AIDS')
(0.0, 'IncomeCompositionOfResources')
(0.0, 'Schooling')
(4.583222690257571e-12, 'UnderFiveDeaths')
(1.0839773523230178e-11, 'InfantDeaths')
(2.0167800762749266e-10, 'GDP')
(1.388568815485769e-09, 'Sabit')
(3.488250992500497e-07, 'BMI')
(4.906680344252123e-07, 'Year')
(0.00010182824188298412, 'Diphtheria')
(0.0013139767268131042, 'Alcohol')
(0.0032213559086500254, 'TotalExpenditure')
(0.04916901684857211, 'Status')


    (0.0, 'AdultMortality')
    (0.0, 'HIV_AIDS')
    (0.0, 'IncomeCompositionOfResources')
    (0.0, 'Schooling')
    (1.021405182655144e-14, 'UnderFiveDeaths')
    (9.037215420448774e-14, 'InfantDeaths')
    (8.194222855806288e-09, 'Sabit')
    (2.214080842888322e-06, 'Year')
    (9.277282233099982e-06, 'BMI')
    (0.0069944469886213945, 'Alcohol')
    (0.03194558950457549, 'Diphtheria')
    (0.033259070668361534, 'Status')
    (0.041134342157394865, 'TotalExpenditure')
    6. Tur (0.14696628529633227, 'Polio')
    7. Tur (0.20657371462857865, 'Thinness59')
    (0.2415689423147689, 'GDP')
    5. Tur (0.3280541792373892, 'HepatitisB')
    4. Tur (0.4246739503761421, 'PercentageExpenditure')
    3. Tur (0.46316882090430034, 'Measles')
    2. Tur (0.6154505536890638, 'Thinness119')
    1. Tur (0.9774634776466289, 'Population')
    
    
    
            7. Turdan sonra tüm nitelikler 0.05 eşik değeri altında kaldığı için model tamamlanmıştır.
        (0.0, 'AdultMortality')
        (0.0, 'HIV_AIDS')
        (0.0, 'IncomeCompositionOfResources')
        (0.0, 'Schooling')
        (4.218847493575595e-15, 'UnderFiveDeaths')
        (1.554312234475219e-14, 'InfantDeaths')
        (7.28083149326153e-10, 'GDP')
        (9.313896320861659e-10, 'Sabit')
        (6.888301440355349e-08, 'BMI')
        (3.7210678383026163e-07, 'Year')
        (0.0037352078727632687, 'Diphtheria')
        (0.011812369488869878, 'Alcohol')
        (0.02289647691655916, 'TotalExpenditure')
        (0.036279656005715255, 'Status')

In [51]:
#spark.stop()

# Hyperparameters and optimizations

**PARAMETRE**

    Model parametresi,modele dahil olan ve degeri veriden tahmin edilebilecek bir konfirigasyon degiskenidir.
    
    1-Egitim esnasinda tahmin edilir ve ogrenilir.
    
    2-Tahmin yaparken model tarafindan istenir.
    
    3-Genellikle kullanici tarafindan elle ayarlanmaz.
    
    Ørnek olarak,yapay sinir aglarindaki agirliklar,Regresyondaki katsayilar,SVM deki destek vectorleri
 
**hiperparametre**

    Model hiperparametresi,modele harici. olan ve degeri veriden tahmin edilemeyen bir configurasyondur.
    
    Genelllikle model parametrelerinin tahmin edilmesini yardimci olan sureclerde kullanilir.Kullanici tarfindan belirlenir.
    
    Genellikle ongorulen modlleme problemi icin ayarlanmistir(tune).
    
    Ornekler:
        1-K-means teknigindeki k belirlenmesinde(kullanici belirler)
        2-Random forest icin agac sayisi
        3-Ridge ve Lasso duzenleyiciler icin Lambda degeri
        4-Deep Learningde katman sayisi ve ogrenme orani(Learning rate)
        
        
**Hiper parametreleri icin ideal degerler**

    Grid Search(olasi tum kombinasyonlar)
    
Elle bir model parametresi belirmeniz gerekiyorsa,o muhtemeken bir **hiper parametredir.**


**Linear Regresyon icin hiperparametreler**

**elasticNetParam:**

    Regulasyon ile ilgilidir.Lasso(L1) ve Ridge(L2) duzenleyicilerine ait LAmbda katsayisinin 0-1 ayarlamasidir.0.0 tamamen L2,1.0 ise L1 cezasidir.Varsayilan 0.0
    
**loss** 
       
       : *Modeli optimize edecek hata fonksiyonudur.Varsayilan fonksiyon "saqureError" digeri ise "huber"dir
            Varsayilan :square Error' dur.*
            
**MaxIter** :

        MAximum iterasyon sayisi,Varsayilan :100
        
**Cross Validaion** *Ornegin bese ayir 4 u egitim biri test olarak kullan, ve her iterasyonda farkli yeri test diye kullan*

**Train -Validation Split**

In [52]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


#paramgrid
#tum olasilik combinasyonlarini tutugumuz yer
param_grid = ParamGridBuilder() \
            .addGrid(lr_obj.aggregationDepth, [2,5]) \
            .addGrid(lr_obj.elasticNetParam, [0.0, 0.2, 0.7]) \
            .addGrid(lr_obj.epsilon, [1.35, 1.55]) \
            .addGrid(lr_obj.maxIter, [10, 20]) \
            .addGrid(lr_obj.regParam, [0.00, 0.01, 0.05]) \
            .addGrid(lr_obj.solver, ["auto", "normal", "l-bfgs"]) \
            .addGrid(lr_obj.tol, [1.0E-6, 1.0E-4]) \
            .build()


#Cross Validation

cv = CrossValidator() \
    .setEstimator(pipeline_object) \
    .setEvaluator(RegressionEvaluator()) \
    .setEstimatorParamMaps(param_grid) \
    .setNumFolds(5) \
    .setParallelism(2)


df_train, df_test = df3.randomSplit([0.9, 0.2], seed=142)
df_train.cache()
df_test.cache()

DataFrame[Country: string, Year: int, Status: string, label: double, AdultMortality: int, InfantDeaths: int, Alcohol: double, PercentageExpenditure: double, HepatitisB: int, Measles: int, BMI: double, UnderFiveDeaths: int, Polio: int, TotalExpenditure: double, Diphtheria: int, HIV_AIDS: double, GDP: double, Population: double, Thinness119: double, Thinness59: double, IncomeCompositionOfResources: double, Schooling: double]

In [53]:
cv_model = cv.fit(df_train)
cv_model.transform(df3).select("label","prediction").toPandas().head()

Unnamed: 0,label,prediction
0,65.0,63.565448
1,59.9,63.345039
2,59.9,63.369293
3,59.5,63.363253
4,59.2,62.868723


### THE BEST MODEL DECISION

In [55]:
# En iyi pipeline model
best_model = cv_model.bestModel

In [56]:
type(best_model)#pipeline cinsi

pyspark.ml.pipeline.PipelineModel

In [57]:
best_model

PipelineModel_5e6409bed9b2

In [58]:
lr_model.coefficients

DenseVector([-0.1288, -0.017, 0.0837, -0.1189, 0.0322, -0.0638, 0.1301, 0.0189, -0.4489, 0.0001, 9.5061, 0.9633, -0.7417])

In [59]:
lr_model.intercept

311.38582512126834

In [60]:
lr_model.explainParams()

'aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\nelasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)\nepsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)\nfeaturesCol: features column name. (default: features, current: features)\nfitIntercept: whether to fit an intercept term. (default: True)\nlabelCol: label column name. (default: label, current: label)\nloss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)\nmaxIter: max number of iterations (>= 0). (default: 100)\npredictionCol: prediction column name. (default: prediction)\nregParam: regularization parameter (>= 0). (default: 0.0)\nsolver: The solver algorithm for optimization. Supported options: auto, normal, l-bfgs. (default: auto)\nstandardization: whether to standardize 

In [61]:
lr_model.summary.r2
#tunning yapmadan elde ettigimiz deger

0.8376909414684949

In [62]:
pipeline_model.stages

[StringIndexerModel: uid=StringIndexer_43b4d3b86c93, handleInvalid=error,
 OneHotEncoderModel: uid=OneHotEncoder_f7a4b9b8e522, dropLast=true, handleInvalid=error, numInputCols=1, numOutputCols=1,
 VectorAssembler_370ab6fca250,
 LinearRegressionModel: uid=LinearRegression_5bb233257612, numFeatures=13]

In [63]:
# En iyi pipeline model içinden lr modeli almak
#pipe linde daki stages -1 alinca linear regressyonu secmis oluyoruz
lr_model = best_model.stages[-1]

In [64]:
lr_model.coefficients

DenseVector([-0.1325, -0.0169, 0.0776, -0.1209, 0.0327, -0.0594, 0.1327, 0.0196, -0.4508, 0.0001, 9.6153, 0.9558, -0.7429])

In [65]:
lr_model.explainParams().split("\n")

['aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)',
 'elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)',
 'epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)',
 'featuresCol: features column name. (default: features, current: features)',
 'fitIntercept: whether to fit an intercept term. (default: True)',
 'labelCol: label column name. (default: label, current: label)',
 'loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)',
 'maxIter: max number of iterations (>= 0). (default: 100)',
 'predictionCol: prediction column name. (default: prediction)',
 'regParam: regularization parameter (>= 0). (default: 0.0)',
 'solver: The solver algorithm for optimization. Supported options: auto, normal, l-bfgs. (default: auto)',
 'standa