In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler,StringIndexer,OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor,GBTRegressor

In [2]:
spark=SparkSession.builder.appName("Maas_tahmin").getOrCreate()

In [9]:
df=spark.read.csv("/content/Salary Data.csv",header=True,inferSchema=True)

In [10]:
df.show()

+---+------+---------------+--------------------+-------------------+------+
|Age|Gender|Education Level|           Job Title|Years of Experience|Salary|
+---+------+---------------+--------------------+-------------------+------+
| 32|  Male|     Bachelor's|   Software Engineer|                5.0| 90000|
| 28|Female|       Master's|        Data Analyst|                3.0| 65000|
| 45|  Male|            PhD|      Senior Manager|               15.0|150000|
| 36|Female|     Bachelor's|     Sales Associate|                7.0| 60000|
| 52|  Male|       Master's|            Director|               20.0|200000|
| 29|  Male|     Bachelor's|   Marketing Analyst|                2.0| 55000|
| 42|Female|       Master's|     Product Manager|               12.0|120000|
| 31|  Male|     Bachelor's|       Sales Manager|                4.0| 80000|
| 26|Female|     Bachelor's|Marketing Coordin...|                1.0| 45000|
| 38|  Male|            PhD|    Senior Scientist|               10.0|110000|

In [12]:
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Education Level: string (nullable = true)
 |-- Job Title: string (nullable = true)
 |-- Years of Experience: double (nullable = true)
 |-- Salary: integer (nullable = true)



In [13]:
df=df.withColumnRenamed("Education Level","EducationLevel")\
.withColumnRenamed("Job Title","JobTitle")\
.withColumnRenamed("Years of Experience","YearsOfExperience")

In [14]:
df=df.na.drop()

In [15]:
indexer_edu=StringIndexer(inputCol="EducationLevel",outputCol="EducationLevel_index")

In [17]:
indexer_gen=StringIndexer(inputCol="Gender",outputCol="Gender_index")

In [18]:
indexer_job=StringIndexer(inputCol="JobTitle",outputCol="JobTitle_index",handleInvalid="keep")

In [19]:
encoder_job=OneHotEncoder(inputCol="JobTitle_index",outputCol="JobTitle_encoded")

In [26]:
assembler=VectorAssembler(
    inputCols=["Age","Gender_index","EducationLevel_index","JobTitle_encoded","YearsOfExperience"],
    outputCol="features"
)

In [27]:
pipeline=Pipeline(stages=[indexer_edu,indexer_gen,indexer_job,encoder_job,assembler])

In [28]:
model_pipeline=pipeline.fit(df)

In [29]:
final_data=model_pipeline.transform(df)

In [30]:
final_data.select("features","Salary").show()

+--------------------+------+
|            features|Salary|
+--------------------+------+
|(178,[0,162,177],...| 90000|
|(178,[0,1,2,46,17...| 65000|
|(178,[0,2,68,177]...|150000|
|(178,[0,1,63,177]...| 60000|
|(178,[0,2,90,177]...|200000|
|(178,[0,58,177],[...| 55000|
|(178,[0,1,2,60,17...|120000|
|(178,[0,35,177],[...| 80000|
|(178,[0,1,34,177]...| 45000|
|(178,[0,2,40,177]...|110000|
|(178,[0,2,161,177...| 75000|
|(178,[0,1,52,177]...|140000|
|(178,[0,98,177],[...| 65000|
|(178,[0,1,2,61,17...|130000|
|(178,[0,82,177],[...| 40000|
|(178,[0,59,177],[...|125000|
|(178,[0,1,2,123,1...| 90000|
|(178,[0,2,65,177]...|115000|
|(178,[0,1,86],[25...| 35000|
|(178,[0,137,177],...|180000|
+--------------------+------+
only showing top 20 rows


In [36]:
train_data,test_data=final_data.randomSplit([0.7,0.3],seed=42)

In [37]:
print("Eğitim seti sayisi: ",train_data.count())
print("test seti sayisi: ",test_data.count())

Eğitim seti sayisi:  280
test seti sayisi:  93


In [55]:
lr=LinearRegression(featuresCol="features",labelCol="Salary",predictionCol="tahmini_maas")

In [56]:
rf=RandomForestRegressor(featuresCol="features",labelCol="Salary",predictionCol="tahmini_maas",numTrees=20,maxDepth=5)

In [57]:
gbt=GBTRegressor(featuresCol="features",labelCol="Salary",predictionCol="tahmini_maas",maxIter=20,maxDepth=3,stepSize=0.05)

In [58]:
models={
    "Linear":lr,
    "Random Forest":rf,
    "GBT":gbt
}

In [60]:
evaluator_rmse=RegressionEvaluator(labelCol="Salary",predictionCol="tahmini_maas",metricName="rmse")
evaluator_r2=RegressionEvaluator(labelCol="Salary",predictionCol="tahmini_maas", metricName="r2")

In [46]:
print("\n" + "="*70)
print(f"{'Model Adı':<30} | {'RMSE (Hata)':<15} | {'R2 (Başarı)':<10}")
print("="*70)


Model Adı                      | RMSE (Hata)     | R2 (Başarı)


In [61]:
for name, model in models.items():
  trained_model=model.fit(train_data)
  predictions=trained_model.transform(test_data)

  rmse=evaluator_rmse.evaluate(predictions)
  r2=evaluator_r2.evaluate(predictions)
  print(f"{name:<30} | {rmse:<15.2f} | {r2:<10.4f}")
print("="*70)

Linear                         | 14078.93        | 0.9111    
Random Forest                  | 11722.24        | 0.9383    
GBT                            | 13610.25        | 0.9169    
