In [1]:
from pyspark.sql import SparkSession

# Get (or create) a Spark session on the local[*] master, i.e. running locally.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("T-systems: quinta sesion. MLib")
    .getOrCreate()
)

# SparkContext behind the session.
sc = spark.sparkContext

# Run ML models in a distributed way: train local solutions, then combine them in an ensemble

In [2]:
# Load the results CSV; header=True takes the column names from the first row.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
results_csv_path = "C:\\Users\\ruben\\Desktop\\data engineering\\s5-pysparkML\\results.csv"
df = spark.read.csv(results_csv_path, header=True)

In [3]:
import pyspark.sql.functions as sql_f

In [4]:
# Keep only the score columns and cast each of them to float.
score_columns = ['Ex01', 'Ex02', 'Ex03', 'Ex04', 'Project', 'Total']
df = df.select(*(sql_f.col(name).cast('float') for name in score_columns))

In [5]:
# Preview the first 5 rows of the casted score columns.
df.show(5)

+-----+-----+-----+----+-------+-----+
| Ex01| Ex02| Ex03|Ex04|Project|Total|
+-----+-----+-----+----+-------+-----+
|100.0| 85.0| 80.0|70.0|   80.0| 81.0|
|100.0| 85.0| 80.0|90.0|   93.0| 79.0|
|100.0|100.0| 85.0|30.0|   70.0| 42.0|
| 95.0| 95.0|100.0|55.0|   87.0| 67.0|
| 65.0| 95.0| 65.0|25.0|   70.0| 39.0|
+-----+-----+-----+----+-------+-----+
only showing top 5 rows



In [6]:
# Turn the task into a classification problem

In [7]:
# Derive the binary target from the final grade: a Total below 70 is labelled
# "not-first"; every other row is labelled "first".
df = df.withColumn(
    "Classification",
    sql_f.when(sql_f.col("Total") < 70, "not-first").otherwise("first"),
)

In [8]:
# Preview the first 5 rows, now including the derived "Classification" column.
df.show(5)

+-----+-----+-----+----+-------+-----+--------------+
| Ex01| Ex02| Ex03|Ex04|Project|Total|Classification|
+-----+-----+-----+----+-------+-----+--------------+
|100.0| 85.0| 80.0|70.0|   80.0| 81.0|         first|
|100.0| 85.0| 80.0|90.0|   93.0| 79.0|         first|
|100.0|100.0| 85.0|30.0|   70.0| 42.0|     not-first|
| 95.0| 95.0|100.0|55.0|   87.0| 67.0|     not-first|
| 65.0| 95.0| 65.0|25.0|   70.0| 39.0|     not-first|
+-----+-----+-----+----+-------+-----+--------------+
only showing top 5 rows



In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Bring the whole Spark DataFrame to the driver as a pandas DataFrame
results = df.toPandas()

# Features: every column except the last one; target: the "Classification" label.
# NOTE(review): the features therefore include "Total", the column the label was
# derived from — confirm this is intended for the demo.
X = results[results.columns[:-1]]
y = results.loc[:, "Classification"]

# Random 70/30 train/test split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Decision tree limited to depth 2, fitted on the underlying NumPy arrays
clf = DecisionTreeClassifier(max_depth=2, random_state=0)
clf.fit(X_train.to_numpy(), y_train.to_numpy())

In [10]:
# Random split of the Spark DataFrame with weights 0.7 / 0.3 and a fixed seed.
train_df, test_df = df.randomSplit(weights=[0.7, 0.3], seed=12345678)

In [11]:
# The dataset is very small, so for this demonstration the data has to be repartitioned manually

In [12]:
# Check how many partitions the training data currently has.
train_df.rdd.getNumPartitions()

1

In [13]:
# Repartition the underlying RDDs by hand: 3 partitions for training, 2 for testing.
train_rdd = train_df.rdd.repartition(numPartitions=3)
test_rdd = test_df.rdd.repartition(numPartitions=2)

In [14]:
# Column names of the Spark DataFrame; build_model uses them to rebuild a
# pandas DataFrame from the raw rows of a partition.
column_names = df.columns

In [15]:
def build_model(part_it):
    """Train one decision tree on the rows of a single RDD partition.

    Intended to be passed to ``RDD.mapPartitions``.

    Args:
        part_it: Iterable with the rows of one partition. Each row holds the
            values of ``column_names`` in order; the last column is the
            "Classification" label.

    Returns:
        A one-element list with the fitted ``DecisionTreeClassifier``, or an
        empty list when the partition contains no rows.
    """
    # Materialise the iterator so an empty partition can be detected up front.
    rows = list(part_it)
    if not rows:
        # Fitting on zero samples raises and would abort the whole job;
        # an empty partition simply contributes no model to the ensemble.
        return []

    part_df = pd.DataFrame(rows, columns=column_names)

    # Shape the pandas DataFrame for scikit-learn:
    # every column but the last is a feature, "Classification" is the target.
    X_train = part_df.iloc[:, :-1]
    y_train = part_df["Classification"]

    clf = DecisionTreeClassifier(random_state=0, max_depth=2)
    model = clf.fit(X_train.values, y_train.values)

    return [model]

In [16]:
# Train one model per training partition. mapPartitions is lazy and this RDD is
# consumed by several actions afterwards (count, collect), so cache it to avoid
# re-training every model on each action.
modelos = train_rdd.mapPartitions(build_model).cache()

In [17]:
# Count the trained models (build_model returns one per partition).
modelos.count()

3

In [18]:
# Bring the fitted models back to the driver; predict reads them from this list.
modelos_driver = modelos.collect()

In [19]:
# Display the collected models (this runs the collect action a second time).
modelos.collect()

[DecisionTreeClassifier(max_depth=2, random_state=0),
 DecisionTreeClassifier(max_depth=2, random_state=0),
 DecisionTreeClassifier(max_depth=2, random_state=0)]

In [20]:
# Look at one test row to see its structure.
test_rdd.take(1)

[Row(Ex01=95.0, Ex02=80.0, Ex03=80.0, Ex04=70.0, Project=82.0, Total=80.0, Classification='first')]

In [21]:
def predict(instance):
    """Collect the prediction of every model in ``modelos_driver`` for one row.

    Args:
        instance: A row whose last element is dropped; the remaining elements
            are passed to each model as the feature vector.

    Returns:
        A list with one predicted label per model, in the order of
        ``modelos_driver``.
    """
    votes = []
    for model in modelos_driver:
        # Each model expects a batch, so wrap the single sample in a list
        # and unwrap the single result.
        votes.append(model.predict([instance[:-1]])[0])
    return votes

In [22]:
# Per-model predictions for the first 5 test rows (one list of labels per row).
test_rdd.map(predict).take(5)

[['first', 'first', 'first'],
 ['first', 'first', 'first'],
 ['first', 'first', 'first'],
 ['not-first', 'not-first', 'not-first'],
 ['not-first', 'not-first', 'not-first']]

In [23]:
def agg_prediction(preds):
    """Majority vote over the labels predicted by the individual models.

    Args:
        preds: Iterable of predicted labels for one row, e.g. the list
            returned by ``predict``.

    Returns:
        The label with the most votes. Ties (including an empty ``preds``)
        are resolved in favour of the label counted first, so "first" wins
        over "not-first".
    """
    # Both known labels start at zero so neither gets a head start.
    votes = {"first": 0, "not-first": 0}
    for label in preds:
        # .get() also tolerates a label outside the two known classes.
        votes[label] = votes.get(label, 0) + 1
    # max() returns the first key holding the highest count (insertion order).
    return max(votes, key=votes.get)

In [24]:
# TODO (unfinished): aggregate the per-model predictions into one label per row — test_rdd.map(...)

SyntaxError: incomplete input (2950376140.py, line 1)

In [None]:
# TODO: complete this notebook following https://colab.research.google.com/drive/18ERxpDciXN0sdlUsM8PMigW9qUHFrzKk?usp=sharing#scrollTo=M0YuQLHeokSH