In [0]:
# Global data variables

DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Data/"

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.2.1-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
datapath = '/content/drive/My Drive/Colab Notebooks/Data/'
pokemon = spark.read.csv(datapath + 'pokemon.csv', sep=',', header=True, inferSchema=True)
pokemon.show(3)

+---+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
|  #|     Name|Type 1|Type 2| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
|  1|Bulbasaur| Grass|Poison| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|  Ivysaur| Grass|Poison| 60|    62|     63|     80|     80|   60|         1|    false|
|  3| Venusaur| Grass|Poison| 80|    82|     83|    100|    100|   80|         1|    false|
+---+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
only showing top 3 rows



In [0]:
from pyspark.sql import functions as F

In [0]:
for col in pokemon.columns:
    pokemon = pokemon.withColumnRenamed(col, col.lower().replace(' ','_').replace('.',''))

In [0]:
pokemon.show(3)

+---+---------+------+------+---+------+-------+------+------+-----+----------+---------+
|  #|     name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+---+---------+------+------+---+------+-------+------+------+-----+----------+---------+
|  1|Bulbasaur| Grass|Poison| 45|    49|     49|    65|    65|   45|         1|    false|
|  2|  Ivysaur| Grass|Poison| 60|    62|     63|    80|    80|   60|         1|    false|
|  3| Venusaur| Grass|Poison| 80|    82|     83|   100|   100|   80|         1|    false|
+---+---------+------+------+---+------+-------+------+------+-----+----------+---------+
only showing top 3 rows



Determine la columna con el mayor número de nulos

In [0]:
n_rows_pokemon = pokemon.count()

In [0]:
missing_ops = [F.round(F.sum(F.col(c).isNull().cast('int')), 2).alias(c) 
               for c in pokemon.columns]

pokemon_nulos = pokemon.select(missing_ops)
pokemon_nulos.show()

+---+----+------+------+---+------+-------+------+------+-----+----------+---------+
|  #|name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+---+----+------+------+---+------+-------+------+------+-----+----------+---------+
|  0|   1|     0|   386|  0|     0|      0|     0|     0|    0|         0|        0|
+---+----+------+------+---+------+-------+------+------+-----+----------+---------+



In [0]:
import numpy as np

In [0]:
num_nulos = pokemon_nulos.toPandas().T
num_nulos

In [0]:

col_max_nulos = num_nulos.idxmax().values[0]
print(col_max_nulos)

type_2


Complete las variables categóricas nulas con el valor mayoritario.

In [0]:
pokemon.printSchema()

root
 |-- #: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- type_1: string (nullable = true)
 |-- type_2: string (nullable = true)
 |-- hp: integer (nullable = true)
 |-- attack: integer (nullable = true)
 |-- defense: integer (nullable = true)
 |-- sp_atk: integer (nullable = true)
 |-- sp_def: integer (nullable = true)
 |-- speed: integer (nullable = true)
 |-- generation: integer (nullable = true)
 |-- legendary: boolean (nullable = true)



In [0]:
string_cols = [col for col, tipo in pokemon.dtypes if tipo=='string']
string_cols

['name', 'type_1', 'type_2']

In [0]:
for col in string_cols:
    print(col, pokemon.select(col).distinct().count())

name 800
type_1 18
type_2 19


In [0]:
# Valor más repetido en cada columna

modes = {}
for col in string_cols:
    modes[col] = pokemon.dropna(subset=[col]).groupby(col).count().sort(F.col('count').desc()).first()[0]

In [0]:
modes

{'name': 'Squirtle', 'type_1': 'Water', 'type_2': 'Flying'}

In [0]:
for col in string_cols:
    pokemon = pokemon.fillna(modes[col], subset=[col])

In [0]:
pokemon.show()

+---+----------------+------+------+---+------+-------+------+------+-----+----------+---------+
|  #|            name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+---+----------------+------+------+---+------+-------+------+------+-----+----------+---------+
|  1|       Bulbasaur| Grass|Poison| 45|    49|     49|    65|    65|   45|         1|    false|
|  2|         Ivysaur| Grass|Poison| 60|    62|     63|    80|    80|   60|         1|    false|
|  3|        Venusaur| Grass|Poison| 80|    82|     83|   100|   100|   80|         1|    false|
|  4|   Mega Venusaur| Grass|Poison| 80|   100|    123|   122|   120|   80|         1|    false|
|  5|      Charmander|  Fire|Flying| 39|    52|     43|    60|    50|   65|         1|    false|
|  6|      Charmeleon|  Fire|Flying| 58|    64|     58|    80|    65|   80|         1|    false|
|  7|       Charizard|  Fire|Flying| 78|    84|     78|   109|    85|  100|         1|    false|
|  8|Mega Charizard X|  Fire|D

In [0]:
pokemon_nulos = pokemon.select(missing_ops)
pokemon_nulos.show()

+---+----+------+------+---+------+-------+------+------+-----+----------+---------+
|  #|name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+---+----+------+------+---+------+-------+------+------+-----+----------+---------+
|  0|   0|     0|     0|  0|     0|      0|     0|     0|    0|         0|        0|
+---+----+------+------+---+------+-------+------+------+-----+----------+---------+



In [0]:
pokemon = spark.read.csv(DATA_PATH + 'pokemon.csv', sep=',', header=True, inferSchema=True)
pokemon.show(3)

+---+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
|  #|     Name|Type 1|Type 2| HP|Attack|Defense|Sp. Atk|Sp. Def|Speed|Generation|Legendary|
+---+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
|  1|Bulbasaur| Grass|Poison| 45|    49|     49|     65|     65|   45|         1|    false|
|  2|  Ivysaur| Grass|Poison| 60|    62|     63|     80|     80|   60|         1|    false|
|  3| Venusaur| Grass|Poison| 80|    82|     83|    100|    100|   80|         1|    false|
+---+---------+------+------+---+------+-------+-------+-------+-----+----------+---------+
only showing top 3 rows



In [0]:
for col in pokemon.columns:
    pokemon = pokemon.withColumnRenamed(col, col.lower().replace(' ','_').replace('.',''))

In [0]:
pokemon_num_nulos = pokemon.select([F.col(col).isNull().cast('int').alias(col + '_nnulls') for col in pokemon.columns] + pokemon.columns)
pokemon_num_nulos.show()

+--------+-----------+-------------+-------------+---------+-------------+--------------+-------------+-------------+------------+-----------------+----------------+---+----------------+------+------+---+------+-------+------+------+-----+----------+---------+
|#_nnulls|name_nnulls|type_1_nnulls|type_2_nnulls|hp_nnulls|attack_nnulls|defense_nnulls|sp_atk_nnulls|sp_def_nnulls|speed_nnulls|generation_nnulls|legendary_nnulls|  #|            name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|
+--------+-----------+-------------+-------------+---------+-------------+--------------+-------------+-------------+------------+-----------------+----------------+---+----------------+------+------+---+------+-------+------+------+-----+----------+---------+
|       0|          0|            0|            0|        0|            0|             0|            0|            0|           0|                0|               0|  1|       Bulbasaur| Grass|Poison| 45|    49|     4

In [0]:
nulls_cols = [col + '_nnulls' for col in pokemon.columns]
nulls_cols

['#_nnulls',
 'name_nnulls',
 'type_1_nnulls',
 'type_2_nnulls',
 'hp_nnulls',
 'attack_nnulls',
 'defense_nnulls',
 'sp_atk_nnulls',
 'sp_def_nnulls',
 'speed_nnulls',
 'generation_nnulls',
 'legendary_nnulls']

In [0]:
pokemon_num_nulos.select(*pokemon_num_nulos.columns, sum(pokemon_num_nulos[col] for col in nulls_cols).alias('total_nulls')).show()

+--------+-----------+-------------+-------------+---------+-------------+--------------+-------------+-------------+------------+-----------------+----------------+---+----------------+------+------+---+------+-------+------+------+-----+----------+---------+-----------+
|#_nnulls|name_nnulls|type_1_nnulls|type_2_nnulls|hp_nnulls|attack_nnulls|defense_nnulls|sp_atk_nnulls|sp_def_nnulls|speed_nnulls|generation_nnulls|legendary_nnulls|  #|            name|type_1|type_2| hp|attack|defense|sp_atk|sp_def|speed|generation|legendary|total_nulls|
+--------+-----------+-------------+-------------+---------+-------------+--------------+-------------+-------------+------------+-----------------+----------------+---+----------------+------+------+---+------+-------+------+------+-----+----------+---------+-----------+
|       0|          0|            0|            0|        0|            0|             0|            0|            0|           0|                0|               0|  1|       Bulba

In [0]:
pokemon_num_nulos = pokemon_num_nulos.select(*pokemon_num_nulos.columns, sum(pokemon_num_nulos[col] for col in nulls_cols).alias('total_nulls'))
max_num_nulos_row = pokemon_num_nulos.select(F.max('total_nulls').alias('max')).collect()[0][0]

In [0]:
max_num_nulos_row

2

In [0]:
pokemon_num_nulos.filter(F.col('total_nulls') < 2).count()

799