In [74]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Create (or reuse) the Spark session used by this cell.
spark = (
    SparkSession.builder
    .appName("df_most_interviewed")
    .getOrCreate()
)

# Describe the three CSV columns explicitly instead of inferring them:
# both ids are read as strings, the rating as an integer.
table_schema = t.StructType(
    [
        t.StructField(field_name, field_type, False)
        for field_name, field_type in (
            ("interviwer_id", t.StringType()),
            ("occupation_id", t.StringType()),
            ("rating", t.IntegerType()),
        )
    ]
)

csv_file_path = "file:///home/jovyan/work/sample/like.csv"
df = spark.read.csv(csv_file_path, schema=table_schema)

# Number of rows per occupation_id, largest group first.
interviwer_count = (
    df.groupBy("occupation_id")
    .count()
    .orderBy(f.desc("count"))
)

# Bring the aggregate to the driver and print one "id: count" line per occupation.
for row in interviwer_count.select(
    "occupation_id",
    f.col("count").alias("cnt"),
).collect():
    print(f"{row.occupation_id}: {row.cnt}")


# But what if we want to know which occupation each occupation_id stands for?
# 1100: engineer
# 2030: developer
# 3801: painter
# 3021: chemistry teacher
# 9382: priest

# Lookup table: occupation_id (string key) -> occupation name.
meta = dict([
    ("1100", "engineer"),
    ("2030", "developer"),
    ("3801", "painter"),
    ("3021", "chemistry teacher"),
    ("9382", "priest"),
])

# Wrap the table in a Spark broadcast variable; the lookup function reads it via `.value`.
occupation_dict = spark.sparkContext.broadcast(meta)

def get_occupation_name(occupation_id: str) -> "str | None":
    """Return the occupation name for ``occupation_id`` from the broadcast table.

    The id is normalised with ``str()`` before the lookup: the table is keyed
    by strings, while the ``occupation_id`` column is an integer when the CSV
    is read with ``inferSchema``.

    Args:
        occupation_id: Occupation code; a string, an int, or ``None``.

    Returns:
        The mapped name, or ``None`` when the id is ``None`` or has no entry
        in the table (plain indexing would raise ``KeyError`` instead).
    """
    if occupation_id is None:
        return None
    return occupation_dict.value.get(str(occupation_id))

# Expose the Python lookup as a Spark UDF.
occupation_lookup_udf = f.udf(get_occupation_name)

# Add the resolved name next to every occupation_id in the counts table.
occupation_with_name = interviwer_count.withColumn(
    "occupation_name",
    occupation_lookup_udf(f.col("occupation_id")),
)

occupation_with_name.show(10)

ConnectionRefusedError: [Errno 111] Connection refused

In [69]:
df.printSchema()

root
 |-- interviwer_id: string (nullable = true)
 |-- occupation_id: string (nullable = true)
 |-- rating: integer (nullable = true)



In [70]:
# Names for the three CSV columns, in file order.
col = ['interviwer_id', 'occupation_id','rating']
csv_file_path = "file:///home/jovyan/work/sample/like.csv"

# Let Spark infer the column types, then attach the names in the same chain.
df = (
    spark.read
    .option("inferSchema","True")
    .csv(csv_file_path)
    .toDF(*col)
)

In [71]:
df.printSchema()

root
 |-- interviwer_id: integer (nullable = true)
 |-- occupation_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [64]:
interviwer_count.printSchema()

root
 |-- occupation_id: integer (nullable = true)
 |-- count: long (nullable = false)



In [65]:
# Rows per occupation_id, most frequent occupation first.
interviwer_count = (
    df.groupBy("occupation_id")
    .count()
    .orderBy(f.desc("count"))
)
interviwer_count.show()

+-------------+-----+
|occupation_id|count|
+-------------+-----+
|         1100|  217|
|         3801|  203|
|         2030|  200|
|         3021|  191|
|         9382|  189|
+-------------+-----+



In [72]:
# Lookup table: occupation_id (string key) -> occupation name.
meta = dict([
    ("1100", "engineer"),
    ("2030", "developer"),
    ("3801", "painter"),
    ("3021", "chemistry teacher"),
    ("9382", "priest"),
])

# Broadcast variable holding the table; the lookup function reads it via `.value`.
occupation_dict = spark.sparkContext.broadcast(meta)

def get_occupation_name(occupation_id: str) -> "str | None":
    """Return the occupation name for ``occupation_id`` from the broadcast table.

    The id is normalised with ``str()`` before the lookup: the table is keyed
    by strings, while the ``occupation_id`` column is an integer when the CSV
    is read with ``inferSchema``.

    Args:
        occupation_id: Occupation code; a string, an int, or ``None``.

    Returns:
        The mapped name, or ``None`` when the id is ``None`` or has no entry
        in the table (plain indexing would raise ``KeyError`` instead).
    """
    if occupation_id is None:
        return None
    return occupation_dict.value.get(str(occupation_id))

# Expose the Python lookup as a Spark UDF.
occupation_lookup_udf = f.udf(get_occupation_name)

# Add the resolved name next to every occupation_id in the counts table.
occupation_with_name = interviwer_count.withColumn(
    "occupation_name",
    occupation_lookup_udf(f.col("occupation_id")),
)

occupation_with_name.show()


+-------------+-----+-----------------+
|occupation_id|count|  occupation_name|
+-------------+-----+-----------------+
|         1100|  217|         engineer|
|         3801|  203|          painter|
|         2030|  200|        developer|
|         3021|  191|chemistry teacher|
|         9382|  189|           priest|
+-------------+-----+-----------------+



In [None]:
df

In [1]:
print(1)

1


In [4]:
from pyspark.sql import SparkSession, functions as f, types as t

In [5]:
spark = SparkSession.builder.appName("sp").getOrCreate()

In [6]:
path = 'file:///home/jovyan/work/sample/like.csv'

In [7]:
# Read the CSV at `path`, letting Spark infer each column's type.
df= spark.read.option("inferSchema","True").csv(path)

In [9]:
# Positional names for the three CSV columns.
col = ['id1','id2','point']
df = df.toDF(*col)

In [11]:
df.show()

+-----+----+-----+
|  id1| id2|point|
+-----+----+-----+
|11657|1100|    8|
|13727|2030|    2|
|59892|3801|    1|
| 6538|3021|    6|
|95811|2030|    9|
|54500|1100|   10|
|69741|2030|    3|
|51166|2030|   10|
|70009|9382|    5|
|63152|2030|    6|
|70758|1100|    2|
|35580|2030|    5|
|63199|1100|   10|
|33078|2030|    3|
|97480|9382|    2|
|47223|1100|    8|
|80308|3021|    8|
|26691|1100|    3|
|17194|3021|    3|
|96584|2030|    4|
+-----+----+-----+
only showing top 20 rows



In [18]:
# Row count per id2, most frequent first.
# Bound to `dat` because the cells that follow (printSchema, the string cast,
# the UDF lookup and the temp view) all read `dat`, which was otherwise never
# assigned on a fresh kernel; this also stops shadowing the stdlib name `re`.
dat = df.groupBy(df.id2).count().orderBy(f.desc("count"))
dat.show()

+----+-----+
| id2|count|
+----+-----+
|1100|  217|
|3801|  203|
|2030|  200|
|3021|  191|
|9382|  189|
+----+-----+



In [70]:
dat.printSchema()

root
 |-- id2: integer (nullable = true)
 |-- count: long (nullable = false)



In [78]:
# Cast id2 to string so it matches the string keys of the `meta` lookup dict.
dat = dat.withColumn("id2", f.col("id2").cast("String"))

In [81]:
dat.printSchema()

root
 |-- id2: string (nullable = true)
 |-- count: long (nullable = false)



In [91]:
# Lookup table: id2 (string key) -> label.
meta = dict([
    ("1100", "천재"),
    ("3801", "바보"),
    ("2030", "보통"),
    ("3021", "준수"),
    ("9382", "비범"),
])

# Broadcast variable holding the table; get_name reads it via `.value`.
meta_dict = spark.sparkContext.broadcast(meta)

def get_name(id):
    """Return the label mapped to ``id`` in the broadcast ``meta_dict`` table.

    The id is normalised with ``str()`` before the lookup because the table is
    keyed by strings while the id2 column is an integer unless it is cast.

    Args:
        id: Lookup key; a string, an int, or ``None``.

    Returns:
        The mapped label, or ``None`` when ``id`` is ``None`` or has no entry
        in the table (plain indexing would raise ``KeyError`` instead).
    """
    if id is None:
        return None
    return meta_dict.value.get(str(id))

# Expose get_name as a Spark UDF.
meta_udf = f.udf(get_name)

# Attach the label looked up from id2 as a new "name" column.
dat2 = dat.withColumn(
    "name",
    meta_udf(f.col("id2")),
)

In [93]:
dat2.show()

+----+-----+----+
| id2|count|name|
+----+-----+----+
|1100|  217|천재|
|3801|  203|바보|
|2030|  200|보통|
|3021|  191|준수|
|9382|  189|비범|
+----+-----+----+



In [94]:
path = "file:///home/jovyan/work/sample/like.csv"

In [95]:
from pyspark.sql import SparkSession, functions as f , types as t

In [96]:
spark = SparkSession.builder.appName("test").getOrCreate()

In [98]:
# Read the CSV at `path`, letting Spark infer each column's type.
df = spark.read.option("inferSchema","true").csv(path)

In [100]:
df.show(10)

+-----+----+---+
|  _c0| _c1|_c2|
+-----+----+---+
|11657|1100|  8|
|13727|2030|  2|
|59892|3801|  1|
| 6538|3021|  6|
|95811|2030|  9|
|54500|1100| 10|
|69741|2030|  3|
|51166|2030| 10|
|70009|9382|  5|
|63152|2030|  6|
+-----+----+---+
only showing top 10 rows



In [106]:
# Replace the default _c0/_c1/_c2 column names with meaningful ones.
df = df.toDF('id1','id2','val')

In [107]:
df.show(10)

+-----+----+---+
|  id1| id2|val|
+-----+----+---+
|11657|1100|  8|
|13727|2030|  2|
|59892|3801|  1|
| 6538|3021|  6|
|95811|2030|  9|
|54500|1100| 10|
|69741|2030|  3|
|51166|2030| 10|
|70009|9382|  5|
|63152|2030|  6|
+-----+----+---+
only showing top 10 rows



In [108]:
df.printSchema()

root
 |-- id1: integer (nullable = true)
 |-- id2: integer (nullable = true)
 |-- val: integer (nullable = true)



In [109]:
# Cast id2 to string so it matches the string keys of the `meta` lookup dict.
df = df.withColumn("id2", f.col("id2").cast("String"))

In [110]:
df.printSchema()

root
 |-- id1: integer (nullable = true)
 |-- id2: string (nullable = true)
 |-- val: integer (nullable = true)



In [111]:
# Row count per id2, most frequent first.
# NOTE(review): this rebinds `df` to the aggregate, so re-running the cell would
# group the already-aggregated frame; a separate name would be safer.
df = df.groupBy(df.id2).count().orderBy(f.desc("count"))

In [112]:
df.show()

+----+-----+
| id2|count|
+----+-----+
|1100|  217|
|3801|  203|
|2030|  200|
|3021|  191|
|9382|  189|
+----+-----+



In [116]:
# Lookup table: id2 (string key) -> occupation name.
meta = dict([
    ("1100", "engineer"),
    ("2030", "developer"),
    ("3801", "painter"),
    ("3021", "chemistry teacher"),
    ("9382", "priest"),
])

# Broadcast variable holding the table; get_name reads it via `.value`.
meta_dict = spark.sparkContext.broadcast(meta)

def get_name(id):
    """Return the name mapped to ``id`` in the broadcast ``meta_dict`` table.

    The id is normalised with ``str()`` before the lookup because the table is
    keyed by strings while the id2 column is an integer unless it is cast.

    Args:
        id: Lookup key; a string, an int, or ``None``.

    Returns:
        The mapped name, or ``None`` when ``id`` is ``None`` or has no entry
        in the table (plain indexing would raise ``KeyError`` instead).
    """
    if id is None:
        return None
    return meta_dict.value.get(str(id))

# Expose get_name as a Spark UDF.
meta_udf = f.udf(get_name)

# Attach the name looked up from id2 as a new "name" column.
df = df.withColumn(
    "name",
    meta_udf(f.col("id2")),
)




In [117]:
df.show(10)

+----+-----+-----------------+
| id2|count|             name|
+----+-----+-----------------+
|1100|  217|         engineer|
|3801|  203|          painter|
|2030|  200|        developer|
|3021|  191|chemistry teacher|
|9382|  189|           priest|
+----+-----+-----------------+



In [119]:
dat.show(10)

+----+-----+
| id2|count|
+----+-----+
|1100|  217|
|3801|  203|
|2030|  200|
|3021|  191|
|9382|  189|
+----+-----+



In [121]:
# Register both frames as temp views so they can be queried with spark.sql.
df.createOrReplaceTempView('df')
dat.createOrReplaceTempView('dat')

In [126]:
# Left outer join of view `dat` (alias a) with view `df` (alias b) on id2,
# keeping every column of `dat` plus the matching `name`.
spark.sql('select a.* , b.name from dat a left outer join df b on a.id2 = b.id2').show()

+----+-----+-----------------+
| id2|count|             name|
+----+-----+-----------------+
|1100|  217|         engineer|
|2030|  200|        developer|
|3801|  203|          painter|
|3021|  191|chemistry teacher|
|9382|  189|           priest|
+----+-----+-----------------+



In [1]:
dat

NameError: name 'dat' is not defined

In [89]:
from pyspark.sql import functions as f, SparkSession, types as t


# Create (or reuse) the Spark session for the hero-network exercise.
spark = (
    SparkSession.builder
    .appName("df_study")
    .getOrCreate()
)

csv_file_path = "file:///home/jovyan/work/sample/hero-network.csv"

# The file has a header row; column types are inferred.
df = (
    spark.read
    .option("header","True")
    .option("inferSchema","True")
    .csv(csv_file_path)
)
df.show(10)

+--------------------+--------------------+
|               hero1|               hero2|
+--------------------+--------------------+
|       LITTLE, ABNER|      PRINCESS ZANDA|
|       LITTLE, ABNER|BLACK PANTHER/T'CHAL|
|BLACK PANTHER/T'CHAL|      PRINCESS ZANDA|
|       LITTLE, ABNER|      PRINCESS ZANDA|
|       LITTLE, ABNER|BLACK PANTHER/T'CHAL|
|BLACK PANTHER/T'CHAL|      PRINCESS ZANDA|
|STEELE, SIMON/WOLFGA|    FORTUNE, DOMINIC|
|STEELE, SIMON/WOLFGA| ERWIN, CLYTEMNESTRA|
|STEELE, SIMON/WOLFGA|IRON MAN/TONY STARK |
|STEELE, SIMON/WOLFGA|IRON MAN IV/JAMES R.|
+--------------------+--------------------+
only showing top 10 rows



+--------------------+--------------------+
|               hero1|               hero2|
+--------------------+--------------------+
|       LITTLE, ABNER|      PRINCESS ZANDA|
|       LITTLE, ABNER|BLACK PANTHER/T'CHAL|
|BLACK PANTHER/T'CHAL|      PRINCESS ZANDA|
|       LITTLE, ABNER|      PRINCESS ZANDA|
|       LITTLE, ABNER|BLACK PANTHER/T'CHAL|
|BLACK PANTHER/T'CHAL|      PRINCESS ZANDA|
|STEELE, SIMON/WOLFGA|    FORTUNE, DOMINIC|
|STEELE, SIMON/WOLFGA| ERWIN, CLYTEMNESTRA|
|STEELE, SIMON/WOLFGA|IRON MAN/TONY STARK |
|STEELE, SIMON/WOLFGA|IRON MAN IV/JAMES R.|
|STEELE, SIMON/WOLFGA|RAVEN, SABBATH II/EL|
|RAVEN, SABBATH II/EL|    FORTUNE, DOMINIC|
|RAVEN, SABBATH II/EL| ERWIN, CLYTEMNESTRA|
|RAVEN, SABBATH II/EL|IRON MAN/TONY STARK |
|RAVEN, SABBATH II/EL|IRON MAN IV/JAMES R.|
|IRON MAN IV/JAMES R.|    FORTUNE, DOMINIC|
|IRON MAN IV/JAMES R.| ERWIN, CLYTEMNESTRA|
|IRON MAN IV/JAMES R.|IRON MAN/TONY STARK |
|IRON MAN/TONY STARK |    FORTUNE, DOMINIC|
|IRON MAN/TONY STARK | ERWIN, CL

In [92]:
# One row per hero1, with the distinct hero2 values gathered into an array
# column "connection"; hero1 is then renamed to "hero".
data = (
    df.groupBy("hero1")
    .agg(f.collect_set("hero2").alias("connection"))
    .withColumnRenamed("hero1", "hero")
)
data.show(10)

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|[ELSIE DEE, FURY,...|
|             ABSALOM|[SHATTERSTAR II/G...|
|ABSORBING MAN | MUTA|[DRAX | MUTANT X-...|
|ABSORBING MAN/CARL C|[SOMMERS, APRIL, ...|
|ADAMS, CONGRESSMAN H|[SPIDER-MAN/PETER...|
| ADAMS, NICOLE NIKKI|[JUSTICE II/VANCE...|
|    ADAMSON, REBECCA|[KABALLA, GOLEM I...|
|               ADRIA|[DORMAMMU, ANCIEN...|
|   ADVENT/KYLE GROBE|[JUSTICE II/VANCE...|
|AGAMEMNON II/ANDREI |[BLACK WIDOW/NATA...|
+--------------------+--------------------+
only showing top 10 rows



In [93]:
data.printSchema()

root
 |-- hero: string (nullable = true)
 |-- connection: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [None]:
# Flatten the "connection" array into a single comma-joined string.
# NOTE(review): hero names themselves contain commas (e.g. "LITTLE, ABNER"), so the
# joined string cannot be split back on ',' without over-counting; confirm this is acceptable.
data = data.withColumn('connection',f.concat_ws(',',f.col('connection')))

In [12]:
data.show()

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|ELSIE DEE,FURY, C...|
|             ABSALOM|SHATTERSTAR II/GA...|
|ABSORBING MAN | MUTA|DRAX | MUTANT X-V...|
|ABSORBING MAN/CARL C|SOMMERS, APRIL,HE...|
|ADAMS, CONGRESSMAN H|SPIDER-MAN/PETER ...|
| ADAMS, NICOLE NIKKI|JUSTICE II/VANCE ...|
|    ADAMSON, REBECCA|KABALLA,GOLEM III...|
|               ADRIA|DORMAMMU,ANCIENT ...|
|   ADVENT/KYLE GROBE|JUSTICE II/VANCE ...|
|AGAMEMNON II/ANDREI |BLACK WIDOW/NATASHA |
|      AGAMEMNON III/|ASTER, LUCIAN,HOG...|
|            AGAMOTTO|SATANNISH,DORMAMM...|
|             AGGAMON|DR. STRANGE/STEPHEN |
|              AGINAR|SIF,REJECT/RAN-SA...|
|                AGON|MARISTA,BLACK BOL...|
|     AGUIRRE, ISOBEL|TERMINUS,HUMAN TO...|
|               AINET|STORM/ORORO MUNRO...|
|    AKUTAGAWA, OSAMU|HUMAN TORCH/JOHNN...|
|ALDEN, PROF. MEREDIT|CABE, BETHANY,STO...|
|              ALIOTH|LIBRA/GUST

In [15]:
# Save the hero -> connections table as CSV with a header row, after coalescing
# to at most 5 partitions.
# mode("overwrite") replaces an existing `output` directory: with the default
# save mode the write raises when the path already exists, so this cell could
# not be re-run.
data.coalesce(5).write.mode("overwrite").option("header",True).csv('output')

In [17]:
# Read the CSV output directory back in (header row present, types inferred).
csv_file_path = "file:///home/jovyan/work/output"
df = (
    spark.read
    .option('header','true')
    .option('inferSchema','true')
    .csv(csv_file_path)
)

In [19]:
df.show()

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|        ABBOTT, JACK|OWL/LELAND OWLSLE...|
|                ACBA|QUASAR III/WENDEL...|
|ACHEBE, REVEREND DOC|COPYCAT/VANESSA,D...|
|          ADAM 3,031|FROST, CARMILLA,S...|
|              ADAM X|SABRETOOTH/VICTOR...|
|               ADORA|DR. SUN,CAPTAIN M...|
|           AFTERLIFE|                WONG|
|       AGEE, REBECCA|DEATHBIRD [SHI'AR...|
|AJAK/TECUMOTZIN [ETE|KA-ZAR/KEVIN PLUN...|
|                AJAX|HOTSHOT/LOUIS,HOG...|
|             AJES'HA|STERMAN, ANDREA A...|
|               AKAFE|BLACK PANTHER/T'C...|
|             ALBERIK|R'TEE,SINTARIIS,F...|
|ALCHEMY/THOMAS JONES|BRIGHTWIND,FERON,...|
|               ALPHA|HUMAN TORCH/JOHNN...|
|      ALVAREZ, FELIX|GHOST,TIGRA/GREER...|
|              AMAZON|IRONCLAD,STINGER ...|
|AMERICAN EAGLE III/J|QUASAR III/WENDEL...|
|AMPHIBIAN/KINGLEY RI|POWER PRINCESS/ZA...|
|               ANAIS|SUMMERS, N

In [21]:
df.show()

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|        ABBOTT, JACK|OWL/LELAND OWLSLE...|
|                ACBA|QUASAR III/WENDEL...|
|ACHEBE, REVEREND DOC|COPYCAT/VANESSA,D...|
|          ADAM 3,031|FROST, CARMILLA,S...|
|              ADAM X|SABRETOOTH/VICTOR...|
|               ADORA|DR. SUN,CAPTAIN M...|
|           AFTERLIFE|                WONG|
|       AGEE, REBECCA|DEATHBIRD [SHI'AR...|
|AJAK/TECUMOTZIN [ETE|KA-ZAR/KEVIN PLUN...|
|                AJAX|HOTSHOT/LOUIS,HOG...|
|             AJES'HA|STERMAN, ANDREA A...|
|               AKAFE|BLACK PANTHER/T'C...|
|             ALBERIK|R'TEE,SINTARIIS,F...|
|ALCHEMY/THOMAS JONES|BRIGHTWIND,FERON,...|
|               ALPHA|HUMAN TORCH/JOHNN...|
|      ALVAREZ, FELIX|GHOST,TIGRA/GREER...|
|              AMAZON|IRONCLAD,STINGER ...|
|AMERICAN EAGLE III/J|QUASAR III/WENDEL...|
|AMPHIBIAN/KINGLEY RI|POWER PRINCESS/ZA...|
|               ANAIS|SUMMERS, N

In [30]:

# Rank heroes by the number of comma-separated pieces in "connection".
# NOTE(review): names that contain a comma are counted as several pieces.
(
    df.withColumn(
        'connection_size',
        f.size(f.split(f.col('connection'),',')),
    )
    .orderBy(f.desc('connection_size'))
    .show()
)

+--------------------+--------------------+---------------+
|                hero|          connection|connection_size|
+--------------------+--------------------+---------------+
|     CAPTAIN AMERICA|URICH, DORIS,ARMA...|           1795|
|SPIDER-MAN/PETER PAR|RED SHIFT,GAMELIN...|           1737|
| IRON MAN/TONY STARK|RED SHIFT,SABRETO...|           1443|
|     WOLVERINE/LOGAN|SABRETOOTH/VICTOR...|           1278|
|THING/BENJAMIN J. GR|CHORD, ANDREW,CAT...|           1262|
| SCARLET WITCH/WANDA|SABRETOOTH/VICTOR...|           1246|
|HUMAN TORCH/JOHNNY S|CAT KING,BUZZ,MAK...|           1202|
|MR. FANTASTIC/REED R|ARMADILLO/ANTONIO...|           1200|
|THOR/DR. DONALD BLAK|PARKER, MAY | TIM...|           1183|
| INVISIBLE WOMAN/SUE|CAPTAIN MARVEL II...|           1143|
|BEAST/HENRY &HANK& P|AMERICAN EAGLE II...|           1140|
|              VISION|PHOSPHORUS,AMERIC...|           1110|
|                HAWK|AMERICAN EAGLE II...|           1086|
|CYCLOPS/SCOTT SUMMER|SABRETOOTH/VICTOR.

In [71]:
path = 'file:///home/jovyan/work/sample/hero-network.csv'

In [72]:
from pyspark.sql import SparkSession, functions as f , types as t

In [73]:
spark = SparkSession.builder.appName('ee').getOrCreate()

In [74]:
# Read the CSV at `path`: first line is the header, column types are inferred.
df = spark.read.option('inferSchema','true').option('header','true').csv(path)

In [75]:
df.show(20)

+--------------------+--------------------+
|               hero1|               hero2|
+--------------------+--------------------+
|       LITTLE, ABNER|      PRINCESS ZANDA|
|       LITTLE, ABNER|BLACK PANTHER/T'CHAL|
|BLACK PANTHER/T'CHAL|      PRINCESS ZANDA|
|       LITTLE, ABNER|      PRINCESS ZANDA|
|       LITTLE, ABNER|BLACK PANTHER/T'CHAL|
|BLACK PANTHER/T'CHAL|      PRINCESS ZANDA|
|STEELE, SIMON/WOLFGA|    FORTUNE, DOMINIC|
|STEELE, SIMON/WOLFGA| ERWIN, CLYTEMNESTRA|
|STEELE, SIMON/WOLFGA|IRON MAN/TONY STARK |
|STEELE, SIMON/WOLFGA|IRON MAN IV/JAMES R.|
|STEELE, SIMON/WOLFGA|RAVEN, SABBATH II/EL|
|RAVEN, SABBATH II/EL|    FORTUNE, DOMINIC|
|RAVEN, SABBATH II/EL| ERWIN, CLYTEMNESTRA|
|RAVEN, SABBATH II/EL|IRON MAN/TONY STARK |
|RAVEN, SABBATH II/EL|IRON MAN IV/JAMES R.|
|IRON MAN IV/JAMES R.|    FORTUNE, DOMINIC|
|IRON MAN IV/JAMES R.| ERWIN, CLYTEMNESTRA|
|IRON MAN IV/JAMES R.|IRON MAN/TONY STARK |
|IRON MAN/TONY STARK |    FORTUNE, DOMINIC|
|IRON MAN/TONY STARK | ERWIN, CL

In [76]:
# One row per hero1, with the distinct hero2 values gathered into an array
# column 'connection'; hero1 is then renamed to 'hero'.
data = (
    df.groupBy('hero1')
    .agg(f.collect_set('hero2').alias('connection'))
    .withColumnRenamed('hero1','hero')
)

In [79]:
# Flatten the "connection" array into a single comma-joined string.
# NOTE(review): hero names themselves contain commas (e.g. "LITTLE, ABNER"), so the
# joined string cannot be split back on ',' without over-counting; confirm this is acceptable.
data = data.withColumn("connection", f.concat_ws(",", f.col("connection")))

In [106]:
data.show(10)

+--------------------+--------------------+
|                hero|          connection|
+--------------------+--------------------+
|             ABCISSA|ELSIE DEE,FURY, C...|
|             ABSALOM|SHATTERSTAR II/GA...|
|ABSORBING MAN | MUTA|DRAX | MUTANT X-V...|
|ABSORBING MAN/CARL C|SOMMERS, APRIL,HE...|
|ADAMS, CONGRESSMAN H|SPIDER-MAN/PETER ...|
| ADAMS, NICOLE NIKKI|JUSTICE II/VANCE ...|
|    ADAMSON, REBECCA|KABALLA,GOLEM III...|
|               ADRIA|DORMAMMU,ANCIENT ...|
|   ADVENT/KYLE GROBE|JUSTICE II/VANCE ...|
|AGAMEMNON II/ANDREI |BLACK WIDOW/NATASHA |
+--------------------+--------------------+
only showing top 10 rows



In [96]:
# Apply concat_ws(',') to the "connection" column again.
# NOTE(review): an earlier cell already performs this same conversion, so this
# looks redundant when the notebook is run top to bottom; confirm and remove.
data = data.withColumn('connection',f.concat_ws(',',f.col('connection')))

In [104]:
# Rank heroes by the number of comma-separated pieces in 'connection'.
# NOTE(review): names that contain a comma are counted as several pieces.
(
    data.withColumn(
        'connection_size',
        f.size(f.split(f.col('connection'),',')),
    )
    .orderBy(f.desc('connection_size'))
    .show()
)

+--------------------+--------------------+---------------+
|                hero|          connection|connection_size|
+--------------------+--------------------+---------------+
|     CAPTAIN AMERICA|URICH, DORIS,ARMA...|           1795|
|SPIDER-MAN/PETER PAR|RED SHIFT,GAMELIN...|           1737|
|IRON MAN/TONY STARK |RED SHIFT,SABRETO...|           1443|
|    WOLVERINE/LOGAN |SABRETOOTH/VICTOR...|           1278|
|THING/BENJAMIN J. GR|CHORD, ANDREW,CAT...|           1262|
|SCARLET WITCH/WANDA |SABRETOOTH/VICTOR...|           1246|
|HUMAN TORCH/JOHNNY S|CAT KING,BUZZ,MAK...|           1202|
|MR. FANTASTIC/REED R|ARMADILLO/ANTONIO...|           1200|
|THOR/DR. DONALD BLAK|PARKER, MAY | TIM...|           1183|
|INVISIBLE WOMAN/SUE |CAPTAIN MARVEL II...|           1143|
|BEAST/HENRY &HANK& P|AMERICAN EAGLE II...|           1140|
|             VISION |PHOSPHORUS,AMERIC...|           1110|
|                HAWK|AMERICAN EAGLE II...|           1086|
|CYCLOPS/SCOTT SUMMER|SABRETOOTH/VICTOR.

In [109]:
# Same ranking by comma-separated piece count, limited to the first 10 rows.
# NOTE(review): names that contain a comma are counted as several pieces.
(
    data.withColumn(
        'connection_size',
        f.size(f.split(f.col('connection'),',')),
    )
    .orderBy(f.desc('connection_size'))
    .show(10)
)

+--------------------+--------------------+---------------+
|                hero|          connection|connection_size|
+--------------------+--------------------+---------------+
|     CAPTAIN AMERICA|URICH, DORIS,ARMA...|           1795|
|SPIDER-MAN/PETER PAR|RED SHIFT,GAMELIN...|           1737|
|IRON MAN/TONY STARK |RED SHIFT,SABRETO...|           1443|
|    WOLVERINE/LOGAN |SABRETOOTH/VICTOR...|           1278|
|THING/BENJAMIN J. GR|CHORD, ANDREW,CAT...|           1262|
|SCARLET WITCH/WANDA |SABRETOOTH/VICTOR...|           1246|
|HUMAN TORCH/JOHNNY S|CAT KING,BUZZ,MAK...|           1202|
|MR. FANTASTIC/REED R|ARMADILLO/ANTONIO...|           1200|
|THOR/DR. DONALD BLAK|PARKER, MAY | TIM...|           1183|
|INVISIBLE WOMAN/SUE |CAPTAIN MARVEL II...|           1143|
+--------------------+--------------------+---------------+
only showing top 10 rows



In [111]:
# Create (or reuse) the Spark session for the missing-data exercise.
spark = (
    SparkSession.builder
    .appName("df_missing_data")
    .getOrCreate()
)

# Load the sample with a header row and inferred column types, then preview it.
df = spark.read.csv(
    "file:///home/jovyan/work/sample/null_data.csv",
    header=True,
    inferSchema=True,
)
df.show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|2000|      NULL|  NULL|
|3000|      NULL| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [112]:
# Drop every row that contains at least one NULL and show what is left.
df.na.drop().show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [113]:
# Keep only rows with at least 2 non-NULL values (row 2000 has just its id, so it is dropped).
df.na.drop(thresh=2).show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|3000|      NULL| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [117]:
# Mean of the salary column as a Python scalar: the aggregate yields a single
# row with a single column, hence [0][0].
df.select(f.mean('salary')).collect()[0][0]

87500.0

In [118]:
df.show()

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|2000|      NULL|  NULL|
|3000|      NULL| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [120]:
# Replace NULL salaries with the column mean (computed on the driver first) and show the result.
(
    df.na.fill(
        df.select(f.mean('salary')).collect()[0][0],
        subset=['salary'],
    )
    .show()
)

+----+----------+------+
|  id|occupation|salary|
+----+----------+------+
|1000|  engineer|100000|
|2000|      NULL| 87500|
|3000|      NULL| 50000|
|4000|   teacher| 80000|
|5000|    banker|120000|
+----+----------+------+



In [121]:
# Create (or reuse) the Spark session for the date-handling exercise.
spark = (
    SparkSession.builder
    .appName("df_manage_date")
    .getOrCreate()
)

# Load the sample with a header row and inferred column types.
df = spark.read.csv(
    "file:///home/jovyan/work/sample/date_parsing.csv",
    header=True,
    inferSchema=True,
)


In [122]:
# Preview the date/number rows (show() prints the first 20 rows by default).
df.show()

+----------+------+
|      date|number|
+----------+------+
|2022-01-27|  2723|
|2021-12-29|  1460|
|2022-01-22|  3411|
|2022-01-06|  1527|
|2022-04-21|  3978|
|2022-10-23|  3443|
|2021-12-23|  1641|
|2022-05-31|  1633|
|2021-12-29|  1072|
|2021-12-30|  2936|
|2022-05-04|  2494|
|2022-06-22|  2019|
|2022-04-23|  3804|
|2022-08-04|  1619|
|2022-01-26|  1306|
|2022-09-23|  3918|
|2022-05-27|  3209|
|2022-09-20|  2333|
|2022-07-05|  1861|
|2022-07-18|  3404|
+----------+------+
only showing top 20 rows



In [137]:
# Average `number` per calendar year, formatted with thousands separators
# and two decimals, listed in year order.
(
    df.withColumn('year',f.year('date'))
    .groupBy('year')
    .agg(f.format_number(f.mean('number'),2).alias('avg'))
    .orderBy('year')
    .show()
)

+----+--------+
|year|     avg|
+----+--------+
|2021|2,195.68|
|2022|2,540.67|
+----+--------+



In [145]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Create (or reuse) the Spark session for the join exercise.
spark = (
    SparkSession.builder
    .appName("df_join")
    .getOrCreate()
)

# --- user data: one row per user (id, name, company) ---
user_col = ['id', 'name', 'company']
user_data = [
    ["1000", "Neville Hardy", "Apple"],
    ["2000", "Dacia Cohen", "Alphabet"],
    ["3000", "Elois Cox", "Neflix"],
    ["4000", "Junita Meyer", "Meta"],
    ["5000", "Cleora Banks", "Amazon"],
]
df_user = spark.createDataFrame(data=user_data, schema=user_col)
df_user.show()

# --- salary data: ids 4000 and 5000 are absent, id 6000 has no matching user ---
salary_col = ['id', 'salary', 'department']
salary_data = [
    ["1000", "150000", "engineer"],
    ["2000", "240000", "manager"],
    ["3000", "120000", "human resource"],
    ["6000", "100000", "sales"],
]
df_salary = spark.createDataFrame(data=salary_data, schema=salary_col)
df_salary.show()


+----+-------------+--------+
|  id|         name| company|
+----+-------------+--------+
|1000|Neville Hardy|   Apple|
|2000|  Dacia Cohen|Alphabet|
|3000|    Elois Cox|  Neflix|
|4000| Junita Meyer|    Meta|
|5000| Cleora Banks|  Amazon|
+----+-------------+--------+

+----+------+--------------+
|  id|salary|    department|
+----+------+--------------+
|1000|150000|      engineer|
|2000|240000|       manager|
|3000|120000|human resource|
|6000|100000|         sales|
+----+------+--------------+



In [146]:
df_user.show()
df_salary.show()

+----+-------------+--------+
|  id|         name| company|
+----+-------------+--------+
|1000|Neville Hardy|   Apple|
|2000|  Dacia Cohen|Alphabet|
|3000|    Elois Cox|  Neflix|
|4000| Junita Meyer|    Meta|
|5000| Cleora Banks|  Amazon|
+----+-------------+--------+

+----+------+--------------+
|  id|salary|    department|
+----+------+--------------+
|1000|150000|      engineer|
|2000|240000|       manager|
|3000|120000|human resource|
|6000|100000|         sales|
+----+------+--------------+



In [154]:
# Short aliases for the two frames used in the join examples below.
df1 = df_user
df2 = df_salary

In [155]:
# Left join: keep every user; salary columns are NULL where no salary row matches.
df1.join(
    df2,
    on=(df1.id == df2.id),
    how='left',
).show()

+----+-------------+--------+----+------+--------------+
|  id|         name| company|  id|salary|    department|
+----+-------------+--------+----+------+--------------+
|1000|Neville Hardy|   Apple|1000|150000|      engineer|
|2000|  Dacia Cohen|Alphabet|2000|240000|       manager|
|3000|    Elois Cox|  Neflix|3000|120000|human resource|
|4000| Junita Meyer|    Meta|NULL|  NULL|          NULL|
|5000| Cleora Banks|  Amazon|NULL|  NULL|          NULL|
+----+-------------+--------+----+------+--------------+



In [156]:
# Register both frames as temp views so they can be queried with spark.sql.
df1.createOrReplaceTempView('df1')
df2.createOrReplaceTempView('df2')

In [166]:
# Left outer join of view df1 (alias a) with view df2 (alias b) on id;
# nvl() substitutes "manager" where department is NULL (users without a salary row).
# The nvl(...) expression has no alias, so the result column is named after the expression.
spark.sql('select a.id, a.name , company, nvl(department,"manager") \
    from df1 a \
    left outer join df2 b \
    on a.id = b.id '
         ).show()

+----+-------------+--------+------------------------+
|  id|         name| company|nvl(department, manager)|
+----+-------------+--------+------------------------+
|1000|Neville Hardy|   Apple|                engineer|
|2000|  Dacia Cohen|Alphabet|                 manager|
|3000|    Elois Cox|  Neflix|          human resource|
|4000| Junita Meyer|    Meta|                 manager|
|5000| Cleora Banks|  Amazon|                 manager|
+----+-------------+--------+------------------------+



In [167]:
# Default (inner) join whose condition also restricts the match to user id 1000.
df_user.join(
    df_salary,
    on=(df_user.id == df_salary.id) & (df_user.id == 1000),
).show()

+----+-------------+-------+----+------+----------+
|  id|         name|company|  id|salary|department|
+----+-------------+-------+----+------+----------+
|1000|Neville Hardy|  Apple|1000|150000|  engineer|
+----+-------------+-------+----+------+----------+



In [168]:
# Default (inner) join on id: only users that have a salary row are kept.
df_user.join(
    df_salary,
    on=(df_user.id == df_salary.id),
).show()

+----+-------------+--------+----+------+--------------+
|  id|         name| company|  id|salary|    department|
+----+-------------+--------+----+------+--------------+
|1000|Neville Hardy|   Apple|1000|150000|      engineer|
|2000|  Dacia Cohen|Alphabet|2000|240000|       manager|
|3000|    Elois Cox|  Neflix|3000|120000|human resource|
+----+-------------+--------+----+------+--------------+



In [169]:
# Default (inner) join on id, then keep only the row for user id 1000.
(
    df_user.join(
        df_salary,
        on=(df_user.id == df_salary.id),
    )
    .where(df_user.id ==1000)
    .show()
)

+----+-------------+-------+----+------+----------+
|  id|         name|company|  id|salary|department|
+----+-------------+-------+----+------+----------+
|1000|Neville Hardy|  Apple|1000|150000|  engineer|
+----+-------------+-------+----+------+----------+

