In [1]:
from pyspark.sql import SparkSession

# Build the notebook-wide Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL Jointed Table")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [2]:
from pyspark.ml.linalg import Vectors
# One-row DataFrame: a word paired with a dense three-element ML vector.
df = (
    spark.sparkContext
    .parallelize([("assert", Vectors.dense([1, 2, 3]))])
    .toDF(["word", "vector"])
)

In [3]:
# Print the word/vector DataFrame as a text table.
df.show()

+------+-------------+
|  word|       vector|
+------+-------------+
|assert|[1.0,2.0,3.0]|
+------+-------------+



In [4]:
from pyspark.ml.linalg import Vectors

In [5]:
def extract(row):
    """Flatten a (word, vector) row into a plain tuple of scalars.

    Args:
        row: an object exposing ``word`` and ``vector`` attributes, where
            ``vector`` provides ``toArray()``.

    Returns:
        A tuple ``(row.word, v0, v1, ...)`` with one Python float per
        vector position.
    """
    # toArray() yields every position of the vector. `.values` would only
    # cover the explicitly stored entries of a sparse vector, which would give
    # rows of differing lengths and misaligned columns.
    return (row.word, ) + tuple(float(x) for x in row.vector.toArray())

In [6]:
# Flatten every row with extract(); only the first output column is named here.
df_new = (
    df.rdd
    .map(extract)
    .toDF(["word"])
)

In [7]:
# Only "word" was named in toDF, so the vector components appear under
# auto-generated column names (_2, _3, _4 in the recorded output).
df_new.show()

+------+---+---+---+
|  word| _2| _3| _4|
+------+---+---+---+
|assert|1.0|2.0|3.0|
+------+---+---+---+



In [8]:
# Three integer columns plus a string column of space-separated tokens.
df = (
    spark.sparkContext
    .parallelize([
        (1, 2, 3, 'a b c'),
        (4, 5, 6, 'd e f'),
        (7, 8, 9, 'g h i'),
    ])
    .toDF(['col1', 'col2', 'col3', 'col4'])
)

In [9]:
# Display the three-row frame; col4 still holds the unsplit token strings.
df.show()

+----+----+----+-----+
|col1|col2|col3| col4|
+----+----+----+-----+
|   1|   2|   3|a b c|
|   4|   5|   6|d e f|
|   7|   8|   9|g h i|
+----+----+----+-----+



In [10]:
# Print the column names and types of the frame
# (long for col1-col3 and string for col4 in the recorded output).
df.printSchema()

root
 |-- col1: long (nullable = true)
 |-- col2: long (nullable = true)
 |-- col3: long (nullable = true)
 |-- col4: string (nullable = true)



In [11]:
from pyspark.sql.functions import split, explode
# Split col4 on spaces, then emit one output row per resulting token.
new = df.withColumn(
    'col4',
    explode(split('col4', ' '))
)

In [12]:
# Each input row now appears once per token of its original col4 string.
new.show()

+----+----+----+----+
|col1|col2|col3|col4|
+----+----+----+----+
|   1|   2|   3|   a|
|   1|   2|   3|   b|
|   1|   2|   3|   c|
|   4|   5|   6|   d|
|   4|   5|   6|   e|
|   4|   5|   6|   f|
|   7|   8|   9|   g|
|   7|   8|   9|   h|
|   7|   8|   9|   i|
+----+----+----+----+



In [13]:
# Schema check: col4 is still reported as a string column after the explode.
new.printSchema()

root
 |-- col1: long (nullable = true)
 |-- col2: long (nullable = true)
 |-- col3: long (nullable = true)
 |-- col4: string (nullable = true)



In [14]:
# Same layout as the earlier example, but col4 is comma-separated this time.
df = (
    spark.sparkContext
    .parallelize([
        (1, 2, 3, 'a,b,c'),
        (4, 5, 6, 'd,e,f'),
        (7, 8, 9, 'g,h,i'),
    ])
    .toDF(['col1', 'col2', 'col3', 'col4'])
)

In [15]:
# Split col4 on commas, then emit one output row per resulting token.
new = df.withColumn(
    'col4',
    explode(split('col4', ','))
)

In [16]:
# Show the exploded result of the comma-separated variant.
new.show()

+----+----+----+----+
|col1|col2|col3|col4|
+----+----+----+----+
|   1|   2|   3|   a|
|   1|   2|   3|   b|
|   1|   2|   3|   c|
|   4|   5|   6|   d|
|   4|   5|   6|   e|
|   4|   5|   6|   f|
|   7|   8|   9|   g|
|   7|   8|   9|   h|
|   7|   8|   9|   i|
+----+----+----+----+



In [17]:
# Left-hand table for the join example: a key (a_id) plus one extra column.
a = (
    spark.sparkContext
    .parallelize([['a', 'foo'], ['b', 'hem'], ['c', 'haw']])
    .toDF(['a_id', 'extra'])
)

In [18]:
# Display the left-hand table (columns a_id and extra).
a.show()

+----+-----+
|a_id|extra|
+----+-----+
|   a|  foo|
|   b|  hem|
|   c|  haw|
+----+-----+



In [19]:
# Right-hand table for the join example: one other column plus a key (b_id).
b = (
    spark.sparkContext
    .parallelize([['p1', 'a'], ['p2', 'b'], ['p3', 'c']])
    .toDF(["other", "b_id"])
)

In [20]:
# Display the right-hand table (columns other and b_id).
b.show()

+-----+----+
|other|b_id|
+-----+----+
|   p1|   a|
|   p2|   b|
|   p3|   c|
+-----+----+



In [21]:
# Outer join of the two tables, matching rows where a_id equals b_id.
c = a.join(b, on=(a.a_id == b.b_id), how='outer')

In [22]:
# Show the joined rows; both key columns (a_id and b_id) are kept side by side.
c.show()

+----+-----+-----+----+
|a_id|extra|other|b_id|
+----+-----+-----+----+
|   c|  haw|   p3|   c|
|   b|  hem|   p2|   b|
|   a|  foo|   p1|   a|
+----+-----+-----+----+



In [23]:
from tqdm import tqdm, tqdm_notebook

ImportError: No module named tqdm

In [None]:
# Progress-bar smoke test: iterate 10,000 times doing no work per step.
# Relies on tqdm_notebook from the preceding import cell, which raised
# ImportError in the recorded run, so tqdm must be installed first.
for i in tqdm_notebook(range(int(1e4))):
    pass 

In [None]:
# The plotting functions live in matplotlib.pyplot, not in the top-level package.
import matplotlib.pyplot as plt
import seaborn as sns
import random

# create an RDD of 100 random numbers
x = [random.normalvariate(0,1) for i in range(100)]
rdd = spark.sparkContext.parallelize(x)

# plot data in RDD - use .collect() to bring data to local
num_bins = 50
# pyplot's histogram function is hist(); density=True normalises the bars
# (it replaces the older `normed` keyword, which current matplotlib rejects).
n, bins, patches = plt.hist(rdd.collect(), num_bins, density=True, facecolor='green', alpha=0.5)

In [None]:
import pandas as pd

# Let's use UCLA's college admission dataset
# NOTE(review): this is fetched over the network; confirm the URL is still reachable.
file_name = "http://www.ats.ucla.edu/stat/data/binary.csv"

# Creating a pandas dataframe from Sample Data
pandas_df = pd.read_csv(file_name)


# Creating a Spark DataFrame from a pandas dataframe
# Pass pandas_df here: `df` is the Spark DataFrame left over from an earlier
# cell, not the data that was just loaded.
spark_df = spark.createDataFrame(pandas_df)

# Preview the first five rows of the converted frame.
spark_df.show(5)