## DataFrame Basics

In [None]:
%pyspark
# Fetch the sample CSV from its public S3 URL and load it into Spark
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/food.csv"
# Register the remote file with the Spark context so that it can be
# looked up by file name through SparkFiles
spark.sparkContext.addFile(url)
df = spark.read.csv(
    path=SparkFiles.get("food.csv"),
    header=True,  # treat the first row as column names
    sep=",",
)

# Preview the loaded DataFrame
df.show()

In [None]:
%pyspark
# Print the schema (column names and types) of the DataFrame loaded above
df.printSchema()

In [None]:
%pyspark
# Show the column names as a plain Python list
df.columns

In [None]:
%pyspark
# Describe our data: describe() only builds a new DataFrame holding the
# summary statistics, so call show() to actually compute and print them
df.describe().show()

In [None]:
%pyspark
# Import struct fields that we can use
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [None]:
%pyspark
# Describe each column as a StructField(name, data type, nullable)
schema = [
    StructField("food", StringType(), nullable=True),
    StructField("price", IntegerType(), nullable=True),
]
schema

In [None]:
%pyspark
# Wrap the list of fields in a StructType to get a schema object
# that can be handed to the CSV reader
final = StructType(schema)
final

In [None]:
%pyspark
# Re-read the same CSV, this time applying the explicit schema built above
dataframe = spark.read.csv(
    path=SparkFiles.get("food.csv"),
    schema=final,
    header=True,  # treat the first row as column names
    sep=",",
)
dataframe.show()

In [None]:
%pyspark
# Print the schema of the DataFrame that was read with the explicit schema
dataframe.printSchema()

### Accessing data

In [None]:
%pyspark
# Indexing a DataFrame by column name returns a Column object, not the data itself
dataframe['price']

In [None]:
%pyspark
# Check the type of the bracket-indexed result
type(dataframe['price'])

In [None]:
%pyspark
# select() returns a new DataFrame containing only the requested column
dataframe.select('price')

In [None]:
%pyspark
# Check the type of the select() result
type(dataframe.select('price'))

In [None]:
%pyspark
# Display the contents of the selected column
dataframe.select('price').show()

### Manipulating Columns

In [None]:
%pyspark
# Add a 'newprice' column that copies 'price'; the result is only
# displayed here, `dataframe` itself is not reassigned
(
    dataframe
    .withColumn("newprice", dataframe["price"])
    .show()
)

In [None]:
%pyspark
# Rename the 'price' column to 'newerprice' and display the result
(
    dataframe
    .withColumnRenamed("price", "newerprice")
    .show()
)

In [None]:
%pyspark
# Add a 'doubleprice' column holding twice the price
(
    dataframe
    .withColumn("doubleprice", dataframe["price"] * 2)
    .show()
)

In [None]:
%pyspark
# Add an 'add_one_dollar' column holding the price plus one
(
    dataframe
    .withColumn("add_one_dollar", dataframe["price"] + 1)
    .show()
)

In [None]:
%pyspark
# Halve the price: add a 'half_price' column holding price divided by two
(
    dataframe
    .withColumn("half_price", dataframe["price"] / 2)
    .show()
)

In [None]:
%pyspark
# Collect the 'price' column to the driver; collect() returns a list of Row objects
dataframe.select("price").collect()

### Converting a PySpark DataFrame to a pandas DataFrame

In [None]:
%pyspark
# toPandas() brings the Spark DataFrame's rows to the driver
# and returns them as a pandas DataFrame
import pandas as pd
pandas_df = dataframe.toPandas()

In [None]:
%pyspark
# Preview the first rows of the pandas DataFrame
pandas_df.head()