In [None]:
!pip install pyspark


# What for?

In this notebook I walk through how to use PySpark to load the data and build a data generator without running out of memory. This helps when training large models within the resources that Kaggle itself provides.

# Why Pyspark?

In Apache Spark, a DataFrame is a distributed collection of rows under named columns. In simple terms, it is same as a table in relational database or an Excel sheet with Column headers. It also shares some common characteristics with RDD:

* **Immutable in nature**: Once a DataFrame / RDD has been created it cannot be changed; applying a transformation produces a new DataFrame / RDD instead.
* **Lazy Evaluations**: Which means that a task is not executed until an action is performed.
* **Distributed**: RDD and DataFrame both are distributed in nature.

My first exposure to DataFrames was when I learnt about Pandas. Today, it is difficult for me to run my data science workflow without Pandas DataFrames. So, when I saw similar functionality in Apache Spark, I was excited about the possibilities it opens up!

<a href="https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/">Click here to learn more...</a>

Here, the main reason for using PySpark is its lazy evaluation property.

**Pandas** loads the whole dataset into memory, and every transformation we apply to that already-loaded data uses up even more memory.

**RDD's** are useful because they allow users to process data at the "row" level without having to load all data into memory.

### Saves Computation and increases Speed
Spark's lazy evaluation plays a key role in saving computation overhead, since only the values that are actually needed get computed.

### Reduces Complexities
The two main complexities of any operation are time and space complexity. PySpark's lazy evaluation helps with both: not every operation is executed immediately, and an action is triggered only when the data is required, which reduces overhead.

In our case, the data generator outputs the chunk of data that we need for training, so we do not need to load everything into memory; we can load it on demand.

In [None]:
import pyspark
import gc
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, lit, udf
from pyspark.sql.types import FloatType, StructType, StructField, IntegerType, Row
import pandas as pd

In [None]:
# Read only the user_id column so the rest of train.csv never has to sit in pandas memory.
data = pd.read_csv("../input/riiid-test-answer-prediction/train.csv", usecols=['user_id'])

In [None]:
# Count the rows per user and flatten the result to a NumPy array with one row per
# distinct user: the user id first, then that user's row count.
user_ids = data.user_id.value_counts().reset_index().values

In [None]:
# Quick look at the array dimensions (expected: one row per distinct user).
user_ids.shape

In [None]:
# The pandas frame is no longer needed once the id array exists; drop it and ask the
# garbage collector to reclaim the memory right away.
del data
gc.collect()

In [None]:

# Start (or reuse) the Spark session for this notebook.
session_builder = SparkSession.builder.appName("SparkByExamples.com")
spark = session_builder.getOrCreate()

# Read train.csv as a Spark DataFrame: column names come from the header row,
# column types are inferred from the data.
df = spark.read.csv(
    "../input/riiid-test-answer-prediction/train.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Show the column names together with the types Spark inferred for them.
df.printSchema()

In [None]:
# List the DataFrame's column names.
df.columns

In [None]:
# One aggregate per column: `when` yields a value only where the column is null,
# and `count` tallies those non-null results, i.e. the number of missing entries.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counts).show()

In [None]:
# Repartition on user_id so that rows sharing a user_id land in the same partition.
df = df.repartition('user_id')

In [None]:
# Number of partitions the DataFrame is currently split into.
df.rdd.getNumPartitions()

In [None]:
# Missing prior_question_elapsed_time: 0 for rows with content_type_id == 0, -1 otherwise.
elapsed = col('prior_question_elapsed_time')
elapsed_fallback = when(col('content_type_id') == 0, lit(0)).otherwise(lit(-1))
df2 = df.withColumn(
    'prior_question_elapsed_time',
    when(elapsed.isNull(), elapsed_fallback).otherwise(df.prior_question_elapsed_time),
)

# prior_question_had_explanation: cast to float first, then replace missing values with -1.0.
df2 = df2.withColumn(
    "prior_question_had_explanation",
    df2["prior_question_had_explanation"].cast(FloatType()),
)
df2 = df2.withColumn(
    'prior_question_had_explanation',
    when(col('prior_question_had_explanation').isNull(), lit(-1.0))
    .otherwise(df2.prior_question_had_explanation),
)

In [None]:
# Load the question and lecture metadata tables with pandas; they are turned into
# plain dictionaries in the next cell.
questions_data = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
lecture_data = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')

In [None]:
# question_id -> [tags, part, bundle_id]
qution_dict = {
    questions_data.question_id[row]: [
        questions_data.tags[row],
        questions_data.part[row],
        questions_data.bundle_id[row],
    ]
    for row in range(len(questions_data))
}

# lecture_id -> [tag, part, -1]; the constant -1 fills the slot that holds
# bundle_id for questions, so both dictionaries share one value layout.
lecture_dict = {
    lecture_data.lecture_id[row]: [
        lecture_data.tag[row],
        lecture_data.part[row],
        -1,
    ]
    for row in range(len(lecture_data))
}

In [None]:
# The dictionaries built above replace these frames, so release them.
del questions_data, lecture_data

In [None]:
def _parse_tags(raw_tags, n_slots=6):
    """Turn one raw tag cell into exactly ``n_slots`` floats, padded with -1.0."""
    try:
        # str() lets a single numeric tag be parsed the same way as a
        # space-separated string of tags.
        tags = [float(token) for token in str(raw_tags).split()]
    except ValueError:
        tags = []
    # A missing cell (NaN) survives the str()/float() round trip as NaN;
    # treat it, like an unparseable cell, as "no tags".
    if not tags or any(value != value for value in tags):
        tags = [-1.0]
    return (tags + [-1.0] * n_slots)[:n_slots]


def leg_que_merge(content_id, content_type_id):
    """Look up the metadata features of one interaction row.

    Args:
        content_id: key into ``qution_dict`` or ``lecture_dict``.
        content_type_id: 0 selects ``qution_dict``; any other value selects
            ``lecture_dict``.

    Returns:
        A ``Row`` with the float fields ``part``, ``tag1`` .. ``tag6`` and
        ``bundle``. Unused tag slots hold -1.0.

    Raises:
        KeyError: if ``content_id`` is missing from the selected dictionary.
    """
    lookup = qution_dict if content_type_id == 0 else lecture_dict
    raw_tags, part, bundle = lookup[content_id]
    tag = _parse_tags(raw_tags)
    return Row('part', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'tag6', 'bundle')(
        float(part), tag[0], tag[1], tag[2], tag[3], tag[4], tag[5], float(bundle)
    )

In [None]:
# Struct returned by the UDF: eight non-nullable float fields, in the same order
# in which leg_que_merge fills its Row.
feature_names = ["part", "tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "bundle"]
schema = StructType([StructField(name, FloatType(), False) for name in feature_names])

mapper = udf(leg_que_merge, schema)

# Attach the looked-up features as one struct column named "Output".
features_struct = mapper(df2['content_id'], df2['content_type_id'])
df2 = df2.withColumn("Output", features_struct)

In [None]:
# Expand the "Output" struct into top-level columns next to user_id and content_type_id.
e_features = df2.select("user_id", "content_type_id", "Output.*")

In [None]:
# Trigger the computation and print the first rows of the feature table.
e_features.show()

In [None]:
# Narrow projection of the raw DataFrame `df` (not of df2): only these three columns are kept.
df3 = df.select("user_id","content_id", "content_type_id")

In [None]:
# Spread the rows over 60 partitions, then print the plan Spark will execute.
# NOTE(review): repartitioning by count alone presumably discards the earlier
# per-user_id partitioning — confirm this is intended.
df3 = df3.repartition(60)
df3.explain()

In [None]:
def data_generatore(include_lecutes = False):
    """Yield the rows of ``df3`` for one user at a time.

    Walks over ``user_ids`` and, for every user, filters ``df3`` on that user
    and collects the matching rows to the driver.

    Args:
        include_lecutes: kept for interface compatibility; the value is not
            used, so no rows are filtered out by content type.

    Yields:
        The list returned by ``DataFrame.collect()`` for the current user.
    """
    for entry in user_ids:
        # Each entry of user_ids is an (id, row count) pair. Calling float()
        # on the whole pair raises TypeError, so take the id explicitly.
        user_id = int(entry[0])
        yield df3.filter(col('user_id') == user_id).collect()

In [None]:
%time
next(iter(data_generatore()))

## Tensorflow

# Why tensorflow Dataset API?

The TensorFlow Dataset API can load data from plenty of sources, such as a pandas DataFrame, a NumPy array, a CSV file or a TFRecord file. It also provides a bunch of operations to transform the data and to make its consumption optimal and more robust.
<div style="padding-left:5%;background-color:grey">
    <ul>
    <li>shuffle: randomly mix the data to remove the correlation</li>
    <li>map: apply a user-defined function to multiple data entries at the same time. (This is very useful for preprocessing)</li>
    <li>batch : structure the data in mini-batches for training</li>
    <li>prefetch: cache batches in memory, ready to be consumed instantly</li>
    </ul>
    </div>
    
 Achieving peak performance requires an efficient input pipeline that delivers data for the next step before the current step has finished. The tf.data API helps to build flexible and efficient input pipelines
 
TensorFlow Profiler aims to help users diagnose and fix input pipeline performance issues by finding the performance bottleneck


**NOTE:**
By design, TensorFlow is based on lazy execution (though we can force eager execution). That means, it does not actually process the data available till it has to. It just gathers all the information that we feed into it. It processes only when we finally ask it to process.

<a href = "https://www.tensorflow.org/guide/data">To learn more...</a>

* Loading the csv data using <a href="https://www.tensorflow.org/api_docs/python/tf/data/experimental/CsvDataset">CsvDataset</a>

In my case I only need 5 columns (timestamp, user_id, content_id, content_type_id, answered_correctly). The cell below reads every column of the file; to read only a subset, pass `select_cols` to `CsvDataset`.

Note:
   Loading all the columns won't affect the execution speed.

In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd


filenames = '../input/riiid-test-answer-prediction/train.csv'
# One default value per CSV column, in file order; the Python type of each default
# also determines the dtype the column is parsed as.
# The literals are exactly what np.int(), np.float() and np.str() used to return;
# those aliases no longer exist in current NumPy releases.
dtype = (0, 0.0, 0, 0, 0.0, 0.0, 0.0, 0.0, "", "")

df_ = tf.data.experimental.CsvDataset(
    filenames, record_defaults = dtype,
    header=True
)

In [None]:
# Pull a single record to check that the file parses with the defaults above.
next(iter(df_))

In [None]:
# Read the user_id column again; the frame loaded for the PySpark part was deleted there.
data = pd.read_csv('../input/riiid-test-answer-prediction/train.csv', usecols = ['user_id'])

In [None]:
# One row per distinct user: the user id first, then that user's row count.
user_ids = data.user_id.value_counts().reset_index().values

In [None]:
def preprocessing(*args):
    """Replace the last two raw string fields of a record with float features.

    Args:
        *args: the fields of one CSV record in file order. ``args[4]`` is
            compared with 0 (presumably content_type_id, 0 = question),
            ``args[-2]`` is the raw elapsed-time string and ``args[-1]`` the
            raw had-explanation string.

    Returns:
        A tuple of the first eight fields unchanged, followed by:
          * elapsed time: the parsed number, or, when the field is empty,
            0.0 if ``args[4] == 0`` and -1.0 otherwise;
          * explanation flag: 1.0 for ``True``, 0.0 for any other non-empty
            value, -1.0 when the field is empty.
    """
    raw_elapsed = args[-2]
    raw_explained = args[-1]

    if raw_elapsed == b"":
        # Nothing to parse: pick the fill value from the content type.
        if args[4] == 0:
            elapsetime = 0.0
        else:
            elapsetime = -1.0
    else:
        elapsetime = float(raw_elapsed)

    # The flag lives in the LAST field; the elapsed time is the one before it.
    if raw_explained == b"":
        bollean = -1.0
    elif raw_explained == b"True":
        bollean = 1.0
    else:
        bollean = 0.0
    return (*args[:8], elapsetime, bollean)

In [None]:
# Apply preprocessing to every record; AUTOTUNE lets tf.data choose the parallelism.
newdf_ = df_.map(preprocessing, num_parallel_calls = tf.data.experimental.AUTOTUNE)

In [None]:
# Inspect one preprocessed record.
next(iter(newdf_))


**Create a lookup table for the question and lecture data, used to map a content id to its corresponding tags and part.**

#### Why we are using Lookup table?
In PySpark we used a Python dictionary as the lookup table, but that does not work inside `Dataset.map`: the mapped function is traced into a TensorFlow graph, so its arguments are symbolic tensors that cannot be used as dictionary keys. As an alternative to the dictionary, I used a lookup table from TensorFlow. This way the whole operation that happens in the dataset stays in graph form.


**Note**: If you want to use a plain dictionary instead, the lookup has to run as ordinary Python code, for example by wrapping it in `tf.py_function`.

In [None]:
questions_data = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
lecture_data = pd.read_csv('../input/riiid-test-answer-prediction/lectures.csv')

# Questions: three parallel lists holding the id, the feature row
# (six tag slots padded with -1.0, then part and bundle_id) and the row position.
ques_key, ques_val, ques_idx = [], [], []
for row in range(len(questions_data)):
    raw_tags = questions_data.tags[row]
    if str(raw_tags) == 'nan':
        tags = [-1.0]
    else:
        tags = [float(token) for token in raw_tags.split(" ")]
    padding = [-1.0] * (6 - len(tags))
    ques_key.append(questions_data.question_id[row])
    ques_val.append(tags + padding + [questions_data.part[row], questions_data.bundle_id[row]])
    ques_idx.append(row)

# Lectures: same layout, with a constant -1 in the slot that holds bundle_id for questions.
lec_key, lec_val, lec_idx = [], [], []
for row in range(len(lecture_data)):
    raw_tag = str(lecture_data.tag[row])
    if raw_tag == 'nan':
        tags = [-1.0]
    else:
        tags = [float(token) for token in raw_tag.split(" ")]
    padding = [-1.0] * (6 - len(tags))
    lec_key.append(lecture_data.lecture_id[row])
    lec_val.append(tags + padding + [lecture_data.part[row], -1])
    lec_idx.append(row)

In [None]:
# The key/value lists built above replace these frames, so release them.
del lecture_data, questions_data

In [None]:
# Row i of each tensor is the feature vector of the i-th question / lecture.
question_values = tf.constant(ques_val)
lecture_values = tf.constant(lec_val)

# id -> row-position tables; an id that is not in the table maps to -1.
que_init = tf.lookup.KeyValueTensorInitializer(
    keys=tf.constant(ques_key),
    values=tf.constant(ques_idx),
)
question_lookup_table = tf.lookup.StaticHashTable(initializer=que_init, default_value=-1)

lec_init = tf.lookup.KeyValueTensorInitializer(
    keys=tf.constant(lec_key),
    values=tf.constant(lec_idx),
)
lecture_lookup_table = tf.lookup.StaticHashTable(initializer=lec_init, default_value=-1)

# Smoke test: fetch the feature row stored for question id 5692.
input_tensor = tf.constant([5692])
row_index = question_lookup_table.lookup(input_tensor).numpy()[0]
question_values[row_index]

In [None]:
def leg_que_merge(*args):
    """Attach the looked-up feature vector to one preprocessed record.

    ``args[3]`` is cast to int32 and used as the lookup key. When
    ``args[4] == 0`` the question table and ``question_values`` are used,
    otherwise the lecture table and ``lecture_values``.

    Returns:
        ``(args[1], args[2], args[7], features)``, where ``features`` is the
        row selected from the matching values tensor.
    """
    content_key = tf.cast(args[3], dtype = tf.int32)
    if args[4] == 0:
        features = question_values[question_lookup_table.lookup(content_key)]
    else:
        features = lecture_values[lecture_lookup_table.lookup(content_key)]
    return (args[1], args[2], args[7], features)

In [None]:
# Swap each record for the 4-tuple produced by leg_que_merge.
# This rebinds newdf_, so the cell is meant to be run once: running it again would
# feed the 4-tuples back into leg_que_merge.
newdf_ = newdf_.map(leg_que_merge, num_parallel_calls = tf.data.experimental.AUTOTUNE)

In [None]:
iterator = iter(newdf_.batch(1))

# Print the first two records, separated by a ruler line.
for sample_no in range(2):
    if sample_no:
        print("="*80)
    timestamp, user_id, answered_correctly, features = next(iterator)

    print("User : ", user_id.numpy()[0], " " * 50 + "TimeStamp : ", timestamp.numpy()[0])
    print()
    # np.int was only an alias of the builtin int and no longer exists in
    # current NumPy releases; int gives the same conversion.
    print("Features : ", features.numpy().astype(int))
    print("Label : ", answered_correctly.numpy()[0])

## Conclusion:

Both PySpark and TensorFlow work with lazy execution. The whole dataset is not pulled into memory; instead we fetch it on demand. This helps us avoid the out-of-memory issues we run into when working with pandas.

But if we look at execution speed, TensorFlow seems to be much faster than PySpark.