In [20]:
import csv                              # for parsing CSV lines with quoted fields
from datetime import *                  # for datetime datatype for schema

import pyspark                          # for sparkContext and Dataframes
from pyspark.sql.types import *         # for defining schema with various datatypes
import pyspark.sql.functions as func    # for ETL, data processing on Dataframes

import pandas as pd                     # converting PysparkDF to PandasDF when passing it as a parameter to Bokeh invokes 

from dateutil.parser import parse       # for string parse to date

from bokeh.io import push_notebook, show, output_notebook  # various output methods for jupyter notebook
from bokeh.plotting import figure                          # creating a figure variable
from bokeh.charts import Bar, output_file, show            # creating bar charts, and displaying it
from bokeh.charts.attributes import cat                    # extracting column for 'label' category in bar charts
from bokeh.palettes import *                               # brewer color palette
from bokeh.models import HoverTool, ColumnDataSource       # for hover feature, and columnDS
output_notebook()                       # send Bokeh output to the notebook

# Reuse the SparkContext the kernel already holds, or start one, so this cell
# does not depend on a pre-bound `sc` after a kernel restart.
sc = pyspark.SparkContext.getOrCreate()

# No import in this cell binds the bare name SQLContext, so reach it through
# the pyspark package that is imported above.
sql = pyspark.sql.SQLContext(sc)

In [21]:
# Extract matches.csv and transform it into a typed PySpark DataFrame

data_path = "../input/csv/"                                # path directory to input csv files
match_rdd = sc.textFile(data_path + "matches.csv")         # one RDD element per text line

match_header = match_rdd.filter(lambda l: "id,season" in l)         # RDD holding only the header line
# Complementary filter: keeps the same rows as match_rdd.subtract(match_header)
# but needs no shuffle.
match_no_header = match_rdd.filter(lambda l: "id,season" not in l)


def _parse_match_line(line):
    """Convert one CSV text line into a tuple of its first 18 fields.

    The line is parsed with csv.reader rather than str.split(',') so that a
    quoted field containing a comma stays a single field instead of shifting
    every later column. (The leading '"' that getCleanRange strips from venue
    values suggests the file has such fields -- TODO confirm against the data.)

    Fields 0, 11 and 12 are converted to int, field 3 to a date, and field 9
    to a bool that is True when the text is '1'; all other fields stay strings.
    Raises IndexError if the line has fewer than 18 fields and ValueError if
    an int field is not numeric.
    """
    p = next(csv.reader([line]))
    return (int(p[0]), p[1], p[2], parse(p[3]).date(), p[4],
            p[5], p[6], p[7], p[8], p[9] == '1', p[10], int(p[11]),
            int(p[12]), p[13], p[14], p[15], p[16], p[17])


match_temp_rdd = match_no_header.map(_parse_match_line)             # typed row tuples

# Column names come from the header line of the file.
match_df = sql.createDataFrame(match_temp_rdd, match_rdd.first().split(','))  # converting to PysparkDF
match_df = match_df.orderBy(match_df.id.asc())                                # asc sort by id

In [22]:
def getCleanRange(tmp_list, sort_req):
    """Return a copy of ``tmp_list`` with one leading double quote removed per item.

    Parameters
    ----------
    tmp_list : iterable of str
        Values to clean; the input itself is not modified.
    sort_req : truthy/falsy
        When truthy, the cleaned list is sorted in ascending order.

    Returns
    -------
    list of str
        Items that start with '"' lose that first character; every other
        item, including the empty string, is passed through unchanged.
    """
    # startswith() is safe on empty strings, where item[0] would raise IndexError.
    item_range = [item[1:] if item.startswith('"') else item for item in tmp_list]
    if sort_req:
        item_range.sort()
    return item_range

def getRange(season, attr, distinct_req, sort_req):
    """Collect one column of ``match_df`` for a season as a list of strings.

    Parameters
    ----------
    season : value compared for equality with ``match_df.season``
    attr : str
        Name of the column to select.
    distinct_req : truthy/falsy
        When truthy, duplicate values are removed.
    sort_req : truthy/falsy
        When truthy, the values are ordered by ``attr`` before collection.

    Returns
    -------
    list of str
        ``str()`` of each selected value, collected to the driver.
    """
    season_values = match_df.filter(match_df.season == season).select(attr)

    if distinct_req:
        season_values = season_values.distinct()

    if sort_req:
        season_values = season_values.orderBy(attr)

    return season_values.rdd.map(lambda row: str(row[0])).collect()

def displaySeasonOverview(season_num, city_range=None, date_range=None, src=None):
    """Show the season-overview plot: one rectangle per row of ``src``.

    Parameters
    ----------
    season_num : str or int
        Season label appended to the plot title.
    city_range : list of str, optional
        Categories for the x axis.
    date_range : list of str, optional
        Categories for the y axis; they are plotted in reversed order.
    src : ColumnDataSource, optional
        Data for the rectangles and tooltips. The plot reads the columns
        cities, dates, type_color, team1, team2, venues, winners and
        player_of_match.

    Any of the three optional arguments left as None is looked up under the
    same name among the notebook globals (the legacy calling convention), and
    a KeyError is raised if no such global exists.

    Returns
    -------
    The notebook handle returned by ``show(..., notebook_handle=True)``.
    """
    # Legacy fallback: callers that pass only season_num rely on globals.
    if city_range is None:
        city_range = globals()["city_range"]
    if date_range is None:
        date_range = globals()["date_range"]
    if src is None:
        src = globals()["src"]

    # str() keeps the title concatenation valid for a non-string season.
    figure_season_overview = figure(title="Season Overview : " + str(season_num), tools="hover, save",
               x_range=city_range, y_range=list(reversed(date_range)))
    figure_season_overview.plot_width = 700
    figure_season_overview.plot_height = 700
    figure_season_overview.xaxis.major_label_orientation = 45
    figure_season_overview.xaxis.axis_label = 'Stadium Cities'
    figure_season_overview.yaxis.axis_label = 'Dates'

    # One cell per (city, date) pair, coloured by the source's type_color column.
    figure_season_overview.rect("cities","dates", 0.9, 0.9, source=src, fill_alpha=0.6, color="type_color")

    figure_season_overview.select_one(HoverTool).tooltips = [
                ("Date", "@dates"),
                ("Team1", "@team1"),
                ("Team2", "@team2"),
                ("Venue", "@venues"),
                ("City", "@cities"),
                ("Winner", "@winners"),
                ("Man of the match","@player_of_match")
            ]

    return show(figure_season_overview, notebook_handle=True)

def getSeasonOverview(season_num):
    """Build the plot data for one season and display its overview.

    Collects the season's rows from ``match_df`` once, turns each column into
    a list of strings, wraps them in a ColumnDataSource and passes that,
    together with the distinct sorted dates and cities used as axis
    categories, to ``displaySeasonOverview``.

    Parameters
    ----------
    season_num : str
        Season to show; compared for equality with ``match_df.season``.

    Returns
    -------
    None
    """
    # Axis categories; getRange already returns strings.
    date_range = getRange(season_num, "date", 1, 1)
    city_range = getRange(season_num, "city", 1, 1)

    colorMap = {
        ''                              : '#000000',
        'Chennai Super Kings'           : '#EED200',
        'Deccan Chargers'               : '#EA290B',
        'Delhi Daredevils'              : '#0043A8',
        'Gujarat Lions'                 : '#9467BD',
        'Kings XI Punjab'               : '#DB0033',
        'Kochi Tuskers Kerala'          : '#E377C2',
        'Kolkata Knight Riders'         : '#6600DE',
        'Mumbai Indians'                : '#0092CD',
        'Pune Warriors'                 : '#BCBD22',
        'Rajasthan Royals'              : '#B19237',
        'Rising Pune Supergiants'       : '#BCBD22',
        'Royal Challengers Bangalore'   : '#4FC730',
        'Sunrisers Hyderabad'           : '#EA290B'
    }
    # Colour for a winner value missing from colorMap, so an unlisted team
    # name does not abort the plot with a KeyError.
    unknown_winner_color = '#7F7F7F'

    # Run the season filter once; every column below is read from these rows,
    # which also keeps all columns aligned row by row.
    season_rows = match_df.filter(match_df.season == season_num).collect()

    def column(attr):
        """Return the ``attr`` value of every season row as a string."""
        return [str(row[attr]) for row in season_rows]

    winners = column("winner")

    src = ColumnDataSource(
        data=dict(
            dates = column("date"),
            venues = getCleanRange(column("venue"), 0),
            cities = column("city"),
            team1 = column("team1"),
            team2 = column("team2"),
            toss_winner = column("toss_winner"),
            toss_decision = column("toss_decision"),
            result = column("result"),
            winners = winners,
            win_by_runs = column("win_by_runs"),
            win_by_wickets = column("win_by_wickets"),
            player_of_match = column("player_of_match"),
            umpire1 = column("umpire1"),
            umpire2 = column("umpire2"),
            umpire3 = column("umpire3"),
            type_color = [colorMap.get(winner, unknown_winner_color) for winner in winners],
        )
    )

    # Pass the plot inputs explicitly; they are locals of this function and
    # are not visible to displaySeasonOverview otherwise.
    displaySeasonOverview(season_num, city_range, date_range, src)


In [24]:
# Render the overview plot for the 2008 season; the season is passed as a string.
getSeasonOverview("2008")