In [1]:
import pandas as pd
from pyspark.ml.recommendation import ALS
from pyspark.sql.session import SparkSession
import math
from sklearn import preprocessing

In [2]:
# Load the per-delivery records (one row per ball bowled) into a pandas DataFrame.
data_delivery = pd.read_csv(
    filepath_or_buffer='/Users/swayam/Documents/Projects/Data_Mining/Data/Delivery_Info.csv'
)

In [3]:
# Preview the first five rows (head() defaults to n=5).
data_delivery.head()

Unnamed: 0,Match_ID,Innings,Batting_Team,Bowling_Team,Over,Ball,Batsman,Non_Striker,Bowler,Is_Super_Over,...,Bye_Runs,Legbye_Runs,Noball_Runs,Penalty_Runs,Batsman_Runs,Extras,Total_Runs,Player_Dismissed,Dismissal_Type,Fielder
0,1,1,Pune Warriors,Delhi Daredevils,1,1,RV Uthappa,AJ Finch,IK Pathan,0,...,0,0,0,0,0,0,0,,,
1,1,1,Pune Warriors,Delhi Daredevils,1,2,RV Uthappa,AJ Finch,IK Pathan,0,...,0,0,0,0,2,0,2,,,
2,1,1,Pune Warriors,Delhi Daredevils,1,3,RV Uthappa,AJ Finch,IK Pathan,0,...,0,0,0,0,4,0,4,,,
3,1,1,Pune Warriors,Delhi Daredevils,1,4,RV Uthappa,AJ Finch,IK Pathan,0,...,0,0,0,0,0,0,0,,,
4,1,1,Pune Warriors,Delhi Daredevils,1,5,RV Uthappa,AJ Finch,IK Pathan,0,...,0,0,0,0,1,0,1,,,


## Creating a dataframe with the available batsman and bowler data

In [4]:
# Fit a single label encoder over every player name that appears in any role,
# so batsman ids and bowler ids come from the same id space.
player_columns = ['Batsman', 'Non_Striker', 'Bowler', 'Fielder']
# Fielder is empty on most deliveries; drop the missing values so that NaN is
# not handed to the encoder as if it were a player.
all_players = set(
    pd.concat([data_delivery[name] for name in player_columns], axis=0).dropna()
)
le = preprocessing.LabelEncoder()
le.fit(list(all_players))

# Count the deliveries recorded for each (batsman, bowler) pair in one grouped
# pass. Rows come out ordered by batsman and then bowler; only pairs that occur
# in the data are produced.
temp = (
    data_delivery
    .groupby(['Batsman', 'Bowler'])
    .size()
    .reset_index(name='Balls')
    .rename(columns={'Bowler': 'Boller'})
)

# temp keeps the readable names; df carries the integer ids of the same pairs.
df = temp.copy(deep=True)
df.Batsman = le.transform(df.Batsman)
df.Boller = le.transform(df.Boller)
df.head(5)

Unnamed: 0,Batsman,Boller,Balls
0,0,11,9
1,0,24,7
2,0,34,12
3,0,36,3
4,0,73,2


## Find missing pairs

Create a dataframe with bowlers as rows and batsmen as columns. Replace the NaN values with 0, then iterate over it to find the pairs whose value is 0, i.e. the batsman–bowler pairs with no recorded deliveries.

In [5]:
# Bowler ids down the rows, batsman ids across the columns, ball counts in the
# cells. A pair with no row in df has no cell to fill, so pivot leaves NaN
# there; those gaps are stored as 0.
data_sheet = (
    df
    .pivot(index='Boller', columns='Batsman', values='Balls')
    .fillna(value=0)
)
data_sheet.head()

Batsman,0,1,2,3,4,5,6,7,8,9,...,612,613,614,615,616,617,618,619,620,621
Boller,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


In [6]:
# Collect every (batsman id, bowler id) pair whose cell in data_sheet is 0,
# i.e. pairs with no recorded deliveries. Pairs where both ids are the same
# player are skipped.
missing_combination = pd.DataFrame.from_records(
    [
        [batsman_id, boller_id]
        for batsman_id in data_sheet.columns
        # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
        for boller_id, balls in data_sheet[batsman_id].items()
        if boller_id != batsman_id and balls == 0
    ],
    columns=['Batsman', 'Boller'],
)
missing_combination.head(5)

Unnamed: 0,Batsman,Boller
0,0,1
1,0,3
2,0,4
3,0,5
4,0,6


## Start a Spark session to run collaborative filtering with ALS

In [7]:
# Reuse the active Spark session if one exists; otherwise create one under
# this application name.
spark = (
    SparkSession.builder
    .appName("CF_Boller_Batsman")
    .getOrCreate()
)

In [8]:
# Convert the encoded (Batsman, Boller, Balls) pandas frame into a Spark
# DataFrame so it can be used to fit the ALS model.
rdd_df = spark.createDataFrame(data=df)

In [9]:
# Fit an ALS matrix factorisation with batsman ids as users, bowler ids as
# items and the ball count as the rating.
als = ALS(
    maxIter=1,
    regParam=0.085,
    userCol="Batsman",
    itemCol="Boller",
    ratingCol="Balls",
    # Ids the model did not see during fit get a NaN prediction rather than
    # being dropped from the output.
    coldStartStrategy="nan",
    nonnegative=True,
    # Explicit seed so the random initialisation does not change from one
    # kernel session to the next.
    seed=42,
)
model = als.fit(rdd_df)

In [10]:
# Score every pair that has no recorded deliveries with the fitted model, then
# bring the predictions back into pandas for the export step.
rdd_missing_data = spark.createDataFrame(missing_combination)
predictions = model.transform(rdd_missing_data)
missing_result = predictions.toPandas()
missing_result

Unnamed: 0,Batsman,Boller,prediction
0,463,148,0.009313
1,471,148,5.190045
2,496,148,3.072480
3,392,148,0.403013
4,540,148,2.455387
...,...,...,...
187518,208,422,1.542497
187519,315,422,0.060913
187520,89,422,1.935444
187521,401,422,13.542753


In [11]:
# Stop the Spark session; the predictions were already copied into pandas
# (missing_result) by the previous cell.
spark.stop()

## Write the data to a CSV file

In [12]:
# Build the decoded frame under a new name instead of overwriting
# missing_result. If missing_result were overwritten with player names,
# re-running this cell would pass names to inverse_transform, which expects
# the integer ids.
missing_named = missing_result.sort_values('Batsman')
missing_named = missing_named.assign(
    Batsman=le.inverse_transform(missing_named.Batsman),
    Boller=le.inverse_transform(missing_named.Boller),
)

# Stack the predicted pairs (with a 'prediction' column) on top of the observed
# pairs in temp (with a 'Balls' column). A column absent from one of the two
# frames is filled with NaN for that frame's rows; sort=True orders the columns
# alphabetically.
result = pd.concat([missing_named, temp], axis=0, sort=True)
result.to_csv('/Users/swayam/Documents/Projects/Data_Mining/Data/boller_batsman_data.csv', index=False)