In [168]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Cricket-Prediction")
    .getOrCreate()
)

# Folder holding the processed input CSV files.
directory = r'D:\github\Cricket-Prediction\data\2_processedData'
teams_csv = directory + r'\teamStats.csv'
matches_csv = directory + r'\matches.csv'

# Read both inputs with a header row and inferred column types.
teams = spark.read.csv(teams_csv, header=True, inferSchema=True)
matches = spark.read.csv(matches_csv, header=True, inferSchema=True)

# Quick look at the team statistics.
teams.show(5)

+-----------+-------+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|       Team| Season|Cumulative Won|Cumulative Lost|Cumulative Tied|Cumulative NR|Cumulative W/L|Cumulative AveRPW|Cumulative AveRPO|
+-----------+-------+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|Afghanistan|2009/10|             0|              0|              0|            0|           0.0|              0.0|              0.0|
|Afghanistan|   2010|             4|              2|              0|            0|           2.0|            20.22|             6.89|
|Afghanistan|2011/12|             4|              4|              0|            0|           1.0|            17.87|             6.52|
|Afghanistan|2012/13|             6|              5|              0|            0|           1.2|            19.18|             6.92|
|Afghanistan|2013/14|             8|              7|          

In [169]:
# Distinct team names found in the team-statistics table.
# Rows are indexed by position so the lookup does not depend on column-name casing.
tdt = [row[0] for row in teams.select("Team").distinct().collect()]

# Distinct team names found in the matches table, on EITHER side of a fixture.
# Using Team1 alone would flag a team as missing from the matches whenever it
# only ever appears as Team2, and such a team would then be filtered out later.
match_team_names = matches.select("Team1").union(matches.select("Team2")).distinct()
mdt = [row[0] for row in match_team_names.collect()]

In [170]:
# Print every team name that is in `tdt` but not in `mdt`, one per line.
only_in_tdt = [name for name in tdt if name not in mdt]
for name in only_in_tdt:
    print(name)

Czech Rep.
P.N.G.
Peru
Cayman
U.S.A.
World-XI
U.A.E.
ICC World XI


In [171]:
# Print every team name that is in `mdt` but not in `tdt`, one per line.
only_in_mdt = [name for name in mdt if name not in tdt]
for name in only_in_mdt:
    print(name)

Cayman Islands
Barbados
United States of America
United Arab Emirates
Czech Republic
Papua New Guinea


In [172]:
# Spelling used in `tdt` (key) -> spelling used in `mdt` (value) for the same team.
team_name_mapping = {
    'U.S.A.': 'United States of America',
    'U.A.E.': 'United Arab Emirates',
    'Czech Rep.': 'Czech Republic',
    'P.N.G.': 'Papua New Guinea',
    'Cayman': 'Cayman Islands',
}

# Names already handled by the mapping, on each side.
mapped_tdt_names = set(team_name_mapping)
mapped_mdt_names = set(team_name_mapping.values())

# Names that exist on one side only and are not covered by the mapping.
unmatched_tdt = [
    team for team in tdt
    if not (team in mdt or team in mapped_tdt_names)
]
unmatched_mdt = [
    team for team in mdt
    if not (team in tdt or team in mapped_mdt_names)
]

print("Mapped Team Names Dictionary:", team_name_mapping)
print("Unmatched Teams in tdt:", unmatched_tdt)
print("Unmatched Teams in mdt:", unmatched_mdt)

Mapped Team Names Dictionary: {'U.S.A.': 'United States of America', 'U.A.E.': 'United Arab Emirates', 'Czech Rep.': 'Czech Republic', 'P.N.G.': 'Papua New Guinea', 'Cayman': 'Cayman Islands'}
Unmatched Teams in tdt: ['Peru', 'World-XI', 'ICC World XI']
Unmatched Teams in mdt: ['Barbados']


In [173]:
# All team names without a counterpart: `tdt` leftovers first, then `mdt` leftovers.
unmatched_teams = [*unmatched_tdt, *unmatched_mdt]
unmatched_teams

['Peru', 'World-XI', 'ICC World XI', 'Barbados']

In [174]:
# Row counts before the unmatched teams are removed.
rows_before = (teams.count(), matches.count())
print(*rows_before)

# Drop statistics rows that belong to an unmatched team.
teams = teams.where(~teams["Team"].isin(unmatched_teams))

# Drop matches in which either side is an unmatched team.
team1_is_known = ~matches["team1"].isin(unmatched_teams)
team2_is_known = ~matches["team2"].isin(unmatched_teams)
matches = matches.where(team1_is_known & team2_is_known)

# Row counts after filtering.
print(teams.count(), matches.count())

907 3683
904 3676


In [175]:
# Rewrite the team spellings listed in `team_name_mapping` (key -> value)
# in the team column of each table.
teams = teams.replace(team_name_mapping, subset=['Team'])
matches = matches.replace(team_name_mapping, subset=['team1', 'team2'])

# Flipping

In [176]:
from pyspark.sql import functions as F

# Every match is emitted twice: once as recorded (flip = 0) and once with
# team1/team2 swapped (flip = 1).

# Original orientation.
matches1 = matches.withColumn('flip', F.lit(0))

# Mirrored orientation: swap the two team columns through a temporary name.
matches2 = (
    matches
    .withColumnRenamed('team1', 'temp_team')
    .withColumnRenamed('team2', 'team1')
    .withColumnRenamed('temp_team', 'team2')
    .withColumn('flip', F.lit(1))
)

# unionByName aligns the two frames by column name. A positional union would
# need a hand-written column list mirroring the CSV column order, and would
# silently put values into the wrong columns if that order ever changed.
matchesflip = matches1.unionByName(matches2).sort('match_id')
matchesflip.show(5)


+-----------+------------+------+-------+----------+--------------------+------------+-----------+-------------+-----------+--------+----+
|      team1|       team2|gender| season|      date|               venue|        city|toss_winner|toss_decision|     winner|match_id|flip|
+-----------+------------+------+-------+----------+--------------------+------------+-----------+-------------+-----------+--------+----+
|  Australia|     England|  male|   2005|2005/06/13|       The Rose Bowl| Southampton|    England|          bat|    England|  211028|   1|
|    England|   Australia|  male|   2005|2005/06/13|       The Rose Bowl| Southampton|    England|          bat|    England|  211028|   0|
|  Australia| New Zealand|  male|2004/05|2005/02/17|           Eden Park|    Auckland|  Australia|          bat|  Australia|  211048|   1|
|New Zealand|   Australia|  male|2004/05|2005/02/17|           Eden Park|    Auckland|  Australia|          bat|  Australia|  211048|   0|
|New Zealand|South Africa| 

In [177]:
# Preview only: look up team1's statistics for the match season, then drop the
# join-key columns taken from `teams`. `matchesflip` itself is not modified here.
team1_stats_preview = matchesflip.join(
    teams,
    on=[matchesflip.team1 == teams.Team, matchesflip.season == teams.Season],
    how='inner',
).drop("Team", teams.Season)
team1_stats_preview.show(5)

+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+--------+----+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|    team1|    team2|gender| season|      date|               venue|       city|toss_winner|toss_decision|   winner|match_id|flip|Cumulative Won|Cumulative Lost|Cumulative Tied|Cumulative NR|Cumulative W/L|Cumulative AveRPW|Cumulative AveRPO|
+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+--------+----+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|Australia|Sri Lanka|  male|2016/17|2017/02/17|Melbourne Cricket...|       NULL|  Sri Lanka|        field|Sri Lanka| 1001349|   0|            46|             41|              2|            1|          1.12|            26.54|             8.36|
|Australia|Sri Lanka|  male|

In [178]:
# Attach team1's statistics for the match season (inner join: matches without a
# matching statistics row are dropped), then remove the join keys from `teams`.
team1_join_condition = [
    matchesflip.team1 == teams.Team,
    matchesflip.season == teams.Season,
]
matchesflip = (
    matchesflip
    .join(teams, on=team1_join_condition, how='inner')
    .drop("Team", teams.Season)
)

# Suffix every statistics column with " team1" so team2's columns can be added later.
team1_stat_columns = [
    "Cumulative Won",
    "Cumulative Lost",
    "Cumulative Tied",
    "Cumulative NR",
    "Cumulative W/L",
    "Cumulative AveRPW",
    "Cumulative AveRPO",
]
matchesflip = matchesflip.withColumnsRenamed(
    {column: column + " team1" for column in team1_stat_columns}
)
matchesflip.show(5)

+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+--------+----+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+
|    team1|    team2|gender| season|      date|               venue|       city|toss_winner|toss_decision|   winner|match_id|flip|Cumulative Won team1|Cumulative Lost team1|Cumulative Tied team1|Cumulative NR team1|Cumulative W/L team1|Cumulative AveRPW team1|Cumulative AveRPO team1|
+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+--------+----+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+
|Australia|Sri Lanka|  male|2016/17|2017/02/17|Melbourne Cricket...|       NULL|  Sri Lanka|        field|Sri Lanka| 1001349|   0|               

In [179]:
# Exploratory preview only (the result is not assigned): join team2's season
# statistics onto matchesflip and show the first rows.
# NOTE(review): the displayed result still contains a `Season` column, so
# `drop(..., teams.Season)` does not appear to remove it in this expression —
# presumably because matchesflip was itself already joined with `teams`; confirm.
# The following cell renames `Season` before joining, which avoids this.
matchesflip.join(teams, on=[matchesflip.team2 == teams.Team, matchesflip.season == teams.Season], how='inner').drop("Team",teams.Season).show(5)

+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+--------+----+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+-------+--------------+---------------+---------------+-------------+--------------+-----------------+-----------------+
|    team1|    team2|gender| season|      date|               venue|       city|toss_winner|toss_decision|   winner|match_id|flip|Cumulative Won team1|Cumulative Lost team1|Cumulative Tied team1|Cumulative NR team1|Cumulative W/L team1|Cumulative AveRPW team1|Cumulative AveRPO team1| Season|Cumulative Won|Cumulative Lost|Cumulative Tied|Cumulative NR|Cumulative W/L|Cumulative AveRPW|Cumulative AveRPO|
+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+--------+----+--------------------+---------------------+-----------------

In [180]:

# Give the statistics table's season column a distinct name so it can be
# referenced and dropped by name after the join.
teams_renamed = teams.withColumnRenamed("Season", "Team_Season")

# Attach team2's statistics for the match season (inner join), then remove the join keys.
team2_join_condition = [
    matchesflip.team2 == teams_renamed.Team,
    matchesflip.season == teams_renamed.Team_Season,
]
matchesflip = (
    matchesflip
    .join(teams_renamed, on=team2_join_condition, how='inner')
    .drop("Team", "Team_Season")
)

# Suffix the newly added statistics columns with " team2".
team2_stat_columns = [
    "Cumulative Won",
    "Cumulative Lost",
    "Cumulative Tied",
    "Cumulative NR",
    "Cumulative W/L",
    "Cumulative AveRPW",
    "Cumulative AveRPO",
]
matchesflip = matchesflip.withColumnsRenamed(
    {column: column + " team2" for column in team2_stat_columns}
)
matchesflip.show(5)

+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+--------+----+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+
|    team1|    team2|gender| season|      date|               venue|       city|toss_winner|toss_decision|   winner|match_id|flip|Cumulative Won team1|Cumulative Lost team1|Cumulative Tied team1|Cumulative NR team1|Cumulative W/L team1|Cumulative AveRPW team1|Cumulative AveRPO team1|Cumulative Won team2|Cumulative Lost team2|Cumulative Tied team2|Cumulative NR team2|Cumulative W/L team2|Cumulative AveRPW team2|Cumulative AveRPO team2|
+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+------

In [181]:
# Encode gender as an integer: "male" -> 0, every other value -> 1.
is_male = matchesflip["gender"] == "male"
gender_code = F.when(is_male, 0).otherwise(1).cast("int")

matchesflip = matchesflip.withColumn("gender", gender_code)
matchesflip.show(5)

+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+--------+----+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+
|    team1|    team2|gender| season|      date|               venue|       city|toss_winner|toss_decision|   winner|match_id|flip|Cumulative Won team1|Cumulative Lost team1|Cumulative Tied team1|Cumulative NR team1|Cumulative W/L team1|Cumulative AveRPW team1|Cumulative AveRPO team1|Cumulative Won team2|Cumulative Lost team2|Cumulative Tied team2|Cumulative NR team2|Cumulative W/L team2|Cumulative AveRPW team2|Cumulative AveRPO team2|
+---------+---------+------+-------+----------+--------------------+-----------+-----------+-------------+---------+------

In [186]:
# Keep the identifiers, the gender code and both teams' statistics;
# the remaining match columns are left out of the result.
cumulative_stats = [
    "Cumulative Won",
    "Cumulative Lost",
    "Cumulative Tied",
    "Cumulative NR",
    "Cumulative W/L",
    "Cumulative AveRPW",
    "Cumulative AveRPO",
]
selected_columns = (
    ["match_id", "flip", "gender"]
    + [stat + " team1" for stat in cumulative_stats]
    + [stat + " team2" for stat in cumulative_stats]
)

# Order rows by match, with the unflipped row (flip = 0) first.
matchesflip = matchesflip.select(*selected_columns).sort("match_id", 'flip')
matchesflip.show(5)

+--------+----+------+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+
|match_id|flip|gender|Cumulative Won team1|Cumulative Lost team1|Cumulative Tied team1|Cumulative NR team1|Cumulative W/L team1|Cumulative AveRPW team1|Cumulative AveRPO team1|Cumulative Won team2|Cumulative Lost team2|Cumulative Tied team2|Cumulative NR team2|Cumulative W/L team2|Cumulative AveRPW team2|Cumulative AveRPO team2|
+--------+----+------+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+--------------------+---------------------+---------------------+-------------------+--------------------+-----------------------+-----------------------+
|  2110

In [187]:
# Folder for the merged output.
directory = r'D:\github\Cricket-Prediction\data\3_aftermerging'
output_csv = directory + r'\team12Statsflip.csv'

# Collect the Spark result into pandas and write it as a single CSV without the index column.
merged_pdf = matchesflip.toPandas()
merged_pdf.to_csv(output_csv, index=False)