In [1]:
import pandas as pd
import glob


csv_files = glob.glob("WorldCup_Stats/*.csv") 
dataframes = [pd.read_csv(file) for file in csv_files]


crick_df = pd.concat(dataframes, ignore_index=True)


crick_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,date,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,commentary_line,world_cup_year,host_country
0,0,6,,Manchester,League-Match,CAN,ENG,45.0,0.0,46.0,2.0,England won by 8 wickets (with 277 balls remai...,Chris Old,,,,1979,England
1,1,2,,Lord's,League-Match,AUS,ENG,159.0,9.0,160.0,4.0,England won by 6 wickets (with 77 balls remain...,Graham Gooch,,,,1979,England
2,2,8,1979-06-16,Manchester,League-Match,SL,IND,238.0,5.0,191.0,0.0,Sri Lanka won by 47 runs,Duleep Mendis,,,,1979,England
3,3,5,,Nottingham,League-Match,PAK,AUS,286.0,7.0,197.0,0.0,Pakistan won by 89 runs,Asif Iqbal,,,,1979,England
4,4,10,,Birmingham,League-Match,CAN,AUS,105.0,0.0,106.0,3.0,Australia won by 7 wickets (with 204 balls rem...,Alan Hurst,,,,1979,England


Assume null recodes as which contains all columns are null

In [2]:
#Data exploration
print(crick_df.info())
print(crick_df.describe())

# Remove duplicate records & Remove rows with null values
crick_df.drop_duplicates(inplace=True)
crick_df.dropna(how='all' ,inplace=True)

crick_df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 528 entries, 0 to 527
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0.1     528 non-null    int64  
 1   Unnamed: 0       528 non-null    int64  
 2   date             364 non-null    object 
 3   venue            528 non-null    object 
 4   match_category   528 non-null    object 
 5   team_1           528 non-null    object 
 6   team_2           528 non-null    object 
 7   team_1_runs      518 non-null    float64
 8   team_1_wickets   518 non-null    float64
 9   team_2_runs      513 non-null    float64
 10  team_2_wickets   513 non-null    float64
 11  result           528 non-null    object 
 12  pom              510 non-null    object 
 13  best_batters     250 non-null    object 
 14  best_bowlers     250 non-null    object 
 15  commentary_line  83 non-null     object 
 16  world_cup_year   528 non-null    int64  
 17  host_country    

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,date,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,commentary_line,world_cup_year,host_country
0,0,6,,Manchester,League-Match,CAN,ENG,45.0,0.0,46.0,2.0,England won by 8 wickets (with 277 balls remai...,Chris Old,,,,1979,England
1,1,2,,Lord's,League-Match,AUS,ENG,159.0,9.0,160.0,4.0,England won by 6 wickets (with 77 balls remain...,Graham Gooch,,,,1979,England
2,2,8,1979-06-16,Manchester,League-Match,SL,IND,238.0,5.0,191.0,0.0,Sri Lanka won by 47 runs,Duleep Mendis,,,,1979,England
3,3,5,,Nottingham,League-Match,PAK,AUS,286.0,7.0,197.0,0.0,Pakistan won by 89 runs,Asif Iqbal,,,,1979,England
4,4,10,,Birmingham,League-Match,CAN,AUS,105.0,0.0,106.0,3.0,Australia won by 7 wickets (with 204 balls rem...,Alan Hurst,,,,1979,England
5,5,11,,Leeds,League-Match,ENG,PAK,165.0,9.0,151.0,0.0,England won by 14 runs,Mike Hendrick,,,,1979,England
6,6,13,,The Oval,Semi-Final,WI,PAK,293.0,6.0,250.0,0.0,West Indies won by 43 runs,Gordon Greenidge,,,,1979,England
7,7,6,,Manchester,League-Match,CAN,ENG,45.0,0.0,46.0,2.0,England won by 8 wickets (with 277 balls remai...,Chris Old,,,,1979,England
8,8,12,1979-06-20,Manchester,Semi-Final,ENG,NZ,221.0,8.0,212.0,9.0,England won by 9 runs,Graham Gooch,,,,1979,England
9,9,1,,Nottingham,League-Match,SL,NZ,189.0,0.0,190.0,1.0,New Zealand won by 9 wickets (with 74 balls re...,Geoff Howarth,,,,1979,England


This is to clean the data by removing matches with abnormally high or low runs scored by either team, as these might be errors or extreme anomalies that could skew the analysis.

In [3]:
#Removing outliers for numerical columns
from scipy.stats import zscore

num_cols = ['team_1_runs', 'team_2_runs']
crick_df = crick_df[(zscore(crick_df[num_cols]) < 3).all(axis=1)]

crick_df.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,date,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,commentary_line,world_cup_year,host_country


In [33]:
#Adding new columns to the DataFrame: match_status 
crick_df['match_status'] = crick_df['result'].apply(
    lambda x: 'abandoned' if 'abandoned' in x.lower() else 'played'
)

crick_df.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,date,venue,match_category,team_1,team_2,team_1_runs,team_1_wickets,team_2_runs,team_2_wickets,result,pom,best_batters,best_bowlers,commentary_line,world_cup_year,host_country,match_status
326,0,3,2011-02-21,Ahmedabad,League-Match,AUS,ZIM,262.0,6.0,171.0,0.0,Australia won by 91 runs,Shane Watson,"['SR Watson - 79 runs ', 'MJ Clarke - 58 runs ']","['MG Johnson - 4', 'JJ Krejza - 2']",The presentation ceremony is about to begin.,2011,India,played
327,1,48,2011-04-02,Wankhede,Final,SL,IND,274.0,6.0,277.0,4.0,India won by 6 wickets (with 10 balls remaining),MS Dhoni,"['DPMD Jayawardene - 103 runs ', 'G Gambhir - ...","['SL Malinga - 2', 'Yuvraj Singh - 2']",Now Yusuf Pathan and Raina carry Tendulkar. Th...,2011,India,played
328,2,28,2011-03-12,Nagpur,League-Match,IND,SA,296.0,0.0,300.0,7.0,South Africa won by 3 wickets (with 2 balls re...,Dale Steyn,"['SR Tendulkar - 111 runs ', 'V Sehwag - 73 ru...","['DW Steyn - 5', 'Harbhajan Singh - 3']","Robin Peterson: ""I'm extremely ecstatic at the...",2011,India,played
329,3,40,2011-03-20,Eden Gardens,League-Match,ZIM,KENYA,308.0,6.0,147.0,0.0,Zimbabwe won by 161 runs,Craig Ervine,"['CR Ervine - 66 runs ', 'V Sibanda - 61 runs ']","['RW Price - 2', 'GA Lamb - 2']","Steve Tikolo: ""We are disappointed. We wanted ...",2011,India,played
332,6,42,2011-03-23,Mirpur,Final,WI,PAK,112.0,0.0,113.0,0.0,Pakistan won by 10 wickets (with 175 balls rem...,Mohammad Hafeez,"['Mohammad Hafeez - 61 runs ', 'Kamran Akmal -...","['Shahid Afridi - 4', 'Mohammad Hafeez - 2']",'Clinical Pakistan storm into semi-final' is t...,2011,Bangladesh,played
338,12,45,2011-03-26,Colombo (RPS),Final,ENG,SL,229.0,6.0,231.0,0.0,Sri Lanka won by 10 wickets (with 63 balls rem...,Tillakaratne Dilshan,"['TM Dilshan - 108 runs ', 'WU Tharanga - 102 ...","['M Muralidaran - 2', 'BAW Mendis - 1']","Kumar Sangakkara ""Losing the toss it was an im...",2011,Sri Lanka,played
341,15,43,2011-03-24,Ahmedabad,Final,AUS,IND,260.0,6.0,261.0,5.0,India won by 5 wickets (with 14 balls remaining),Yuvraj Singh,"['RT Ponting - 104 runs ', 'Yuvraj Singh - 57 ...","['Yuvraj Singh - 2', 'R Ashwin - 2']",After hurtling through the group stages unconv...,2011,India,played
342,16,7,2011-02-25,Nagpur,League-Match,NZ,AUS,206.0,0.0,207.0,3.0,Australia won by 7 wickets (with 96 balls rema...,Mitchell Johnson,"['SR Watson - 62 runs ', 'BJ Haddin - 55 runs ']","['MG Johnson - 4', 'SW Tait - 3']","Ponting: ""Good game for us today. NZ are a ver...",2011,India,played
343,17,38,2011-03-19,Mirpur,League-Match,SA,BAN,284.0,8.0,78.0,0.0,South Africa won by 206 runs,Lonwabo Tsotsobe,"['JH Kallis - 69 runs ', 'F du Plessis - 52 ru...","['RJ Peterson - 4', 'LL Tsotsobe - 3']",Please do wait for the quotes from the post-ma...,2011,Bangladesh,played
347,21,46,2011-03-29,Colombo (RPS),Semi-Final,NZ,SL,217.0,0.0,220.0,5.0,Sri Lanka won by 5 wickets (with 13 balls rema...,Kumar Sangakkara,"['TM Dilshan - 73 runs ', 'SB Styris - 57 runs ']","['BAW Mendis - 3', 'TG Southee - 3']","As for the big match tomorrow, there's some ba...",2011,Sri Lanka,played
