First, we import the libraries we need. 

In [176]:
import pandas as pd
import os
import numpy as np
from scipy import stats
from sklearn import linear_model

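We iterate through every match in our folder (each match is stored as a CSV file) and build a DataFrame containing the relevant data for that match. The function that does this is process_csv_all(); it relies on a helper, clean_ball_notation(), defined first, which rebuilds the ball numbering of a match.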

In [177]:
def clean_ball_notation(match):
    """Rewrite the ``ball_notation`` column of a ball-by-ball match table.

    Every row receives a label taken from the 120 slots ``0.1`` ... ``19.6``
    (six per over).  A slot pointer starts at ``0.1`` and, after a row has
    been labelled, advances to the next slot when

    * the row records a dismissal, or
    * the "odd score vs. strike unchanged" check and the "bowler changes on
      the next row" check agree (see the inline comments).  Presumably this
      separates deliveries that count towards the over from those that do
      not -- confirm against the data source.

    The pointer is reset to the first slot when it is greater than 1 and the
    row's ``innings`` value differs from the previous row's.

    Args:
        match: DataFrame with the columns ``innings``, ``runs_scored``,
            ``extras_scored``, ``batsman_on_strike``, ``batsman_off_strike``,
            ``bowler`` and ``type_of_dismissal``.  Neighbouring rows are
            looked up by ``index - 1`` / ``index + 1``, so the index is
            assumed to consist of consecutive integers.

    Returns:
        The same DataFrame object, with ``ball_notation`` replaced by the new
        labels (stored as strings).

    Raises:
        IndexError: if more than 120 slots are needed within one innings.
    """
    # Label slots 0.1, 0.2, ..., 0.6, 1.1, ..., 19.6 in bowling order.
    ball_lst = [
        over + fraction
        for over in range(20)
        for fraction in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6)
    ]

    # The final row has no successor to compare against, so it is only
    # labelled; remember its index label up front.
    last_index = match.index[-1] if len(match) else 0

    new_ball_notation = []
    ball_counter = 0
    for index, row in match.iterrows():
        # A change of innings restarts the numbering.
        if ball_counter > 1 and row["innings"] != match["innings"][index - 1]:
            ball_counter = 0

        new_ball_notation.append(str(ball_lst[ball_counter]))

        if index == last_index:
            break

        # A dismissal always moves on to the next slot.
        if not pd.isna(row["type_of_dismissal"]):
            ball_counter += 1
            continue

        odd_score = (row["runs_scored"] + row["extras_scored"]) % 2 == 1
        same_striker = (
            row["batsman_on_strike"] == match["batsman_on_strike"][index + 1]
        )
        same_nonstriker = (
            row["batsman_off_strike"] == match["batsman_off_strike"][index + 1]
        )
        strike_unchanged = same_striker or same_nonstriker
        new_over = row["bowler"] != match["bowler"][index + 1]

        # Advance when both checks agree: either the score parity matches the
        # strike staying put AND the bowler changes, or the parity does not
        # match AND the same bowler continues.
        if (odd_score == strike_unchanged) == new_over:
            ball_counter += 1

    match["ball_notation"] = new_ball_notation
    return match

In [178]:
def process_csv_all(
    folder_path=r"/Users/suchirjoshi/Desktop/Cricket Analytics/T20 + Major Leagues folder/",
):
    """Load the match CSV files in a folder as a list of DataFrames.

    Each ``.csv`` file is read without a header row into the eleven columns
    named below, passed through ``clean_ball_notation`` and then stripped of
    rows that have fewer than four non-missing values.

    Args:
        folder_path: Directory containing the match files.  Defaults to the
            folder used throughout this notebook.

    Returns:
        A list with one DataFrame per processed CSV file, in directory
        listing order.
    """
    column_names = [
        "type", "innings", "ball_notation", "team_batting",
        "batsman_on_strike", "batsman_off_strike", "bowler",
        "runs_scored", "extras_scored", "type_of_dismissal",
        "person_of_dismissal",
    ]

    files = os.listdir(folder_path)
    # NOTE(review): the last directory entry is skipped, exactly as before.
    # os.listdir() returns entries in arbitrary order, so this may drop a
    # real match file -- confirm whether that is intended.
    folder = files[:-1]

    csvs = []
    for file in folder:
        if not file.endswith('.csv'):
            continue

        cleaned_file = pd.read_csv(
            os.path.join(folder_path, file),
            names=column_names,
            engine='python',
        )

        try:
            cleaned_file = clean_ball_notation(cleaned_file)
        except IndexError:
            # Best effort: a match that cannot be renumbered is kept with
            # the ball notation it was read with.
            pass

        csvs.append(cleaned_file.dropna(thresh=4))

    return csvs

Now, we call process_csv_all() to generate this list of dataframes. 

In [179]:
# One DataFrame per match file, as returned by process_csv_all().
list_of_matches = process_csv_all()

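The add_cols() function below adds some derived feature columns to a given match table. It uses the helper convert(), which turns a ball notation such as "1.2" into the number of balls bowled so far.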

In [180]:
def convert(notation):
    """Turn an ``overs.balls`` notation into a count of balls.

    The integer part is taken as completed overs (six balls each) and the
    first decimal digit as the ball within the current over, so ``"1.2"``
    becomes ``8.0``.

    Args:
        notation: Anything ``float()`` accepts, e.g. ``"0.4"`` or ``3.1``.

    Returns:
        The ball count as a float.
    """
    value = float(notation)
    # Shift the decimal digit in front of the point, then keep only it.
    ball_in_over = (value * 10) % 10
    completed_overs = int(value)
    return (completed_overs * 6) + ball_in_over

In [187]:
def _innings_running_features(innings):
    """Compute the per-ball running features for the rows of one innings.

    Returns five equally long lists, one entry per row: runs since the last
    wicket, cumulative runs, cumulative wickets, balls done (via
    ``convert``) and the innings' final total repeated for every row.
    """
    since_wicket = []
    runs_so_far = []
    wickets_so_far = []
    balls_done = []

    partnership = 0
    innings_runs = 0
    wickets = 0
    for dismissal, scored, extras, notation in zip(
        innings["type_of_dismissal"],
        innings["runs_scored"],
        innings["extras_scored"],
        innings["ball_notation"],
    ):
        innings_runs += scored
        innings_runs += extras

        if pd.isna(dismissal):
            partnership += scored
            partnership += extras
        else:
            # A dismissal restarts the "since last wicket" tally at zero.
            partnership = 0
            wickets += 1

        since_wicket.append(partnership)
        runs_so_far.append(innings_runs)
        wickets_so_far.append(wickets)
        balls_done.append(convert(notation))

    # Broadcast the final total over every row (empty innings -> empty list).
    final_total = [runs_so_far[-1] for _ in runs_so_far]
    return since_wicket, runs_so_far, wickets_so_far, balls_done, final_total


def add_cols(match):
    """Add running-score feature columns to a match table, in place.

    Rows whose ``innings`` value is ``"1"`` are processed first, then those
    with ``"2"``; all tallies restart for the second innings.  The columns
    added are:

    * ``Runs since last wicket`` -- runs (including extras) accumulated
      since the most recent dismissal row; 0 on a dismissal row.
    * ``Total runs`` -- cumulative runs (including extras) in the innings.
    * ``Innings balls done`` -- ``convert`` applied to ``ball_notation``.
    * ``Total Inings Score`` -- the innings' final ``Total runs`` value.
    * ``Total Wickets`` -- dismissals so far in the innings.

    Args:
        match: DataFrame with the columns ``innings``, ``runs_scored``,
            ``extras_scored``, ``type_of_dismissal`` and ``ball_notation``.

    Returns:
        The same DataFrame with the new columns, or ``None`` when the rows
        of innings "1" and "2" do not account for every row of ``match`` or
        when any error occurs while computing the features.
    """
    try:
        runs = []
        runs_so_far = []
        wickets_so_far = []
        balls_done = []
        total_score = []

        for innings_label in ("1", "2"):
            innings_rows = match.loc[match["innings"] == innings_label]
            features = _innings_running_features(innings_rows)
            runs.extend(features[0])
            runs_so_far.extend(features[1])
            wickets_so_far.extend(features[2])
            balls_done.extend(features[3])
            total_score.extend(features[4])

        # All five lists grow in lockstep, so one length check covers them.
        # A mismatch means the table holds rows outside innings "1"/"2".
        if len(runs) != len(match):
            return None

        match["Runs since last wicket"] = runs
        match["Total runs"] = runs_so_far
        match["Innings balls done"] = balls_done
        match["Total Inings Score"] = total_score
        match["Total Wickets"] = wickets_so_far
        return match

    except Exception:
        # Deliberately best effort: a table that cannot be processed is
        # reported as None so that callers can filter it out.
        return None

In [188]:
# Spot-check add_cols on a single match. Note that position 6 is the
# seventh entry of the list, despite the variable name.
sixth_match = list_of_matches[6]
add_cols(sixth_match)

Unnamed: 0,type,innings,ball_notation,team_batting,batsman_on_strike,batsman_off_strike,bowler,runs_scored,extras_scored,type_of_dismissal,person_of_dismissal,Runs since last wicket,Total runs,Innings balls done,Total Inings Score,Total Wickets
21,ball,1,0.1,India,G Gambhir,RG Sharma,JE Taylor,0.0,0.0,,,0.0,0.0,1.0,153.0,0
22,ball,1,0.2,India,G Gambhir,RG Sharma,JE Taylor,4.0,0.0,,,4.0,4.0,2.0,153.0,0
23,ball,1,0.3,India,G Gambhir,RG Sharma,JE Taylor,0.0,0.0,,,4.0,4.0,3.0,153.0,0
24,ball,1,0.4,India,G Gambhir,RG Sharma,JE Taylor,1.0,0.0,,,5.0,5.0,4.0,153.0,0
25,ball,1,0.5,India,RG Sharma,G Gambhir,JE Taylor,1.0,0.0,,,6.0,6.0,5.0,153.0,0
26,ball,1,0.6,India,G Gambhir,RG Sharma,JE Taylor,2.0,0.0,,,8.0,8.0,6.0,153.0,0
27,ball,1,1.1,India,RG Sharma,G Gambhir,FH Edwards,4.0,0.0,,,12.0,12.0,7.0,153.0,0
28,ball,1,1.2,India,RG Sharma,G Gambhir,FH Edwards,0.0,0.0,caught,RG Sharma,0.0,12.0,8.0,153.0,1
29,ball,1,1.3,India,G Gambhir,SK Raina,FH Edwards,0.0,0.0,,,0.0,12.0,9.0,153.0,1
30,ball,1,1.4,India,G Gambhir,SK Raina,FH Edwards,0.0,0.0,,,0.0,12.0,10.0,153.0,1


In [189]:
# Run add_cols once per match and keep only the tables it could process
# (add_cols returns None for the others).
list_of_matches_with_features = [
    featured
    for featured in (add_cols(match) for match in list_of_matches)
    if featured is not None
]

In [190]:
# Number of matches for which add_cols returned a table.
len(list_of_matches_with_features)

1504

In [192]:
# For every feature table keep only the rows whose innings value is "1".
first_innings_with_features = [match.loc[match["innings"] == "1"] for match in list_of_matches_with_features]

In [193]:
# One first-innings table per match, so this equals the match count above.
len(first_innings_with_features)

1504

In [329]:
def convert_to_standard_units(column):
    """Standardise a column: subtract its mean, divide by its std deviation.

    Args:
        column: Iterable of numbers (list, array or pandas Series).

    Returns:
        A list with one standardised value per element.  When every element
        is identical (standard deviation 0) the result is all zeros rather
        than the NaN/inf values a division by zero would produce.
    """
    mean = np.mean(column)
    std_dev = np.std(column)

    # A constant column carries no spread; map it to 0 instead of dividing
    # by zero.
    if std_dev == 0:
        return [0.0 for _ in column]

    return [(element - mean) / std_dev for element in column]

In [330]:
from sklearn.model_selection import train_test_split

# Flatten every first-innings table into parallel per-ball lists.  The four
# feature columns are standardised separately within each innings; the
# target (final innings score) is left in runs.
no_convert = 0  # number of innings skipped because a column failed to convert
reg = linear_model.LinearRegression()
total_runs = []
innings_balls_done = []
total_wickets = []
total_innings_score = []
runs_since_last_wicket = []
team_batting = []
for match in first_innings_with_features:

    # Convert every column before touching the shared lists, so that a
    # failure part-way through cannot leave them with different lengths
    # (which would pair features with the wrong targets).
    try:
        match_total_runs = convert_to_standard_units(match["Total runs"])
        match_balls_done = convert_to_standard_units(match["Innings balls done"])
        match_wickets = convert_to_standard_units(match["Total Wickets"])
        match_innings_score = list(match["Total Inings Score"])
        match_since_wicket = convert_to_standard_units(match["Runs since last wicket"])
    except Exception:
        no_convert += 1
        continue

    total_runs.extend(match_total_runs)
    innings_balls_done.extend(match_balls_done)
    total_wickets.extend(match_wickets)
    total_innings_score.extend(match_innings_score)
    runs_since_last_wicket.extend(match_since_wicket)

# One feature row per ball:
# [total runs, balls done, wickets, runs since last wicket].
x = [
    list(features)
    for features in zip(
        total_runs, innings_balls_done, total_wickets, runs_since_last_wicket
    )
]

y = total_innings_score

In [346]:
# Hold out 25% of the rows for evaluation. No random_state is passed, so
# the split (and every score below) can differ from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
# Fit the linear regression on the training rows.
reg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [347]:
from sklearn.metrics import mean_squared_error, r2_score

# R^2 of the linear regression on the held-out rows.
y_prediction = reg.predict(x_test)
r2_score(y_test, y_prediction)

-0.00014637996504718309

In [366]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with the library's default hyperparameters, trained on the
# same training rows as the linear model.
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [367]:
# Score of the random forest on the held-out rows (scikit-learn regressors
# report R^2 here).
rf.score(x_test, y_test)

0.72210263881644976

In [373]:
# Predict the final score for one hand-written feature row, ordered as
# [total runs, balls done, wickets, runs since last wicket], in the same
# standardised units as the training features.
rf.predict([[0.1, 0, 0, 0]])

array([ 170.8])

In [381]:
from sklearn.neighbors import KNeighborsRegressor
# 3-nearest-neighbours regressor with distance-based weighting, trained on
# the same training rows.
neighbor = KNeighborsRegressor(n_neighbors=3, weights="distance")
neighbor.fit(x_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=3, p=2,
          weights='distance')

In [382]:
# R^2 of the nearest-neighbours model on the held-out rows.
pred = neighbor.predict(x_test)
r2_score(y_test, pred)

0.25569132291446217

In [383]:
# Predict the final score for one hand-written feature row, ordered as
# [total runs, balls done, wickets, runs since last wicket].
neighbor.predict([[1, 0, 0, 0]])

array([ 130.81631761])