In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
def info_preprocess(folder_name, base_dir='D:\\Projects\\Mini_Projects\\CricSheet_Analysis'):
    """Clean every match-info CSV in a dataset folder and save the results.

    For each file whose name ends in ``.csv`` directly inside
    ``<base_dir>/<folder_name>``:

    * if a ``Season`` column exists, text values such as ``"2007/08"`` are
      reduced to the part before the first ``/`` (``"2007"``);
    * every known match-info column that contains only NaN is dropped;
    * the frame is written (without the index) to
      ``<base_dir>/PreProcessed_<folder_name>/<same file name>``.

    Parameters
    ----------
    folder_name : str
        Name of the input folder, relative to ``base_dir``.
    base_dir : str, optional
        Directory that holds the input folder and receives the output
        folder. Defaults to the project location the notebook was written
        against, so existing one-argument calls behave as before.

    Returns
    -------
    None
        Results are written to disk and progress is reported via ``print``.
        When the input folder is missing or empty, a warning is printed and
        nothing is written.
    """
    # normpath strips a trailing separator so basename() is never empty.
    folder_path = os.path.normpath(os.path.join(base_dir, folder_name))
    folder_name_only = os.path.basename(folder_path)
    output_folder = os.path.join(base_dir, f'PreProcessed_{folder_name_only}')

    # isdir (not exists) so that a plain file at this path is reported as
    # "no files" instead of making os.listdir raise.
    file_names = os.listdir(folder_path) if os.path.isdir(folder_path) else []
    if not file_names:
        print(f"Warning: No files found in {folder_path}")
        return

    os.makedirs(output_folder, exist_ok=True)

    columns_to_check = ['Date', 'Match_Number', 'City', 'Venue', 'Season', 'Name', 'Match_Type',
                        'Total_Overs', 'Teams_Participated', 'Team_1', 'Team_2', 'Team_Type',
                        'Toss_Winner', 'Choose_To', 'Match_Winner', 'Match_Result', 'Win_By_Runs',
                        'Win_By_Wickets', 'Win_By_Innings', 'Man_Of_Match']

    for file in file_names:
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file)
        print(f"Processing file: {file_path}")

        data = pd.read_csv(file_path)

        if 'Season' in data.columns:
            season = data['Season']
            # A numeric column already holds bare years (or only NaN); the
            # .str accessor would raise AttributeError on it, so leave it.
            if not pd.api.types.is_numeric_dtype(season):
                # Stringify non-missing entries first: .str methods turn
                # non-string values in a mixed-type column into NaN.
                season_text = season.where(season.isna(), season.astype(str))
                data['Season'] = season_text.str.split('/').str[0]
            print("Extracted Only the Season Year !!")

        for col in columns_to_check:
            if col in data.columns and data[col].isna().all():
                data = data.drop(columns=[col])
                print(f"Dropped column: {col} (all values were NaN)")

        output_file_path = os.path.join(output_folder, file)
        data.to_csv(output_file_path, index=False)

        print(f"Processed file saved to: {output_file_path}")

In [3]:
# Clean every match-info CSV in the 'General_Datasets' folder; the cleaned
# copies are written to the sibling 'PreProcessed_General_Datasets' folder.
info_preprocess('General_Datasets')

Processing file: D:\Projects\Mini_Projects\CricSheet_Analysis\General_Datasets\IPL_DATA.csv
Extracted Only the Season Year !!
Dropped column: Win_By_Innings (all values were NaN)
Processed file saved to: D:\Projects\Mini_Projects\CricSheet_Analysis\PreProcessed_General_Datasets\IPL_DATA.csv
Processing file: D:\Projects\Mini_Projects\CricSheet_Analysis\General_Datasets\ipl_json.csv
Extracted Only the Season Year !!
Dropped column: Win_By_Innings (all values were NaN)
Processed file saved to: D:\Projects\Mini_Projects\CricSheet_Analysis\PreProcessed_General_Datasets\ipl_json.csv
Processing file: D:\Projects\Mini_Projects\CricSheet_Analysis\General_Datasets\odis_json.csv
Extracted Only the Season Year !!
Dropped column: Win_By_Innings (all values were NaN)
Processed file saved to: D:\Projects\Mini_Projects\CricSheet_Analysis\PreProcessed_General_Datasets\odis_json.csv
Processing file: D:\Projects\Mini_Projects\CricSheet_Analysis\General_Datasets\ODI_DATA.csv
Extracted Only the Season Year

In [4]:
def innings_preprocess(folder_name, base_dir='D:\\Projects\\Mini_Projects\\CricSheet_Analysis'):
    """Drop all-NaN innings columns from every CSV in a dataset folder.

    For each file whose name ends in ``.csv`` directly inside
    ``<base_dir>/<folder_name>``, every known innings column that contains
    only NaN is dropped and the frame is written (without the index) to
    ``<base_dir>/PreProecessed_<folder_name>/<same file name>``.

    NOTE(review): the output prefix ``PreProecessed_`` is misspelled. It is
    kept as-is because code outside this block may read from that folder;
    rename it only together with every consumer.

    Parameters
    ----------
    folder_name : str
        Name of the input folder, relative to ``base_dir``.
    base_dir : str, optional
        Directory that holds the input folder and receives the output
        folder. Defaults to the project location the notebook was written
        against, so existing one-argument calls behave as before.

    Returns
    -------
    None
        Results are written to disk and progress is reported via ``print``.
        When the input folder is missing or empty, a warning is printed and
        nothing is written.
    """
    # normpath strips a trailing separator so basename() is never empty.
    folder_path = os.path.normpath(os.path.join(base_dir, folder_name))
    folder_name_only = os.path.basename(folder_path)
    output_folder = os.path.join(base_dir, f'PreProecessed_{folder_name_only}')

    # isdir (not exists) so that a plain file at this path is reported as
    # "no files" instead of making os.listdir raise.
    file_names = os.listdir(folder_path) if os.path.isdir(folder_path) else []
    if not file_names:
        print(f"Warning: No files found in {folder_path}")
        return

    os.makedirs(output_folder, exist_ok=True)

    columns_to_check = ['Team', 'Over', 'Batter', 'Bowler', 'Non_striker', 'Batter_runs',
                        'Extras', 'Total_runs', 'Type', 'Player_Out', 'Fielder_Name',
                        'Powerplay_Type', 'Target_Runs', 'Target_Overs']

    for file in file_names:
        if not file.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file)
        print(f"Processing file: {file_path}")

        data = pd.read_csv(file_path)

        # Only the listed columns are candidates; other all-NaN columns stay.
        for col in columns_to_check:
            if col in data.columns and data[col].isna().all():
                data = data.drop(columns=[col])
                print(f"Dropped column: {col} (all values were NaN)")

        output_file_path = os.path.join(output_folder, file)
        data.to_csv(output_file_path, index=False)

        print(f"Processed file saved to: {output_file_path}")

In [6]:
# Clean every innings CSV in the 'Innings_Datasets' folder; the cleaned copies
# are written to the sibling 'PreProecessed_Innings_Datasets' folder (the
# folder name is spelled exactly as the function builds it).
innings_preprocess('Innings_Datasets')

Processing file: D:\Projects\Mini_Projects\CricSheet_Analysis\Innings_Datasets\IPL_DATA_INNINGS.csv
Processed file saved to: D:\Projects\Mini_Projects\CricSheet_Analysis\PreProecessed_Innings_Datasets\IPL_DATA_INNINGS.csv
Processing file: D:\Projects\Mini_Projects\CricSheet_Analysis\Innings_Datasets\ODI_DATA_INNINGS.csv
Processed file saved to: D:\Projects\Mini_Projects\CricSheet_Analysis\PreProecessed_Innings_Datasets\ODI_DATA_INNINGS.csv
Processing file: D:\Projects\Mini_Projects\CricSheet_Analysis\Innings_Datasets\T20_DATA_INNINGS.csv
Processed file saved to: D:\Projects\Mini_Projects\CricSheet_Analysis\PreProecessed_Innings_Datasets\T20_DATA_INNINGS.csv
Processing file: D:\Projects\Mini_Projects\CricSheet_Analysis\Innings_Datasets\TEST_DATA_INNINGS.csv
Dropped column: Powerplay_Type (all values were NaN)
Dropped column: Target_Runs (all values were NaN)
Dropped column: Target_Overs (all values were NaN)
Processed file saved to: D:\Projects\Mini_Projects\CricSheet_Analysis\PreProece