In [1]:
pip install pandas





In [9]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Installing collected packages: joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def extract_info(commentary):
    bowler_pattern = r'([A-Za-z]+\s[A-Za-z]+) to'
    batter_pattern = r'to ([A-Za-z]+\s[A-Za-z]+)'
    ball_type_pattern = r'\b(yorker|bouncer|full toss|good length|short ball)\b'
    shot_type_pattern = r'\b(boundary|six|four|single|double|triple)\b'
    speed_pattern = r'\b(\d{2,3})\s?kph\b'
    runs_pattern = r'\b(\d+) runs?\b'
    
    bowler = re.search(bowler_pattern, commentary)
    batter = re.search(batter_pattern, commentary)
    ball_type = re.search(ball_type_pattern, commentary, re.IGNORECASE)
    shot_type = re.search(shot_type_pattern, commentary, re.IGNORECASE)
    speed = re.search(speed_pattern, commentary)
    runs = re.search(runs_pattern, commentary)
    
    return {
        "Bowler": bowler.group(1) if bowler else None,
        "Batter": batter.group(1) if batter else None,
        "Ball Type": ball_type.group(1) if ball_type else None,
        "Shot Type": shot_type.group(1) if shot_type else None,
        "Speed (kph)": speed.group(1) if speed else None,
        "Runs Scored": runs.group(1) if runs else None
    }


commentary_data = [
    "Jasprit Bumrah to Virat Kohli, good length ball, played for a single.",
    "Mitchell Starc to Rohit Sharma, yorker at 145 kph, driven for four runs!",
    "Pat Cummins to KL Rahul, short ball, pulled for a six!",
    "Rashid Khan to MS Dhoni, full toss, flicked for a double."
]


data = [extract_info(comment) for comment in commentary_data]


df = pd.DataFrame(data)
print(df)

count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()


count_matrix = count_vectorizer.fit_transform(commentary_data)
tfidf_matrix = tfidf_vectorizer.fit_transform(commentary_data)


count_df = pd.DataFrame(count_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print("\nCount Vectorizer Output:\n", count_df)
print("\nTF-IDF Vectorizer Output:\n", tfidf_df)

           Bowler        Batter    Ball Type Shot Type Speed (kph) Runs Scored
0  Jasprit Bumrah   Virat Kohli  good length    single        None        None
1  Mitchell Starc  Rohit Sharma       yorker      four         145        None
2     Pat Cummins      KL Rahul   short ball       six        None        None
3     Rashid Khan      MS Dhoni    full toss    double        None        None

Count Vectorizer Output:
    145  at  ball  bumrah  cummins  dhoni  double  driven  flicked  for  ...  \
0    0   0     1       1        0      0       0       0        0    1  ...   
1    1   1     0       0        0      0       0       1        0    1  ...   
2    0   0     1       0        1      0       0       0        0    1  ...   
3    0   0     0       0        0      1       1       0        1    1  ...   

   runs  sharma  short  single  six  starc  to  toss  virat  yorker  
0     0       0      0       1    0      0   1     0      1       0  
1     1       1      0       0    0      1