In [None]:
# Part 1: Synthetic Cricket Dataset Generation

import numpy as np
import pandas as pd

# Seed NumPy's legacy global generator so every run yields the same players.
np.random.seed(42)

n_samples = 1000  # size of the synthetic player pool

# Career statistics, drawn uniformly from inclusive ranges. randint's upper
# bound is exclusive, hence the "+ 1". The draw order (age, runs, wickets,
# catches) determines the random stream, so it must not be rearranged.
ages = np.random.randint(low=14, high=40 + 1, size=n_samples)
runs_scored = np.random.randint(low=0, high=10000 + 1, size=n_samples)
wickets_taken = np.random.randint(low=0, high=500 + 1, size=n_samples)
catches_taken = np.random.randint(low=0, high=350 + 1, size=n_samples)

# Role mix: 40% batsmen, 40% bowlers, 20% all-rounders.
role_shares = {'Batsman': 0.4, 'Bowler': 0.4, 'All-rounder': 0.2}
player_types = np.random.choice(
    list(role_shares),
    size=n_samples,
    p=list(role_shares.values()),
)

# Collect the generated arrays into one table, one column per feature.
columns = {
    'Age': ages,
    'Runs Scored': runs_scored,
    'Wickets Taken': wickets_taken,
    'Catches Taken': catches_taken,
    'Player Type': player_types,
}
df = pd.DataFrame(columns)

# Quick sanity check: show the first five generated players.
print(df.head())


   Age  Runs Scored  Wickets Taken  Catches Taken Player Type
0   20         5240              7            182     Batsman
1   33         6493            497            145     Batsman
2   28         2878            340            318      Bowler
3   24         4143            128            115      Bowler
4   21         3644            263            112     Batsman


In [None]:
# Part 2: Logical assignment of Batting and Bowling Averages

def _sample_averages(role, runs, wickets):
    """Draw a (batting average, bowling average) pair consistent with a role.

    Args:
        role: 'Batsman', 'Bowler', or anything else (treated as All-rounder).
        runs: career runs scored by the player.
        wickets: career wickets taken by the player.

    Returns:
        Tuple ``(batting, bowling)``, each rounded to two decimals.

    Note:
        The order of the ``np.random.uniform`` calls within each branch is
        significant: it determines which values of the seeded random stream
        each player receives.
    """
    if role == 'Batsman':
        # Specialist batsman: the batting band depends on run tally;
        # the bowling average is poor (high) by construction.
        if runs > 3000:
            batting = np.random.uniform(35, 60)
        else:
            batting = np.random.uniform(20, 35)
        bowling = np.random.uniform(70, 120)
    elif role == 'Bowler':
        # Specialist bowler: the bowling band depends on wicket tally
        # (lower is better); the batting average is weak by construction.
        if wickets > 200:
            bowling = np.random.uniform(15, 30)
        else:
            bowling = np.random.uniform(30, 45)
        batting = np.random.uniform(10, 25)
    else:
        # All-rounder: moderate bands for both skills, each driven by
        # its own career tally.
        if runs > 2000:
            batting = np.random.uniform(25, 45)
        else:
            batting = np.random.uniform(15, 30)
        if wickets > 100:
            bowling = np.random.uniform(20, 40)
        else:
            bowling = np.random.uniform(30, 60)
    return round(batting, 2), round(bowling, 2)


batting_average = []
bowling_average = []

# Walk the players in generation order so the random draws line up per row.
for role, runs, wickets in zip(player_types, runs_scored, wickets_taken):
    bat_avg, bowl_avg = _sample_averages(role, runs, wickets)
    batting_average.append(bat_avg)
    bowling_average.append(bowl_avg)

# Rebuild the table with the two derived average columns included;
# 'Player Type' stays as the last column.
columns = {
    'Age': ages,
    'Runs Scored': runs_scored,
    'Wickets Taken': wickets_taken,
    'Catches Taken': catches_taken,
    'Batting Average': batting_average,
    'Bowling Average': bowling_average,
    'Player Type': player_types,
}
df = pd.DataFrame(columns)

# Preview the first five rows.
print(df.head())


   Age  Runs Scored  Wickets Taken  Catches Taken  Batting Average  \
0   20         5240              7            182            42.31   
1   33         6493            497            145            42.34   
2   28         2878            340            318            18.96   
3   24         4143            128            115            12.88   
4   21         3644            263            112            42.39   

   Bowling Average Player Type  
0            86.61     Batsman  
1            72.07     Batsman  
2            24.56      Bowler  
3            39.80      Bowler  
4           100.98     Batsman  


In [None]:
# Part 3: Apply K-Nearest Neighbors (KNN) to predict Player Type

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Encode the string labels as integers for the classifier.
label_enc = LabelEncoder()
df['Player Type Enc'] = label_enc.fit_transform(df['Player Type'])

# Features (everything except the target itself)
feature_cols = ['Age', 'Runs Scored', 'Wickets Taken', 'Catches Taken', 'Batting Average', 'Bowling Average']
X = df[feature_cols]
y = df['Player Type Enc']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage), which makes the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the training-fitted scaling, kept so that any
# later code that refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

# KNN is distance-based, so it relies on the features sharing a common scale.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Results: overall accuracy plus per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_enc.classes_))


Accuracy: 0.925
              precision    recall  f1-score   support

 All-rounder       0.85      0.74      0.79        38
     Batsman       1.00      0.98      0.99        66
      Bowler       0.90      0.96      0.93        96

    accuracy                           0.93       200
   macro avg       0.92      0.89      0.90       200
weighted avg       0.92      0.93      0.92       200

