In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import pickle

# Load the dataset.
# The path is a raw string so the Windows backslashes are taken literally;
# in a normal string '\c' is an invalid escape sequence and triggers a warning.
# NOTE: reading .xlsx files requires the optional 'openpyxl' package to be
# installed; without it pd.read_excel raises ImportError.
df = pd.read_excel(r'D:\cricket_score_predictor-main\cricket_with_synthetic_weather.xlsx')

# Backfill missing cities with the first space-separated token of the venue name
venue_first_word = df['venue'].map(lambda venue: venue.split(' ')[0])
df['city'] = df['city'].fillna(venue_first_word)

# Replace gaps in the numeric weather readings with each column's own mean
weather_numerics = ['temperature', 'humidity', 'wind_speed', 'precipitation']
df[weather_numerics] = df[weather_numerics].fillna(df[weather_numerics].mean())

# Replace gaps in the categorical weather label with its most frequent value
most_common_weather = df['weather_description'].mode()[0]
df['weather_description'] = df['weather_description'].fillna(most_common_weather)

# Turn the dismissal marker into a 0/1 flag (anything other than the string '0'
# counts as a wicket), then accumulate it per match to get wickets fallen so far.
# NOTE(review): assumes "no wicket" is stored as the string '0' - confirm against the data
df['player_dismissed'] = df['player_dismissed'].map(lambda marker: 0 if marker == '0' else 1).astype(int)
df['player_dismissed'] = df.groupby('match_id')['player_dismissed'].cumsum()
df['wicket_left'] = 10 - df['player_dismissed']

# Running total of runs within each match
runs_by_match = df.groupby('match_id')['runs']
df['current_score'] = runs_by_match.cumsum()

# Split the 'over.ball' identifier into its two integer parts
ball_text = df['ball'].map(str)
df['over'] = ball_text.map(lambda text: int(text.split(".")[0]))
df['ball_no'] = ball_text.map(lambda text: int(text.split(".")[1]))

# Balls delivered so far, balls remaining out of 120 (floored at zero),
# and the run rate expressed per six balls
df['ball_bowled'] = df['over'] * 6 + df['ball_no']
df['balls_left'] = (120 - df['ball_bowled']).clip(lower=0)
df['current_run_rate'] = (df['current_score'] * 6) / df['ball_bowled']

# Runs scored over the trailing 36 rows of each match (a shorter window is
# used near the start of a match because min_periods=1)
df['last_six'] = runs_by_match.rolling(window=36, min_periods=1).sum().reset_index(level=0, drop=True)

# Keep only cities that appear in more than 600 rows of the dataset
rows_per_city = df['city'].value_counts()
eligible_cities = rows_per_city[rows_per_city > 600].index.tolist()
df = df[df['city'].isin(eligible_cities)]

# Prepare the final dataframe for prediction.
# Each match's total runs is attached to every row of that match. Only the
# 'runs' column is summed: aggregating the whole frame would also try to sum
# the text columns, which is wasteful and can fail on mixed-type columns.
# Both sides of the merge carry a 'runs' column, so pandas suffixes them:
# 'runs_x' is the per-match total (the target) and 'runs_y' the per-row runs.
match_totals = df.groupby('match_id')['runs'].sum().reset_index()
final_df = match_totals.merge(df, on='match_id')
final_df = final_df[['batting_team', 'bowling_team', 'city', 'current_score', 'balls_left',
                     'wicket_left', 'current_run_rate', 'last_six', 'temperature',
                     'humidity', 'wind_speed', 'precipitation', 'weather_description', 'runs_x']]

# Drop any remaining NaN values. Reassign instead of using inplace=True,
# because final_df is a column selection taken from another frame.
final_df = final_df.dropna()

# Separate the predictors from the target column ('runs_x')
y = final_df.loc[:, 'runs_x']
X = final_df.drop('runs_x', axis=1)

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# NOTE(review): the split is made over individual rows, so rows from one match
# can land in both splits - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns that are one-hot encoded (first level of each dropped)
categorical_columns = ['batting_team', 'bowling_team', 'city', 'weather_description']

# Columns that are standardised
numeric_columns = [
    'current_score', 'balls_left', 'wicket_left', 'current_run_rate', 'last_six',
    'temperature', 'humidity', 'wind_speed', 'precipitation',
]

# Preprocessing: encode the categorical columns, scale the numeric ones,
# and pass any other column through untouched
category_encoder = OneHotEncoder(sparse_output=False, drop='first')
transformer = ColumnTransformer(
    [
        ('team_city_encoder', category_encoder, categorical_columns),
        ('scaler', StandardScaler(), numeric_columns),
    ],
    remainder='passthrough',
)

# Gradient-boosted regressor that consumes the preprocessed features
regressor = XGBRegressor(n_estimators=500, learning_rate=0.2, max_depth=6, random_state=1)

# Full pipeline: preprocessing followed by the model
pipe = Pipeline(steps=[('transform', transformer), ('model', regressor)])

# Fit the preprocessing steps and the regressor on the training split
pipe.fit(X_train, y_train)

# Score the held-out split
y_pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, R^2 first
for label, value in (("R^2 Score:", r2), ("Mean Absolute Error:", mae)):
    print(label, value)

# Save the trained pipeline model to a file.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)


  df = pd.read_excel('D:\cricket_score_predictor-main\cricket_with_synthetic_weather.xlsx')


ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.