In [94]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

In [95]:
# Load the training and test CSV files into DataFrames
datafile_train = "/content/counterfeit_train.csv"
datafile_test = "/content/counterfeit_test.csv"
bd_train, bd_test = (
    pd.read_csv(csv_path) for csv_path in (datafile_train, datafile_test)
)


In [96]:
# Keep an independent copy of the test identifiers; the column is removed from
# bd_test further down but is needed again for the submission file.
test_medicine_ids = bd_test.loc[:, 'Medicine_ID'].copy()

In [97]:
# Show the training column names as a sanity check of the loaded schema
training_column_names = list(bd_train.columns)
print("Available columns in training data:")
print(training_column_names)

Available columns in training data:
['Medicine_ID', 'Counterfeit_Weight', 'DistArea_ID', 'Active_Since', 'Medicine_MRP', 'Medicine_Type', 'SidEffect_Level', 'Availability_rating', 'Area_Type', 'Area_City_Type', 'Area_dist_level', 'Counterfeit_Sales']


In [98]:
# Count missing values per column in the training data
train_null_counts = bd_train.isna().sum()
print("Null values in training data:")
print(train_null_counts)

Null values in training data:
Medicine_ID               0
Counterfeit_Weight     1166
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales         0
dtype: int64


In [99]:
# Report how many distinct Medicine_IDs there are, and the frame's dimensions
unique_id_count = bd_train['Medicine_ID'].nunique()
train_shape = bd_train.shape

print("\nUnique Medicine_IDs:", unique_id_count)

print("\nTraining data shape:", train_shape)


Unique Medicine_IDs: 1557

Training data shape: (6818, 12)


In [100]:
# Remove the identifier column and the weight column (which has missing values)
# from both frames, mutating them in place.
columns_to_drop = ['Medicine_ID', 'Counterfeit_Weight']
for frame in (bd_train, bd_test):
    frame.drop(columns=columns_to_drop, inplace=True)

In [103]:
# Create dummies for categorical variables
#
# The category levels are learned from the training data only and then applied
# to BOTH frames. Encoding each frame independently (as this cell used to do)
# lets `drop_first` drop a different level in train and test, and produces
# different column sets whenever a level is absent from one of the frames --
# the model would then see misaligned or mis-encoded features at predict time.
categorical_columns = bd_train.select_dtypes(include=['object']).columns
print("\nCategorical columns found:", categorical_columns.tolist())

for col in categorical_columns:
    # Sorted levels reproduce the column order get_dummies uses for object data.
    levels = pd.CategoricalDtype(sorted(bd_train[col].dropna().unique()))

    # A categorical Series yields one dummy column per declared level, even for
    # levels that do not occur in the frame being encoded.
    temp = pd.get_dummies(bd_train[col].astype(levels), prefix=col, drop_first=True)
    bd_train = pd.concat([temp, bd_train.drop(columns=col)], axis=1)

    # Test values never seen in training become NaN under `levels` and are
    # therefore encoded as all-zero rows (same as the dropped baseline level).
    temp = pd.get_dummies(bd_test[col].astype(levels), prefix=col, drop_first=True)
    bd_test = pd.concat([temp, bd_test.drop(columns=col)], axis=1)

print("\nProcessed training data shape:", bd_train.shape)


Categorical columns found: ['Medicine_Type', 'SidEffect_Level', 'Area_City_Type']

Processed training data shape: (6818, 37)


In [104]:
# Report any columns that still contain missing values after preprocessing.
# The per-column counts are computed once per frame and then filtered.
train_null_counts = bd_train.isnull().sum()
test_null_counts = bd_test.isnull().sum()

print("\nRemaining null values in training data:")
print(train_null_counts[train_null_counts > 0])
print("\nRemaining null values in test data:")
print(test_null_counts[test_null_counts > 0])


Remaining null values in training data:
Series([], dtype: int64)

Remaining null values in test data:
Series([], dtype: int64)


In [105]:
# Separate the regression target from the feature matrix
target = 'Counterfeit_Sales'
y_train = bd_train[target]
x_train = bd_train.drop(columns=[target])

In [106]:
# Single-candidate grid for a quick first run: every hyper-parameter has
# exactly one value, so the grid search evaluates one combination.
params = dict(
    n_estimators=[100],
    learning_rate=[0.1],
    max_depth=[3],
    min_samples_split=[2],
    min_samples_leaf=[1],
)


In [90]:
# Define the full hyper-parameter grid (3 * 3 * 3 * 2 * 2 = 108 candidates).
#
# This cell used to rebind `params` unconditionally, so a top-to-bottom run
# silently replaced the quick single-candidate grid defined in the previous
# cell -- whereas the recorded fit output shows only 1 candidate. The opt-in
# flag below makes the choice explicit and keeps Restart & Run All consistent
# with the saved outputs.
USE_FULL_GRID = False  # set to True to run the exhaustive (slow) search

full_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

if USE_FULL_GRID:
    params = full_params

In [107]:
# Base estimator handed to the grid search; the fixed random_state keeps the
# boosting runs repeatable.
model = GradientBoostingRegressor(
    random_state=42,
)

In [108]:
# Configure (but do not yet fit) the cross-validated grid search.
# Scoring is negated MAE, so larger (closer to zero) is better.
search_options = dict(
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search = GridSearchCV(model, **search_options)

In [114]:
# Fit the model, report cross-validated performance, and write the submission.

# Divisor used to turn the MAE into the reported "expected score".
# NOTE(review): presumably the competition's scoring constant -- confirm.
MAE_SCORE_DIVISOR = 1660

# Only the fit is guarded: previously prediction and file writing were inside
# the same try block, so any failure there was reported as a fitting error and
# then swallowed, letting the notebook finish as if it had succeeded.
try:
    grid_search.fit(x_train, y_train)
except Exception as e:
    print("\nError occurred during model fitting:")
    print(e)

    # Print additional debugging information
    print("\nFeature names:", x_train.columns.tolist())
    print("Number of features:", len(x_train.columns))
    print("Sample of X values:\n", x_train.head())
    print("Sample of y values:\n", y_train.head())

    # Re-raise so the full traceback is shown and the cell is marked as failed.
    raise

# best_score_ is the negated MAE (scoring='neg_mean_absolute_error'); flip the sign.
best_mae = -grid_search.best_score_
print("\nBest parameters:", grid_search.best_params_)
print("Best MAE score:", best_mae)
print("Expected score:", 1 - (best_mae / MAE_SCORE_DIVISOR))

# Make predictions
# Align the test features with the training features (same columns, same
# order). Columns the test frame lacks are filled with 0 and reported; extra
# test-only columns are discarded.
missing_features = [c for c in x_train.columns if c not in bd_test.columns]
if missing_features:
    print("\nTest data lacks these training features; filling with 0:", missing_features)
test_features = bd_test.reindex(columns=x_train.columns, fill_value=0)
predictions = grid_search.predict(test_features)

# Create submission file with Medicine_ID
submissions = pd.DataFrame({
    'Medicine_ID': test_medicine_ids,
    'Counterfeit_Sales': predictions
})

# Save submission file
submissions.to_csv('output.csv', index=False)
print("\nSubmission file created successfully!")

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best MAE score: 752.9667352164341
Expected score: 0.5464055811949192

Submission file created successfully!
