In [1]:
import warnings
# Ignore every warning for the rest of the session. No category or module is
# given, so this also hides deprecation and data-related warnings raised by
# pandas / scikit-learn in the cells below; remove it when debugging.
warnings.filterwarnings("ignore")

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

In [10]:
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

In [13]:
# Load the labelled training table and the unlabelled test table.
# The paths are absolute under /content, so the CSVs must already be there.
bd_train = pd.read_csv(filepath_or_buffer="/content/counterfeit_train.csv")
bd_test = pd.read_csv(filepath_or_buffer="/content/counterfeit_test.csv")

In [17]:
# Preview the first rows of the training frame.
# NOTE(review): the execution counts show this cell ran as In [17], after the
# imputation (In [14]) and Medicine_ID (In [15]) cells that appear further down,
# so the saved output reflects the frame after those steps, not the raw CSV.
# NOTE(review): the saved preview lists an "Unnamed: 0" column. If that is a real
# column rather than the displayed index, it is a row counter that is passed to
# the model as a feature - TODO confirm against bd_train.columns.
bd_train.head()

Unnamed: 0,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,13.8,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,13.8,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [18]:
# Preview the first rows of the test frame. Its saved output has the same
# columns as the training preview except for the Counterfeit_Sales target.
# NOTE(review): this cell ran as In [18], i.e. after cells that appear later in
# the notebook, so the output is not a view of the freshly loaded CSV.
bd_test.head()

Unnamed: 0,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level
0,13.8,Area027,1983,85.5328,Antibiotics,mild,0.112747,CityLimits,Tier 3,Medium
1,13.45,Area045,2000,257.146,OralContraceptives,mild,0.144446,DownTown,Tier 2,Unknown
2,7.1,Area045,2000,98.1172,Antipyretics,mild,0.144221,DownTown,Tier 2,Unknown
3,18.3,Area010,1996,135.373,Tranquilizers,mild,0.100388,MidTownResidential,Tier 3,Unknown
4,13.8,Area019,1983,112.8016,OralContraceptives,mild,0.022585,MidTownResidential,Tier 1,Small


In [14]:
# Fill missing Counterfeit_Weight values with the median learned from the
# training frame only; the same fitted imputer is then applied to both frames
# so the test data never influences the fill value.
imputer = SimpleImputer(strategy="median").fit(bd_train[["Counterfeit_Weight"]])
bd_train["Counterfeit_Weight"] = imputer.transform(bd_train[["Counterfeit_Weight"]])
bd_test["Counterfeit_Weight"] = imputer.transform(bd_test[["Counterfeit_Weight"]])


In [15]:
# Remove the Medicine_ID column from both frames when it is present.
# errors="ignore" makes the drop a no-op if the column is already gone, so the
# cell can be re-run safely.
bd_train.drop(columns="Medicine_ID", errors="ignore", inplace=True)
bd_test.drop(columns="Medicine_ID", errors="ignore", inplace=True)

In [21]:
# Label-valued columns that are expanded into indicator (dummy) columns.
categorical_cols=["Medicine_Type","SidEffect_Level","Area_Type","Area_City_Type","Area_dist_level","DistArea_ID"]

# Pin every categorical column to the levels observed in the training frame
# (sorted, which is the order get_dummies would use on its own) before encoding.
# Encoding the two frames independently with drop_first=True lets each frame
# choose its own baseline level; if the test frame lacks the first training
# level it would drop a different one and the same label would be encoded
# differently in train and test. With a shared categorical dtype both frames
# produce the same indicator columns, in the same order, with the same
# baseline. A test label never seen in training becomes missing and is encoded
# as all zeros (the baseline).
level_dtypes = {
    col: pd.CategoricalDtype(sorted(bd_train[col].dropna().unique()))
    for col in categorical_cols
}
bd_train = pd.get_dummies(bd_train.astype(level_dtypes), columns=categorical_cols, drop_first=True)
bd_test = pd.get_dummies(bd_test.astype(level_dtypes), columns=categorical_cols, drop_first=True)

In [None]:
# Give the test frame exactly the training feature columns, in the same order.
# The target is excluded on purpose: aligning on all training columns would add
# a zero-filled "Counterfeit_Sales" column to the test frame, which the model
# was not fitted on and which makes predict() reject the frame.
# Indicator columns missing from the test frame are created as 0 and columns
# that exist only in the test frame are discarded. The trailing fillna(0) keeps
# the previous behaviour of zero-filling any remaining missing test values.
bd_test = bd_test.reindex(
    columns=bd_train.columns.drop("Counterfeit_Sales", errors="ignore"),
    fill_value=0,
).fillna(0)

In [22]:
# Separate the column to be predicted from the model inputs.
target = "Counterfeit_Sales"
y_train = bd_train[target]
x_train = bd_train.drop(columns=target)

In [23]:
# Base estimator handed to the hyper-parameter search; the fixed random_state
# keeps the forest's randomness repeatable between runs.
model=RandomForestRegressor(random_state=42)

In [28]:
# Candidate values for each random-forest hyper-parameter. The search draws
# combinations from these lists (216 possible combinations in total).
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [29]:
# Randomised hyper-parameter search: 50 sampled combinations, each scored with
# 5-fold cross-validation on negated mean absolute error (higher is better),
# using all available cores. random_state fixes which combinations are drawn.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(x_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [30]:
# Take the estimator refit with the best sampled parameters and score it on the
# same rows it was trained on. This is an in-sample figure, not a held-out one;
# the cross-validated estimate from the search is -random_search.best_score_.
best_model = random_search.best_estimator_
train_predictions = best_model.predict(x_train)
mae = mean_absolute_error(y_true=y_train, y_pred=train_predictions)
print(f"Train MAE:{mae}")


Train MAE:648.437951509403


In [33]:
# Predict on the test frame restricted to exactly the columns, and column
# order, the model was fitted on. After the column-alignment cell the test
# frame can carry a zero-filled target column, which predict() rejects as an
# unseen feature; reindexing drops it and creates any missing indicator
# column as 0.
test_predictions = best_model.predict(
    bd_test.reindex(columns=x_train.columns, fill_value=0)
)

# One-column frame of predictions named after the target.
# NOTE(review): no row identifier is included (Medicine_ID is dropped from the
# test frame earlier) - confirm the expected submission layout.
submission = pd.DataFrame({target: test_predictions})

In [34]:
# Write the predictions to a CSV in the current working directory, without the
# DataFrame index column.
submission.to_csv("3rdsample_submission.csv",index=False)