In [219]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as snb
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

In [220]:
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    PolynomialFeatures
)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    LogisticRegressionCV,
    SGDRegressor,
    SGDClassifier,
    Ridge,
    RidgeCV,
    Lasso,
    LassoCV,
    ElasticNet,
    ElasticNetCV
)
from sklearn.svm import (
    SVR,
    SVC
)
from sklearn.neighbors import (
    KNeighborsClassifier,
    KNeighborsRegressor
)
from sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor
)
from sklearn.naive_bayes import(
    GaussianNB,
    MultinomialNB,
    BernoulliNB
)
from sklearn import tree
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    AdaBoostClassifier,
    AdaBoostRegressor,
    ExtraTreesClassifier,
    ExtraTreesRegressor,
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor
)

# from xgboost import XGBClassifier, XGBRegressor
# from lightgbm import LGBMClassifier, LGBMRegressor
# from catboost import CatBoostClassifier, CatBoostRegressor

# from sklearn.discriminant_analysis import (
#     LinearDiscriminantAnalysis,
#     QuadraticDiscriminantAnalysis
# )

from sklearn.neural_network import (
    MLPClassifier,
    MLPRegressor
)


from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    auc
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_squared_log_error,
    median_absolute_error
)
from sklearn.pipeline import Pipeline

In [221]:
def input_data_file(file_name):
  df=pd.read_csv(file_name)
  return df

In [222]:
def data_cleaning_file(df):
  num_features=df.select_dtypes(exclude="O").columns
  cat_features=df.select_dtypes(include="O").columns

  for column in df.columns:
    if column in cat_features:
      df[column]=df[column].fillna(df[column].mode()[0])
    else:
      df[column]=df[column].fillna(df[column].mean())

  return df

In [223]:
def data_transformation_file(df,output,num_scaler=StandardScaler(),cat_scaler=OneHotEncoder(drop="first",handle_unknown="ignore")):
  num_features=df.select_dtypes(exclude="O").columns
  cat_features=df.select_dtypes(include="O").columns

  num_features=[num for num in num_features if num!=output]

  x=df.drop([output],axis=1)
  y=pd.DataFrame(df[output],columns=[output])
  x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=43)

  scaler=ColumnTransformer([
      ("num_feature_scaler",num_scaler,num_features),
      ("cat_feature_scaler",cat_scaler,cat_features)
  ])

  scaler.fit_transform(x_train)
  x_train_scaled=scaler.transform(x_train)
  x_test_scaled=scaler.transform(x_test)

  return x_train_scaled,x_test_scaled,y_train,y_test

In [228]:
def model_tuner_file(model_name,params,x_train,y_train,tuner=RandomizedSearchCV):
  tuner_model=tuner(
      estimator=model_name(),
      param_distributions=params,
      scoring="accuracy",
      cv=5,
      n_jobs=-1,
      verbose=2
  )

  tuner_model.fit(x_train,y_train)

  return tuner_model.best_estimator_,tuner_model.best_score_

In [225]:
def test_model_file(model,x_test,y_test,regression=True):
  y_pred=model.predict(x_test)

  if regression:
    r2=r2_score(y_test,y_pred)
    mae=mean_absolute_error(y_test,y_pred)
    mse=mean_squared_error(y_test,y_pred)
    rmse=np.sqrt(mse)
    mape=np.mean(np.abs((y_test-y_pred)/y_test))*100
    print(f"R2 Score: {r2}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")

  else:
    accuracy=accuracy_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred)
    recall=recall_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    roc_auc=roc_auc_score(y_test,y_pred)
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"ROC AUC Score: {roc_auc}")

In [229]:
file_name="/content/Travel.csv"
df=input_data_file(file_name)
df=data_cleaning_file(df)
x_train_scaled,x_test_scaled,y_train,y_test=data_transformation_file(df,output="ProdTaken")


model_name=DecisionTreeClassifier
params={
    "max_depth":[None,5,10,15,20,30],
    "min_samples_split":[2,5,10,15,20],
    "min_samples_leaf":[1,2,5,10,15,20]
}
tuner=RandomizedSearchCV
model,score=model_tuner_file(model_name,params,x_train_scaled,y_train,tuner)
test_model_file(model,x_test_scaled,y_test,regression=False)

print(f"best score for {model_name} : {score}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy: 0.869120654396728
Precision: 0.6901408450704225
Recall: 0.5384615384615384
F1 Score: 0.6049382716049383
ROC AUC Score: 0.7415925782759953
best score for <class 'sklearn.tree._classes.DecisionTreeClassifier'> : 0.8613810741687979
