In [1]:
### importing basic liabary

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [5]:
## loading dataset
df=pd.read_csv("tip.csv")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
## top 5 record
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
## bottom 5 record
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [8]:
## shape of dataset
df.shape

(244, 7)

In [9]:
## checking null values
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [10]:
## data information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [11]:
## checking duplicate value
df.duplicated().sum()

1

In [12]:
df=df.drop_duplicates()

In [14]:
df.duplicated().sum()

0

In [16]:
## Statatics test
df.describe()

Unnamed: 0,total_bill,tip,size
count,243.0,243.0,243.0
mean,19.813868,3.002387,2.572016
std,8.910071,1.385002,0.952356
min,3.07,1.0,1.0
25%,13.38,2.0,2.0
50%,17.81,2.92,2.0
75%,24.175,3.575,3.0
max,50.81,10.0,6.0


In [23]:
median_tips=df['tip'].median()
df['tips_label'] = (df['tip']>median_tips).astype(int)

In [24]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_label
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0
2,21.01,3.5,Male,No,Sun,Dinner,3,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1
4,24.59,3.61,Female,No,Sun,Dinner,4,1


In [25]:
df['tips_label'].value_counts()

0    122
1    121
Name: tips_label, dtype: int64

In [26]:
df=df.drop("tip",axis=1)
df.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size,tips_label
0,16.99,Female,No,Sun,Dinner,2,0
1,10.34,Male,No,Sun,Dinner,3,0
2,21.01,Male,No,Sun,Dinner,3,1
3,23.68,Male,No,Sun,Dinner,2,1
4,24.59,Female,No,Sun,Dinner,4,1


In [27]:
## split the data into independent and dependent variable
X = df.drop("tips_label",axis=1)
y = df['tips_label']
X.shape, y.shape

((243, 6), (243,))

In [28]:
## split the data into train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=42)
X_train.shape, X_test.shape

((194, 6), (49, 6))

In [30]:
## feature engineering
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


numerical_features=X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_feature = X.select_dtypes(include=['object']).columns.tolist()

print("We have numerical feature: {} ".format(numerical_features))
print("We have categorical feature: {} ".format(categorical_feature))

We have numerical feature: ['total_bill', 'size'] 
We have categorical feature: ['sex', 'smoker', 'day', 'time'] 


In [31]:
numerical_pipe = Pipeline(
    steps=[
        ("num_pipeline",SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ("cat_pipeline", SimpleImputer(strategy="most_frequent")),
        ("one_HotEncoder",OneHotEncoder(handle_unknown="ignore"))
        
    ]
)

preprocessor = ColumnTransformer(
    [
        ("num",numerical_pipe,numerical_features),
        ("cat",categorical_pipe,categorical_feature)
    ]
)

In [32]:
X_train_transformer=preprocessor.fit_transform(X_train)

In [33]:
X_train_transformer

array([[-0.31281158, -0.63021145,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.48521345, -0.63021145,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.76227396, -0.63021145,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.63826695, -0.63021145,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.6730783 , -0.63021145,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 2.7818286 ,  0.42376287,  1.        , ...,  0.        ,
         1.        ,  0.        ]])

In [35]:
X_test_transformer=preprocessor.transform(X_test)
X_test_transformer

array([[-0.02501807, -0.63021145,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [-1.29199747, -0.63021145,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ],
       [-0.31854452,  0.42376287,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ],
       [-1.06267993, -0.63021145,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 1.03098919,  0.42376287,  0.        ,  1.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 1.47471863, -0.63021145,  0.        ,  1.        ,  0.        ,
         1.        ,  

In [36]:
## evaluate the metrics for model training
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score

def evaluate_mertrics(pred,true):
    f1=f1_score(pred,true)
    precission=precision_score(pred,true)
    recall=recall_score(pred,true)
    accuracy=accuracy_score(pred,true)
    return f1, precission, recall, accuracy

In [43]:
## training multiple model 

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier


## Initiliaze the model
models = {
    "Logistic regssion": LogisticRegression(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Adaboosting Classifier": AdaBoostClassifier(),
    "Gradient Decent Classifier": GradientBoostingClassifier(),
    "Support Vector Machine Classfier": SVC(),
    "Catboosting Classifier": CatBoostClassifier(),
    "Random forest Classifier": RandomForestClassifier()
}



model_list = []
f1_score_list = []
precission_score_list = []
recall_score_list = []
accuracy_score_list = []


for name, model in models.items():
    model.fit(X_train_transformer,y_train)
    y_pred = model.predict(X_test_transformer)


    y_test_f1, y_test_precission, y_test_recall,y_test_accuracy = evaluate_mertrics(y_pred,y_test)

    model_list.append(name)
    f1_score_list.append(y_test_f1)
    precission_score_list.append(y_test_precission)
    recall_score_list.append(y_test_recall)
    accuracy_score_list.append(y_test_accuracy)

    print("Model name: ",{name})
    print("F1 Score: {} ".format(f1_score_list))
    print("Precission Score: {} ".format(precission_score_list))
    print("recall score: {} ".format(recall_score_list))
    print("accuracy score: {} ".format(accuracy_score_list))
    
    

Model name:  {'Logistic regssion'}
F1 Score: [0.7659574468085106] 
Precission Score: [0.8571428571428571] 
recall score: [0.6923076923076923] 
accuracy score: [0.7755102040816326] 
Model name:  {'Decision Tree Classifier'}
F1 Score: [0.7659574468085106, 0.6086956521739131] 
Precission Score: [0.8571428571428571, 0.6666666666666666] 
recall score: [0.6923076923076923, 0.56] 
accuracy score: [0.7755102040816326, 0.6326530612244898] 
Model name:  {'Adaboosting Classifier'}
F1 Score: [0.7659574468085106, 0.6086956521739131, 0.7755102040816326] 
Precission Score: [0.8571428571428571, 0.6666666666666666, 0.9047619047619048] 
recall score: [0.6923076923076923, 0.56, 0.6785714285714286] 
accuracy score: [0.7755102040816326, 0.6326530612244898, 0.7755102040816326] 
Model name:  {'Gradient Decent Classifier'}
F1 Score: [0.7659574468085106, 0.6086956521739131, 0.7755102040816326, 0.7555555555555555] 
Precission Score: [0.8571428571428571, 0.6666666666666666, 0.9047619047619048, 0.8095238095238095

In [45]:
pd.DataFrame({
    "Model Name": model_list,
    "F1 Score": f1_score_list,
    "Precission score":precission_score_list,
    "Recall score:":recall_score_list,
    "accuracy score": accuracy_score_list
})

Unnamed: 0,Model Name,F1 Score,Precission score,Recall score:,accuracy score
0,Logistic regssion,0.765957,0.857143,0.692308,0.77551
1,Decision Tree Classifier,0.608696,0.666667,0.56,0.632653
2,Adaboosting Classifier,0.77551,0.904762,0.678571,0.77551
3,Gradient Decent Classifier,0.755556,0.809524,0.708333,0.77551
4,Support Vector Machine Classfier,0.765957,0.857143,0.692308,0.77551
5,Catboosting Classifier,0.816327,0.952381,0.714286,0.816327
6,Random forest Classifier,0.782609,0.857143,0.72,0.795918


In [49]:
## Data validation
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

catboost_model = CatBoostClassifier(verbose=0, random_state=42)
random_model = RandomForestClassifier(random_state=42)


catboost_model_f1=cross_val_score(catboost_model,X_train_transformer,y_train, cv=5,scoring="f1_weighted")
print("catboost f1 score: ",catboost_model_f1)
print("catboost f1 mean score",catboost_model_f1.mean())

random_model_f1=cross_val_score(random_model,X_train_transformer,y_train, cv=5,scoring="f1_weighted")
print("Random f1 score: ",random_model_f1)
print("Random f1 mean score",random_model_f1.mean())

catboost f1 score:  [0.53663004 0.79212944 0.74325236 0.71420053 0.64931145]
catboost f1 mean score 0.687104763263083
Random f1 score:  [0.43589744 0.73627369 0.79487179 0.64055331 0.62529904]
Random f1 mean score 0.6465790540504498


In [56]:
## applying hpyerparameter 
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

hyper_models = {
    "Catboost Classifier": CatBoostClassifier(verbose=0, random_state=42),
    "Random forest Classifier": RandomForestClassifier(random_state=42)
}


params = {
    "Random forest Classifier":{
        'n_estimators':[50,100,200],
        'max_depth':[6, 10,None],
        'max_features':['sqrt','log2']
    },
    "Catboost Classifier":{
        'iterations':[50,100,200],
        'learning_rate':[0.01,0.1,0.05],
        'depth':[4,6,10]
    }
}


for name,model in hyper_models.items():
    if name in params:
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=params[name],
            cv=5,
            scoring='f1_weighted',
            n_jobs=1
        )

        grid_search.fit(X_train_transformer,y_train)
        print(f"Best Parameter for {name}: {grid_search.best_params_}")
        print(f"Best cv f1 score for {name}: {grid_search.best_score_:.4f}/n ")

Best Parameter for Catboost Classifier: {'depth': 4, 'iterations': 100, 'learning_rate': 0.01}
Best cv f1 score for Catboost Classifier: 0.7598/n 
Best Parameter for Random forest Classifier: {'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 100}
Best cv f1 score for Random forest Classifier: 0.7072/n 


In [68]:
from catboost import CatBoostClassifier
cat_final_model = CatBoostClassifier(verbose=0,random_state=42)
catboost_param = {
        'iterations':[50,100,200],
        'learning_rate':[0.01,0.1,0.05],
        'depth':[4,6,10]
}

cat_boost_grid_model=GridSearchCV(
    estimator=cat_final_model,
    param_grid=catboost_param,
    cv=5,
    scoring='f1_weighted',
    n_jobs=1
)


cat_boost_grid_model.fit(X_train_transformer,y_train)
print(f"Catboost Best parameter for {cat_boost_grid_model.best_params_}")
print(f"catboost f1 score for: {cat_boost_grid_model.best_score_} ")
   

Catboost Best parameter for {'depth': 4, 'iterations': 100, 'learning_rate': 0.01}
catboost f1 score for: 0.7598423402231859 


In [71]:
## prediction

cat_pred=cat_boost_grid_model.predict(X_test_transformer)
print(f"Catboost prediction:{cat_pred}")

## evaluate the model
from sklearn.metrics  import classification_report
print(classification_report(cat_pred,y_test))

Catboost prediction:[1 0 1 0 1 1 1 0 0 1 1 1 0 1 0 0 1 1 1 0 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 0 1
 0 0 1 0 0 1 0 0 0 1 1 1]
              precision    recall  f1-score   support

           0       0.68      0.90      0.78        21
           1       0.90      0.68      0.78        28

    accuracy                           0.78        49
   macro avg       0.79      0.79      0.78        49
weighted avg       0.81      0.78      0.78        49



In [73]:
accuracy_score(cat_pred,y_test)

0.7755102040816326