In [1]:
import seaborn as sns
import pandas as pd

In [2]:
data = pd.read_csv("tips.csv")
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
data["day"].unique()

array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

In [4]:
data["time"].unique()

array(['Dinner', 'Lunch'], dtype=object)

In [5]:
data.isnull().sum() # its mandatory to check null values,

# If we did not replace null values, otherwise we will get wrong analysis, wrong predictions

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [6]:
# In data set I am having the categorical feature, so I should convert into numerical

from sklearn.preprocessing import LabelEncoder

In [7]:
encoder = LabelEncoder()

In [8]:
data["time"] = encoder.fit_transform(data["time"])

In [9]:
X = data.drop(labels=["time"], axis=1)
y = data.time

In [10]:
X

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.50,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4
...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,3
240,27.18,2.00,Female,Yes,Sat,2
241,22.67,2.00,Male,Yes,Sat,2
242,17.82,1.75,Male,No,Sat,2


In [11]:
y

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    0
241    0
242    0
243    0
Name: time, Length: 244, dtype: int32

In [12]:
X["day"].value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [14]:
X_train

Unnamed: 0,total_bill,tip,sex,smoker,day,size
228,13.28,2.72,Male,No,Sat,2
208,24.27,2.03,Male,Yes,Sat,2
96,27.28,4.00,Male,Yes,Fri,2
167,31.71,4.50,Male,No,Sun,4
84,15.98,2.03,Male,No,Thur,2
...,...,...,...,...,...,...
106,20.49,4.06,Male,Yes,Sat,2
14,14.83,3.02,Female,No,Sun,2
92,5.75,1.00,Female,Yes,Fri,2
179,34.63,3.55,Male,Yes,Sun,2


In [15]:
X_test

Unnamed: 0,total_bill,tip,sex,smoker,day,size
24,19.82,3.18,Male,No,Sat,2
6,8.77,2.0,Male,No,Sun,2
153,24.55,2.0,Male,No,Sun,4
211,25.89,5.16,Male,Yes,Sat,4
198,13.0,2.0,Female,Yes,Thur,2
176,17.89,2.0,Male,Yes,Sun,2
192,28.44,2.56,Male,Yes,Thur,2
124,12.48,2.52,Female,No,Thur,2
9,14.78,3.23,Male,No,Sun,2
101,15.38,3.0,Female,Yes,Fri,2


In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [17]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 195 entries, 228 to 102
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  195 non-null    float64
 1   tip         195 non-null    float64
 2   sex         195 non-null    object 
 3   smoker      195 non-null    object 
 4   day         195 non-null    object 
 5   size        195 non-null    int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 10.7+ KB


In [18]:
cat_cols = ["sex", "smoker", "day"]
num_cols = ["total_bill", "tip", "size"]

In [19]:
num_pipeline = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)

cat_pipeline = Pipeline(

    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder())
    ]
)

In [20]:
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols)
    ])

In [21]:
# Now I just need to fit my data

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.fit_transform(X_test)

In [22]:
X_train[0]  # Why in output 0,1,0,1,...? Because encoding

array([-0.79306155, -0.2580329 , -0.61214068,  0.        ,  1.        ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ])

In [23]:
y_train

228    0
208    0
96     0
167    0
84     1
      ..
106    0
14     0
92     0
179    0
102    0
Name: time, Length: 195, dtype: int32

In [24]:
# Fitting Random forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [33]:
models = {
    "random_forest" : RandomForestClassifier(oob_score=True),
    "logistic_regression" : LogisticRegression(),
    "decision_tree" : DecisionTreeClassifier()
}

In [43]:
X_train.shape

(195, 11)

In [44]:
from sklearn.metrics import accuracy_score

def evaluate_model(X_train, X_test, y_train, y_test, models):
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        model.fit(X_train, y_train)
        print(model.score(X_train, y_train)) # For training acc
        print("OOB score:", model.oob_score)

        y_pred = model.predict(X_test)

        accuracy = round(accuracy_score(y_test, y_pred)*100,2)

        report[list(model.keys())[i]] = accuracy

    return report

In [45]:
evaluate_model(X_train, y_train, X_test, y_test, models)

# Idhyla avungalukku result vandhuchu, why he got 100 acc for lr?
# Because dataset is very very small

ValueError: Found input variables with inconsistent numbers of samples: [195, 49]

In [46]:
# Hyperparameter tuning

params = {
    "n_estimators":[50,100,200],
    "criterion" : ["gini", "entropy"],
    "max_depth" : [3,5,10]
}

In [47]:
# Random search cv

from sklearn.model_selection import RandomizedSearchCV

In [49]:
model = RandomForestClassifier(oob_score=True)

In [52]:
cv = RandomizedSearchCV(model, param_distributions=params, scoring="accuracy", cv=5, verbose=5)

In [53]:
cv

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(oob_score=True),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 5, 10],
                                        'n_estimators': [50, 100, 200]},
                   scoring='accuracy', verbose=5)

In [54]:
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.949 total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.974 total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.923 total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=50;, score=0.923 total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.923 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.974 total time=   0.1s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.923 total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.949 total time=   0.1s
[CV 1/5] 

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(oob_score=True),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 5, 10],
                                        'n_estimators': [50, 100, 200]},
                   scoring='accuracy', verbose=5)

In [56]:
# Now I need to chec the best parameter

cv.best_params_

{'n_estimators': 100, 'max_depth': 3, 'criterion': 'entropy'}

In [58]:
bestmodel = RandomForestClassifier(n_estimators=200, max_depth=10, criterion="gini", oob_score=True)

# idhla highest acc 97 vandhuchu andha parameter ah potu namma test ku prediction kandupuduchom
# But adhuku 95 dha vandhuchu, because idhu train acc, adhu test acc

In [59]:
y_pred = bestmodel.predict(X_test)
accuracy = round(accuracy_score(y_test, y_pred)*100,2) # It has runned for him

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [60]:
accuracy  # it has runned for him.

NameError: name 'accuracy' is not defined

                                                                            Completed