# **Random Forest Assignment Quiz**

In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

#### Census_income.csv

In [2]:
# Import the required dataset
df_census = pd.read_csv(r'D:\Intellipaat\Datasets\Census_income.csv')
df_census

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,annual_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
# Display the records of unmarried people working less than 20 hours per week
df_census.loc[(df_census['marital-status'] == 'Never-married') & (df_census['hours-per-week'] < 20)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,annual_income
168,18,Private,25828,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,16,United-States,<=50K
178,18,Private,183930,HS-grad,9,Never-married,Other-service,Own-child,White,Male,0,0,12,United-States,<=50K
209,17,Private,65368,11th,7,Never-married,Sales,Own-child,White,Female,0,0,12,United-States,<=50K
262,17,Private,245918,11th,7,Never-married,Other-service,Own-child,White,Male,0,0,12,United-States,<=50K
280,22,Private,34918,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,15,Germany,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32345,18,Private,347336,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,12,United-States,<=50K
32356,23,Private,133355,Some-college,10,Never-married,Adm-clerical,Own-child,White,Male,0,0,15,United-States,<=50K
32494,82,?,403910,HS-grad,9,Never-married,?,Not-in-family,White,Male,0,0,3,United-States,<=50K
32524,26,Private,191648,Assoc-acdm,12,Never-married,Machine-op-inspct,Other-relative,White,Female,0,0,15,United-States,<=50K


In [4]:
# select object dtype for encoding
obj_cols = df_census.select_dtypes('object').columns

In [5]:
# Encode the object dtype to numeric
le = LabelEncoder()

for col in obj_cols:
    if df_census[col].nunique() <= 2:
        df_census[col] = le.fit_transform(df_census[col])
    else :
        df_census = pd.get_dummies(df_census, columns=[col])

df_census = df_census.astype(int)

In [6]:
# Check for the null values
df_census.isna().sum().sum()

0

In [7]:
# Check for the duplicates
df_census.duplicated().sum()

24

In [8]:
# Remove the duplicates
df_census.drop_duplicates(inplace=True)

In [9]:
# Initialize independent & dependent variable
X = df_census.drop(columns=['annual_income'])
y = df_census['annual_income']

In [10]:
# Split the data into training & testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [11]:
# Set the parameters for random search
param_grid = {'n_estimators' : [25, 50, 75, 100], 
              'max_depth' : [3, 5, 7, 9, 10], 
              'max_leaf_nodes' : [3, 6, 9, 10, 12, 14, 16, 18, 20]}

In [12]:
# initialize the model
model = RandomForestClassifier()

In [13]:
# Random seach CV
random_search = RandomizedSearchCV(model, param_grid)
random_search.fit(X_train, y_train)

In [14]:
# Get the best parameter for the model
random_search.best_params_

{'n_estimators': 50, 'max_leaf_nodes': 18, 'max_depth': 10}

In [15]:
# Initialize the model on best parameters
model2 = RandomForestClassifier(n_estimators=25, max_leaf_nodes=16, max_depth=9)
model2.fit(X_train, y_train)    # Train the model

In [16]:
# Predict the value using model2
y_pred = model2.predict(X_test)

In [17]:
# Accuracy of model2
accuracy_score(y_test, y_pred)

0.8414259373079287

1. What is the biggest advantage that helps random forest classifiers to triumph over the decision trees?
- A. It has shown great predictive results over decision tree models.
- B. It Combines all positive predictions from all decision trees
- C. It works on the bagging method(bootstrap method)
- D. All of the above


-> D. All of the above

2. In a given problem where you have a very large dataset with both continuous and categorical features, why would you choose the random forest classifier?
- A. Random forest can work on both regression and classification problem
- B. High accuracy with less need for interpretation
- C. Works well with the high dimensional data
- D. All of the above

-> D. All of the above

3. Which of the following techniques is used in the Random Forest model?
- A. Bagging
- B. Boosting
- C. Ensemble
- D. None of the these

-> A. Bagging

4. Choose the total population with income greater than 50% income?
- A. 75%
- B. 25%
- C. 24.08%
- D. 35%


-> C. 24.08 %

5. Compute how many samples of the population are unmarried and working hours less than 20 hours?
- A. 354
- B. 467
- C. 785
- D. 123


-> C. 785

6. Choose the correct list of age with minimum age , maximum and 50th Percentile of the age group?
- A. [17,90,36]
- B. [15,95,37]
- C. [17,90,37]
- D. All

-> C. [17, 90, 37]

7. From above census data which country has the highest population and the lowest population?
- A. United-States and scotland
- B. United-States and Holland-Netherlands
- C. Scotland and Holland-Netherlands
- D. Mexico and Holland-Netherlands


-> B. United-States & Holland-Netherlands

8. How does n_estimators work in the random forest classifier?
- A. Number of random forests for the classifier.
- B. Number of iterations
- C. Training epochs
- D. Number of decision trees

-> D. Number of decision trees

9. Can the target data for the random forest model be categorical or continuous value?
- A. Yes
- B. No

-> Yes

10. How can you use hyperparameter tuning to your advantage while working with the random
forest classifier?
- A. Improve the model’s performance
- B. Normalizes the features
- C. Standardization of the data
- D. All of the above

-> A. Improve the model's performance

11. Select the best hyperparameters by RandomSearchCV and fit the model with the best hyperparameters and compute the accuracy score of the model.
- A. 90% and above
- B. 50% to 70%
- C. 30% to 50&
- D. None of the above

-> D. None of the above

12. Which of the following Two features are most important in Random forest model?
- A. Predict_proba
- B. Correlation between 2 trees and how strong an individual tree is
- C. sensitivity and specificity
- D. None of the above

-> D. None of the above

13. Based on what values, the feature importance will be calculated?
- A. mean increase gini and mean decrease accuracy
- B. Mean decrease gini and mean decrease accuracy
- C. mean increase gini and mean increase accuracy
- D. All of the above

-> B. Mean decrease gini and mean decrease accuracy

14. From the above model, state the disadvantage of the random forest?
- A. It is a time consuming model building process
- B. It is same as all other model
- C. It’s training time is huge due to the complexity of the model
- D. None of the above

-> C. It's training time is huge due to the complexity of model

15. Which are the two methods used for hyperparameter tuning and cross-validation?
- A. RandomForestCLassifier
- B. RandomizedSearchCV
- C. GridSearchCV
- D. RandomizedSearchCV and GridSearchCV

-> D. Randomized Search CV & Grid Search CV

END OF CENSUS INCOME

---

#### Airport_passengers_satisfication.csv

In [33]:
# Import the required dataset
df_aps = pd.read_csv(r'D:\Intellipaat\Datasets\Airport_passengers_satisfication.csv')
df_aps

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,...,2,4,1,3,2,2,2,0,0.0,dissatisfied
3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,...,2,2,2,2,4,2,4,0,20.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,78463,Male,disloyal Customer,34,Business travel,Business,526,3,3,3,...,4,3,2,4,4,5,4,0,0.0,dissatisfied
25972,71167,Male,Loyal Customer,23,Business travel,Business,646,4,4,4,...,4,4,5,5,5,5,4,0,0.0,satisfied
25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,...,2,4,3,4,5,4,2,0,0.0,dissatisfied
25974,90086,Male,Loyal Customer,14,Business travel,Business,1127,3,3,3,...,4,3,2,5,4,5,4,0,0.0,satisfied


In [34]:
# Display the records of unsatisfied female passengers
df_aps.loc[(df_aps['Gender'] == 'Female') & (df_aps['satisfaction'] == 'dissatisfied')]

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
11,115550,Female,Loyal Customer,33,Business travel,Business,325,2,5,5,...,2,2,2,2,3,2,4,18,7.0,dissatisfied
16,124915,Female,Loyal Customer,31,Business travel,Eco,728,2,5,5,...,2,4,3,3,4,3,2,2,0.0,dissatisfied
18,76872,Female,Loyal Customer,43,Personal Travel,Eco,1927,3,4,3,...,5,5,3,5,4,5,3,0,0.0,dissatisfied
24,82602,Female,disloyal Customer,30,Business travel,Eco,528,4,3,5,...,2,3,2,3,4,4,2,0,0.0,dissatisfied
38,70990,Female,disloyal Customer,32,Business travel,Business,802,4,4,4,...,2,4,2,4,3,5,2,0,10.0,dissatisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25965,15949,Female,Loyal Customer,51,Personal Travel,Eco,528,4,4,4,...,5,5,4,5,4,5,4,0,0.0,dissatisfied
25967,90347,Female,disloyal Customer,39,Business travel,Business,404,1,1,1,...,2,5,3,4,4,4,2,0,0.0,dissatisfied
25970,25309,Female,disloyal Customer,36,Business travel,Eco,432,1,5,1,...,4,5,2,5,2,3,4,0,0.0,dissatisfied
25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,...,2,4,3,4,5,4,2,0,0.0,dissatisfied


In [35]:
# Display the records of female passengers
df_aps[df_aps['Gender'] == 'Female']

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,...,4,4,4,4,3,4,5,0,0.0,satisfied
4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,...,2,2,2,2,4,2,4,0,20.0,satisfied
6,79433,Female,Loyal Customer,77,Business travel,Business,3987,5,5,5,...,5,5,5,5,4,5,3,0,0.0,satisfied
7,97286,Female,Loyal Customer,43,Business travel,Business,2556,2,2,2,...,4,4,4,4,5,4,3,77,65.0,satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25965,15949,Female,Loyal Customer,51,Personal Travel,Eco,528,4,4,4,...,5,5,4,5,4,5,4,0,0.0,dissatisfied
25967,90347,Female,disloyal Customer,39,Business travel,Business,404,1,1,1,...,2,5,3,4,4,4,2,0,0.0,dissatisfied
25970,25309,Female,disloyal Customer,36,Business travel,Eco,432,1,5,1,...,4,5,2,5,2,3,4,0,0.0,dissatisfied
25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,...,2,4,3,4,5,4,2,0,0.0,dissatisfied


In [36]:
# Initialize a dataframe having rating columns only
rating_cols = df_aps.drop(columns=['id', 'Gender',	'Customer Type', 'Age', 'Type of Travel', 'Class', 'Flight Distance', 'Departure Delay in Minutes', 'Arrival Delay in Minutes', 'satisfaction'])

In [37]:
rating_cols.columns

Index(['Inflight wifi service', 'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness'],
      dtype='object')

In [38]:
# Initialize average score column having row wise average of rating columns
df_aps['ave_score'] = df_aps[rating_cols.columns].mean(axis=1)
df_aps

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,ave_score
0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,...,5,5,5,2,5,5,50,44.0,satisfied,4.142857
1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,...,4,4,4,3,4,5,0,0.0,satisfied,3.428571
2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,...,4,1,3,2,2,2,0,0.0,dissatisfied,2.142857
3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,...,1,1,1,3,1,4,0,6.0,satisfied,1.785714
4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,...,2,2,2,4,2,4,0,20.0,satisfied,2.642857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,78463,Male,disloyal Customer,34,Business travel,Business,526,3,3,3,...,3,2,4,4,5,4,0,0.0,dissatisfied,3.357143
25972,71167,Male,Loyal Customer,23,Business travel,Business,646,4,4,4,...,4,5,5,5,5,4,0,0.0,satisfied,4.285714
25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,...,4,3,4,5,4,2,0,0.0,dissatisfied,3.000000
25974,90086,Male,Loyal Customer,14,Business travel,Business,1127,3,3,3,...,3,2,5,4,5,4,0,0.0,satisfied,3.642857


In [39]:
# Counts of satisfied & dissatisfied passengers
df_aps['satisfaction'].value_counts()

satisfaction
dissatisfied    14573
satisfied       11403
Name: count, dtype: int64

In [40]:
# Replace the values
df_aps.replace(('dissatisfied', 'satisfied'), (0, 1), inplace=True)
df_aps

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,ave_score
0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,...,5,5,5,2,5,5,50,44.0,1,4.142857
1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,...,4,4,4,3,4,5,0,0.0,1,3.428571
2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,...,4,1,3,2,2,2,0,0.0,0,2.142857
3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,...,1,1,1,3,1,4,0,6.0,1,1.785714
4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,...,2,2,2,4,2,4,0,20.0,1,2.642857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,78463,Male,disloyal Customer,34,Business travel,Business,526,3,3,3,...,3,2,4,4,5,4,0,0.0,0,3.357143
25972,71167,Male,Loyal Customer,23,Business travel,Business,646,4,4,4,...,4,5,5,5,5,4,0,0.0,1,4.285714
25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,...,4,3,4,5,4,2,0,0.0,0,3.000000
25974,90086,Male,Loyal Customer,14,Business travel,Business,1127,3,3,3,...,3,2,5,4,5,4,0,0.0,1,3.642857


In [41]:
# Initialize independent & dependent variable
X = pd.DataFrame(df_aps['ave_score'])
y = pd.DataFrame(df_aps['satisfaction'])

### Decision tree classifier

In [42]:
# Split the data into train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
# Initialize and train the model
dec_tree = DecisionTreeClassifier()
dec_tree.fit(X_train, y_train)

In [44]:
# Predict the data using model
y_pred = dec_tree.predict(X_test)

In [45]:
# Accuracy of model
accuracy_score(y_test, y_pred)

0.722671285604311

### Random forest classifier

In [46]:
# Split the data into training & testing
X_train, X_test, y_train, y_test = train_test_split(rating_cols, y, test_size=0.3, random_state=24)

In [47]:
# Initialize & train the model
model = RandomForestClassifier(n_estimators=200, random_state=24)
model.fit(X_train, y_train)

In [48]:
# Predict the data using model
y_pred = model.predict(X_test)

In [49]:
# Accuracy of model
accuracy_score(y_test, y_pred)

0.9358398562812781

#### Take the original data

In [50]:
df_aps

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction,ave_score
0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,...,5,5,5,2,5,5,50,44.0,1,4.142857
1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,...,4,4,4,3,4,5,0,0.0,1,3.428571
2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,...,4,1,3,2,2,2,0,0.0,0,2.142857
3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,...,1,1,1,3,1,4,0,6.0,1,1.785714
4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,...,2,2,2,4,2,4,0,20.0,1,2.642857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,78463,Male,disloyal Customer,34,Business travel,Business,526,3,3,3,...,3,2,4,4,5,4,0,0.0,0,3.357143
25972,71167,Male,Loyal Customer,23,Business travel,Business,646,4,4,4,...,4,5,5,5,5,4,0,0.0,1,4.285714
25973,37675,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,...,4,3,4,5,4,2,0,0.0,0,3.000000
25974,90086,Male,Loyal Customer,14,Business travel,Business,1127,3,3,3,...,3,2,5,4,5,4,0,0.0,1,3.642857


In [51]:
# Drop the unwanted columns
df_aps.drop(columns=['id', 'Arrival Delay in Minutes'], inplace=True)
df_aps

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,satisfaction,ave_score
0,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,4,...,5,5,5,5,2,5,5,50,1,4.142857
1,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,1,...,4,4,4,4,3,4,5,0,1,3.428571
2,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,4,...,2,4,1,3,2,2,2,0,0,2.142857
3,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,2,...,1,1,1,1,3,1,4,0,1,1.785714
4,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,3,...,2,2,2,2,4,2,4,0,1,2.642857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25971,Male,disloyal Customer,34,Business travel,Business,526,3,3,3,1,...,4,3,2,4,4,5,4,0,0,3.357143
25972,Male,Loyal Customer,23,Business travel,Business,646,4,4,4,4,...,4,4,5,5,5,5,4,0,1,4.285714
25973,Female,Loyal Customer,17,Personal Travel,Eco,828,2,5,1,5,...,2,4,3,4,5,4,2,0,0,3.000000
25974,Male,Loyal Customer,14,Business travel,Business,1127,3,3,3,3,...,4,3,2,5,4,5,4,0,1,3.642857


In [52]:
# Get dummies (Each categorical column values will have it's own column in boolean value)
df_aps = pd.get_dummies(df_aps)

In [53]:
# Convert dtype 
df_aps = df_aps.astype(int)

In [54]:
df_aps.dtypes

Age                                  int32
Flight Distance                      int32
Inflight wifi service                int32
Departure/Arrival time convenient    int32
Ease of Online booking               int32
Gate location                        int32
Food and drink                       int32
Online boarding                      int32
Seat comfort                         int32
Inflight entertainment               int32
On-board service                     int32
Leg room service                     int32
Baggage handling                     int32
Checkin service                      int32
Inflight service                     int32
Cleanliness                          int32
Departure Delay in Minutes           int32
satisfaction                         int32
ave_score                            int32
Gender_Female                        int32
Gender_Male                          int32
Customer Type_Loyal Customer         int32
Customer Type_disloyal Customer      int32
Type of Tra

In [55]:
# Initialize independent & dependent variables
X = df_aps.drop(columns=['satisfaction'])
y = df_aps['satisfaction']

In [56]:
# Split the data into training & testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [57]:
# Initialize random forest classifier model
model = RandomForestClassifier(n_estimators=500, criterion='entropy')

In [58]:
# Train the model
model.fit(X_train, y_train)

In [59]:
# Predict the data using model
y_pred = model.predict(X_test)

In [60]:
# Check F1 score of model
f1_score(y_test, y_pred)

0.9508636092912448

END

---