# Airline Passenger Satisfaction

## Table of contents
---------------------------------------------
* Project Methodology  
    - 01. Import CPU Python Libraries 
    - 02. Function Helper
    - 03. Import Dataset & Data Description
    - 04. Data Understanding
    - 05. Select the Features
    - 06. Data Pre-Processing
    - 07. Exploratory Data Analysis
    - 08. Data Transformation
    - 09. Feature Selection
    - 10. Feature Engineering 
    - 11. Statistics
    - 12. Resampling Data
    - 13. Data Splitting 
    - 14. Machine Learning Models 
    - 15. Accuracy Score Summary 
---------------------------------------------


# 01. Import CPU Python Libraries

In [None]:
import pandas as pd
import numpy as np 
np.iinfo(np.uint64).max

import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
from termcolor import colored 
import seaborn as sns  
from tabulate import tabulate

# Importing plotly and cufflinks in offline mode
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

# Figure&Display options
plt.rcParams["figure.figsize"] = (10,6)
pd.set_option('max_colwidth',200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)


# Feature Engineering
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le_orig = LabelEncoder()

# Data Transformation 
from sklearn.preprocessing import StandardScaler

# Feature Selection
from mlxtend.feature_selection import SequentialFeatureSelector

# Import Resampling Library
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean
from imblearn.over_sampling import SVMSMOTE
from imblearn.pipeline import Pipeline

# Data Splitting 
from sklearn.model_selection import train_test_split

# Feature Selection
from mlxtend.feature_selection import SequentialFeatureSelector

# sklearn Classifiers Evaluation libraries
from sklearn.metrics import classification_report # To get classification report
from sklearn.metrics import confusion_matrix # To get the confusion matrix
from sklearn.metrics import accuracy_score # To get the accuracy score 

# Supervised Machine Learning Models

## Random Forest Classifiers
from sklearn.ensemble import RandomForestClassifier

## Gradient Boosting Classifier 
from sklearn.ensemble import GradientBoostingClassifier
## Histogram-based Gradient Boosting Classification Tree
from sklearn.ensemble import HistGradientBoostingClassifier
## AdaBoost Classifier
from sklearn.ensemble import AdaBoostClassifier

## Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier

## K-Nearest Neighbors Classifier
from sklearn.neighbors import KNeighborsClassifier

## Naive Bayes Classifiers
from sklearn.naive_bayes import GaussianNB # DV
## Naive Bayes classifier for multivariate Bernoulli models
from sklearn.naive_bayes import BernoulliNB # 2 | 3 DV


## Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

## Logistic Regression Classifiers
from sklearn.linear_model import LogisticRegression
## Logistic Regression CV classifier
from sklearn.linear_model import LogisticRegressionCV
## Linear classifiers with stochastic gradient descent SGD training.
from sklearn.linear_model import SGDClassifier
## Linear Perceptron Classifier
from sklearn.linear_model import Perceptron

## XGBoost Classifiers
from xgboost import XGBClassifier

## Support Vector Machines Classifiers
from sklearn.svm import SVC
## Linear Support Vector Classification
from sklearn.svm import LinearSVC

## Multilayer Perceptron Classifier
from sklearn.neural_network import MLPClassifier

# 02. Function Helper

In [None]:
'''
Drop Variables
'''
def DropVariables(dfDrop, col):
    """Return ``dfDrop`` without the column(s) named in ``col``.

    ``col`` may be a single label or a list of labels. The input frame is
    left untouched; a new frame is returned.
    """
    return dfDrop.drop(columns=col)

'''
Convert Data Type
'''
def Convert_Data_Typt_to_str(Con_df_str, col):
    """Turn every value of the listed columns into its ``str`` form.

    The frame is modified in place (and also returned); one confirmation
    line with the resulting dtype is printed per converted column.
    """
    for name in col:
        Con_df_str[name] = Con_df_str[name].apply(str)
        resulting_dtype = Con_df_str[name].dtype
        print ('\nData Type Changed to Objective for Variable: [', name , '] Data type now is: ' , resulting_dtype)

    return Con_df_str
      
'''
Missing Value Information
'''
def missing_values(df_missing_value_per):
    """Summarise missing data per column.

    Returns a frame indexed by column name with two columns:
    ``Missing_Number`` (count of nulls) and ``Missing_Percent`` (that count
    divided by the number of rows), ordered from most to fewest nulls.
    """
    null_counts = df_missing_value_per.isnull().sum()
    row_total = df_missing_value_per.shape[0]

    by_count = null_counts.sort_values(ascending=False)
    by_share = (null_counts / row_total).sort_values(ascending=False)
    summary = pd.concat([by_count, by_share], axis=1, keys=['Missing_Number', 'Missing_Percent'])

    # Counts are never negative, so this mask keeps every row.
    keep = summary['Missing_Number'] >= 0
    return summary[keep]

def missing_values_info(df_missing_value):
    """Print the missing-value summary of the frame, framed by a bold heading and a red rule."""
    heading = colored("Missing Values for Catuogirical Dataset:\n", attrs=['bold'])
    rule = colored('-'*79, 'red', attrs=['bold'])
    print(heading, missing_values(df_missing_value), '\n', rule, sep='')

def SelectRowsHavingMissing_Value(df_Missing_Rows):
    """Return only the rows that contain at least one null value."""
    row_has_null = df_Missing_Rows.isnull().any(axis=1)
    return df_Missing_Rows[row_has_null]

'''
Split the Dataset Variables to Obj and Num
'''
def Data_Splitting_Num_Obj(df_split):
    """Split a frame by dtype family.

    Returns ``(object_columns_frame, numeric_columns_frame)`` - note the
    object part comes first in the returned tuple.
    """
    numeric_part = df_split.select_dtypes(include=np.number)
    object_part = df_split.select_dtypes(include=['object'])
    return object_part, numeric_part

'''
Exploratory Data Analysis (EDA)
'''
def EDA_target(df_EDA,target):
    """Print the class counts of an object-typed target and draw them as a pie chart.

    Parameters
    ----------
    df_EDA : DataFrame holding the target column.
    target : name of the target column.

    Nothing happens when the target column is not of ``object`` dtype.
    """
    if df_EDA[target].dtype != 'object':
        return

    counts = df_EDA[target].value_counts()
    print(counts)

    plt.figure(figsize=(10,10))
    # One offset per class: a fixed two-element list makes plt.pie fail
    # whenever the target does not have exactly two classes.
    explode = [0.1] * len(counts)
    # Label the slices so each share can be attributed to its class.
    plt.pie(counts, explode=explode, labels=counts.index, autopct='%1.1f%%', shadow=True, startangle=140)
    plt.title(target)
    plt.axis('off')

def obj_EDA(df_EDA, cols):
    """For each listed column, print its frequency table and plot its relative frequencies as bars."""
    bar_colors = ['darkorange','steelblue']
    for name in cols:
        series = df_EDA[name]

        frequency_table = pd.DataFrame(series.value_counts())
        print(tabulate(frequency_table, headers = 'keys', tablefmt = 'psql'))

        plt.figure(figsize = (8,5))
        shares = series.value_counts(normalize = True)
        shares.plot(kind='bar', color= bar_colors, alpha = 0.9, rot=0)
        plt.title(name)
        plt.show()

def EDA_obj(df_EDA, cols, Target):
    """Cross-tabulate each listed column against ``Target`` and plot the counts as bars via ``iplot``."""
    for name in cols:
        feature = df_EDA[name]
        print('Variable Name: ', feature.name)
        counts = pd.crosstab(feature, df_EDA[Target])
        counts.iplot(kind="bar")

def EDA_num(df_EDA, cols):
    """Plot the listed columns: one histogram each, then a joint box plot and per-column line subplots."""
    for name in cols:
        _, axis = plt.subplots(figsize=(20, 10))
        df_EDA.hist(column=[name], ax=axis)

    selected = df_EDA[cols]
    selected.plot.box(figsize=(20, 10))
    selected.plot(subplots=True, figsize=(20, 10))
    
def EDA_num_with_DV(df_eda, NUMcols_eda, target_col=None):
    """Plot a 30-bin histogram with KDE for each listed column, coloured by the target.

    Parameters
    ----------
    df_eda : DataFrame holding the listed columns and the target column.
    NUMcols_eda : iterable of column names to plot.
    target_col : name of the column used as ``hue``. When omitted, the
        notebook-level variable ``target`` is used, which is what this
        helper silently relied on before the parameter existed.
    """
    for col in NUMcols_eda:
        # Resolved lazily so that an empty column list never touches the global.
        hue_col = target if target_col is None else target_col

        sns.set_style('whitegrid')
        plt.figure(figsize=(20,10))
        sns.set_context('paper', font_scale=1.5)

        sns.histplot(x=col, data=df_eda, bins=30, hue=hue_col, palette="Blues", kde=True).set_title(col, fontsize=20)
        # NOTE(review): the legend labels are hard-coded and replace the ones
        # seaborn derives from the hue column - confirm their order matches the plot.
        plt.legend(['not satisfaction','satisfaction'], shadow=True, loc=0)

'''
Feature Selection
'''
def Feature_selection_forward(df_forward, target):
    """Pick numeric features by forward sequential selection.

    A ``SequentialFeatureSelector`` wrapped around a random forest adds one
    numeric feature at a time, scoring every candidate subset by 5-fold
    cross-validated accuracy on an 80% training split, and keeps the subset
    size (from 1 up to all numeric features) with the best score.

    Parameters
    ----------
    df_forward : DataFrame with the predictors and the target column.
    target : name of the target column (object or numeric dtype).

    Returns
    -------
    (DataFrame, DataFrame)
        The input frame reduced to the selected numeric features, all object
        columns and the target; and the selector's per-step metric table.

    Raises
    ------
    ValueError
        If the frame has no numeric predictor to select from.
    """
    ## Split the df to Obj and num
    obj, num = Data_Splitting_Num_Obj(df_forward)

    # The target must never be a candidate predictor. When it is numeric it is
    # already among the numeric columns, so appending it blindly would
    # duplicate the column and break the X / y split below.
    num_features = [name for name in num.columns if name != target]
    list_df_num = num_features + [target]
    print('All the Num Variables:',list_df_num)
    list_df_Obj = list(obj.columns)
    print('\nAll the obj Variables:',list_df_Obj)

    if not num_features:
        raise ValueError('Feature_selection_forward needs at least one numeric predictor column.')

    # define dataset
    X = df_forward[num_features]
    y = df_forward[target]
    # Only the training part is used for the selection; the held-out part is discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    forward_FS = SequentialFeatureSelector(
        RandomForestClassifier(n_jobs=-1),  # n_jobs=-1: use all CPU cores
        k_features=(1, X.shape[1]),
        forward=True,
        floating=False,
        verbose=2,
        scoring='accuracy',
        cv=5,
    ).fit(X_train, y_train)

    print ('Most Variables Can Effect The Target Variables:\n',forward_FS.k_feature_names_)
    print ('\nWith Highest Score:\n',forward_FS.k_score_)

    new_list_num = list(forward_FS.k_feature_names_)
    print('All the Num Variables Selected:',new_list_num)

    # Keep the target exactly once: an object target is already part of the
    # object columns, a numeric one has to be added back explicitly.
    new_df_list = new_list_num + list_df_Obj
    if target not in new_df_list:
        new_df_list.append(target)
    df_forward = df_forward[new_df_list]

    metrics = pd.DataFrame(forward_FS.get_metric_dict()).T

    return df_forward, metrics

'''
Data Transformation
'''
def NumStandardScaler(dataframe_series):
    """Standardise every ``float64`` / ``int64`` column of the frame.

    Each qualifying column is replaced in place by the output of a freshly
    fitted ``StandardScaler``; other columns are left alone. The same frame
    object is returned.
    """
    scalable_dtypes = ('float64', 'int64')
    for name in dataframe_series.columns.tolist():
        column = dataframe_series[name]
        if column.dtype not in scalable_dtypes:
            continue

        print ('\nStandardization Applied On:', name)
        # StandardScaler expects a 2-D array: one column, one row per sample.
        as_2d = column.values.reshape(-1,1)
        dataframe_series[name] = StandardScaler().fit_transform(as_2d)

    return dataframe_series

'''
Feature Engineering 
'''
def AllObjLabelEncoder(dataframe_series):
    """Label-encode a series of ``object`` dtype; return any other series unchanged.

    For an object series the result is the array produced by a fresh
    ``LabelEncoder`` and a confirmation line is printed.
    """
    if dataframe_series.dtype != 'object':
        return dataframe_series

    print('\nLableEncoding Applied On:', dataframe_series.name)
    return LabelEncoder().fit_transform(dataframe_series)
      
'''
Resampling Data
'''
def resampling_by_SMOTE(x_s, y_s):
    """Try four SMOTE-based resampling strategies and return the best-scoring resampled data.

    Strategies, in priority order (earlier wins a tie):
    plain ``SMOTE``; ``SVMSMOTE``; ``SMOTE`` followed by
    ``RandomUnderSampler``; ``SMOTE(k_neighbors=3)`` followed by
    ``RandomUnderSampler``.

    Each strategy resamples ``(x_s, y_s)``; a decision tree is then scored on
    the resampled data with repeated stratified 10-fold cross-validation
    (3 repeats) using ROC AUC, and the mean score is printed.

    NOTE(review): the cross-validation runs on data that has already been
    resampled, so validation folds contain synthetic rows - the scores are
    only meant to rank the strategies against each other.

    Parameters
    ----------
    x_s : feature matrix.
    y_s : target vector.

    Returns
    -------
    (X_resampled, y_resampled) of the strategy with the highest mean ROC AUC.

    Raises
    ------
    ValueError
        If no strategy yields a valid (non-NaN) score. Previously this case
        fell through every comparison and returned ``None``, which only
        surfaced later as an unpacking error at the call site.
    """
    model = DecisionTreeClassifier()
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    under = RandomUnderSampler()

    # (label used in the printed report, resampler)
    strategies = [
        ('SMOTE', SMOTE()),
        ('SMOTE SVM', SVMSMOTE()),
        ('Standerd SMOTE pipeline', Pipeline(steps=[('over', SMOTE()), ('u', under)])),
        ('K neighbors SMOTE pipeline', Pipeline(steps=[('over', SMOTE(k_neighbors=3)), ('u', under)])),
    ]

    best = None  # (score, X_resampled, y_resampled)
    for label, resampler in strategies:
        # fit predictor and target variable
        x_res, y_res = resampler.fit_resample(x_s, y_s)
        # summarize the new class distribution
        print('\nOriginal dataset shape', Counter(y_s))
        print(f'{label} Resample dataset shape', Counter(y_res))
        # Evaluate the strategy
        score = mean(cross_val_score(model, x_res, y_res, scoring='roc_auc', cv=cv, n_jobs=-1))
        print(f'{label} Mean ROC AUC: ', score)

        # NaN scores (failed fits) can never win; a strict '>' keeps the
        # earliest strategy on ties, matching the original priority order.
        if np.isnan(score):
            continue
        if best is None or score > best[0]:
            best = (score, x_res, y_res)

    if best is None:
        raise ValueError('No resampling strategy produced a valid ROC AUC score.')

    print('\n\tThe Highest ROC Score is: ', best[0])
    return best[1], best[2]
      

# 03. Import Dataset & Data Description

## Import CSV File

In [None]:
# Absolute, machine-specific location of the dataset CSV; adjust it before
# running the notebook on another machine.
data_path = 'C:/Users/rbani/OneDrive/Desktop/ML DL NLP/Classification/Airline Passenger Satisfaction/airline_passenger_satisfaction.csv'
# Load the whole file into the working frame used by every later cell.
df = pd.read_csv(data_path)
# Last expression of the cell: displays the loaded frame.
pd.DataFrame(df)

## Data Description

The original dataset contains about 129,880 survey entries and passenger/flight details from a US airline. The dataset was collected from https://www.kaggle.com/datasets/mysarahmadbhat/airline-passenger-satisfaction. In total, there are 24 feature columns of binary, object, int, and float data types. Out of all the features, 14 are survey entries where passengers rate the flight experience on a scale of 0 to 5. After removing NaN values, the resulting dataset used for model building has about 129,487 entries.

IDV Variables Description: 

- Gender: male or female
- Customer type: regular or non-regular airline customer
- Age: the actual age of the passenger
- Type of travel: the purpose of the passenger's flight (personal or business travel)
- Class: business, economy, economy plus
- Flight distance
- Inflight wifi service: satisfaction level with Wi-Fi service on board (0: not rated; 1-5)
- Departure/Arrival time convenient: departure/arrival time satisfaction level (0: not rated; 1-5)
- Ease of Online booking: online booking satisfaction rate (0: not rated; 1-5)
- Gate location: level of satisfaction with the gate location (0: not rated; 1-5)
- Food and drink: food and drink satisfaction level (0: not rated; 1-5)
- Online boarding: satisfaction level with online boarding (0: not rated; 1-5)
- Seat comfort: seat satisfaction level (0: not rated; 1-5)
- Inflight entertainment: satisfaction with inflight entertainment (0: not rated; 1-5)
- On-board service: level of satisfaction with on-board service (0: not rated; 1-5)
- Leg room service: level of satisfaction with leg room service (0: not rated; 1-5)
- Baggage handling: level of satisfaction with baggage handling (0: not rated; 1-5)
- Checkin service: level of satisfaction with checkin service (0: not rated; 1-5)
- Inflight service: level of satisfaction with inflight service (0: not rated; 1-5)
- Cleanliness: level of satisfaction with cleanliness (0: not rated; 1-5)
- Departure delay in minutes
- Arrival delay in minutes

DV Variable Description: 

- Satisfaction
- Neutral or dissatisfied

# 04. Data Understanding

In [None]:
# df.info() prints its own report and returns None, so it is called on its
# own; wrapping it in print() added a stray "None" line to the output.
df.info()
print(colored('-'*79, 'red', attrs=['bold']))
missing_values_info(df)

In [None]:
# Column names grouped by dtype family: object-typed vs. numeric.
Objcols = df.select_dtypes(include='object').columns
NUMcols = df.select_dtypes(include=np.number).columns

In [None]:
# Summary statistics of the numeric columns, one row per column
# (skipped when the frame has no numeric column).
if not NUMcols.empty:
    su_stat = pd.DataFrame(df.describe().transpose())
    print(tabulate(su_stat, tablefmt='psql', headers='keys'))

In [None]:
# Summary statistics of the object columns, one row per column
# (skipped when the frame has no object column).
if not Objcols.empty:
    su_stat = pd.DataFrame(df.describe(include=object).transpose())
    print(tabulate(su_stat, tablefmt='psql', headers='keys'))

In [None]:
# Variance of each numeric column. numeric_only=True is required because the
# frame also holds object columns, on which pandas >= 2.0 raises a TypeError.
pd.DataFrame(df.var(numeric_only=True))

After a first look at the dataset, it was found that:
- The 'ID' variable is not useful for modelling.
- In addition, 'Departure and Arrival Time Convenience', 'Ease of Online Booking', 'Check-in Service', 'Online Boarding', 'Gate Location', 'On-board Service', 'Seat Comfort', 'Leg Room Service', 'Cleanliness', 'Food and Drink', 'In-flight Service', 'In-flight Wifi Service', 'In-flight Entertainment', and 'Baggage Handling' are stored as int, but they should be of object data type because they are measured on an ordinal level.
- There are 393 missing values ('NaN') in the Arrival Delay variable.

# 05. Select The Features

In [None]:
# 1) Target variable
target = 'Satisfaction'

# 2) Variables that carry no information and are dropped
col_drop = ['ID']

# 3) Variables stored in the wrong data type: these columns are converted to str
col_convert_str = [
    'Departure and Arrival Time Convenience',
    'Ease of Online Booking',
    'Check-in Service',
    'Online Boarding',
    'Gate Location',
    'On-board Service',
    'Seat Comfort',
    'Leg Room Service',
    'Cleanliness',
    'Food and Drink',
    'In-flight Service',
    'In-flight Wifi Service',
    'In-flight Entertainment',
    'Baggage Handling',
]

# 06. Data Pre-Processing

Following what was found in the Data Understanding part, this section will include:
- Drop Variable: 'ID'
- Convert data type for 'Departure and Arrival Time Convenience', 'Ease of Online Booking', 'Check-in Service', 'Online Boarding', 'Gate Location', 'On-board Service', 'Seat Comfort', 'Leg Room Service', 'Cleanliness', 'Food and Drink', 'In-flight Service', 'In-flight Wifi Service', 'In-flight Entertainment', 'Baggage Handling' from int to object
- Drop Missing Value 

## Drop Variables

In [None]:
# Remove the columns listed in col_drop from the working frame.
df = DropVariables(df, col_drop)

# df.info() prints its report and returns None, so there is nothing for
# tabulate to format; call it directly instead.
df.info()

## Convert Data Type

In [None]:
# Convert the values of the columns listed in col_convert_str to strings
# (the helper prints the resulting dtype of each column).
df = Convert_Data_Typt_to_str(df,col_convert_str)     

## Missing Value

In [None]:
# Refresh the dtype-based column groups after the conversion to str above.
Objcols = df.select_dtypes(include='object').columns
NUMcols = df.select_dtypes(include=np.number).columns

### Drop Missing Value

In [None]:
# Drop every row containing a null and renumber the rows from 0.
# reset_index(drop=True) discards the old index directly; the previous
# reset_index() + drop(columns=['index']) pair would have removed a genuine
# data column if one were ever named "index".
df = df.dropna().reset_index(drop=True)
missing_values_info(df)
print("Dataset size after remove all the missing value: ",df.shape[0])

# 07. Exploratory Data Analysis (EDA)

In [None]:
# Column groups used by the EDA cells: numeric predictors, and object
# predictors with the target column removed.
NUMcols = df.select_dtypes(include=np.number).columns
Objcols = df.select_dtypes(include='object').columns.drop(target)

## Target

In [None]:
# Print the class counts of the target and draw them as a pie chart
# (the helper only acts on an object-typed target).
EDA_target(df, target)

The pie chart above represents the two possible prediction outcomes of the machine learning models to be developed. As can be seen, the dataset is imbalanced in terms of its possible outcomes, 'Neutral or Dissatisfied' and 'Satisfied'.
This is a crucial insight, as the imbalance needs to be corrected in order to prevent overfitting of the machine learning models to be developed.

## IDV

### IDV Objective

In [None]:
# Frequency table and relative-frequency bar chart for each object predictor.
obj_EDA(df, Objcols)

### IDV Objective With DV

In [None]:
# Cross-tabulate each object predictor against the target and plot the counts as bars.
EDA_obj(df, Objcols, target)

### IDV Numerical

In [None]:
# One histogram per numeric column, then a joint box plot and per-column line subplots.
EDA_num(df, NUMcols)

### IDV Numerical With DV

In [None]:
# Histogram with KDE of each numeric column, coloured by the target column.
EDA_num_with_DV(df, NUMcols)

# 08. Data Transformation

Machine Learning algorithms perform better when numerical input variables are scaled to a standard range.
Standardization scales each input variable separately by subtracting the mean (called centering) and dividing by the standard deviation to shift the distribution to have a mean of zero and a standard deviation of one.

In [None]:
# Standardise every float64/int64 column with StandardScaler; each scaler is
# fitted on the full column, i.e. before any train/test split.
df = NumStandardScaler(df)

In [None]:
# Preview the first rows after standardisation.
pd.DataFrame(df.head())

As shown in the table above, standardisation was applied to the variables "Age", "Flight Distance", "Departure Delay", and "Arrival Delay"; the values of each of these variables are now standardised.

# 09. Feature Selection

Feature Selection helps in finding the smallest set of features which results in

- Training a machine learning algorithm faster.
- Reducing the complexity of a model and making it easier to interpret.
- Building a sensible model with better prediction power.
- Reducing over-fitting by selecting the right set of features.

## Wrapper "Forward"

In [None]:
# Forward sequential selection over the numeric columns: df is reduced to the
# selected numeric features plus the object columns, fs holds the per-step metrics.
df, fs = Feature_selection_forward(df, target)

Forward selection starts with an empty feature set. It fits the model with each individual numerical feature, one at a time, and keeps the feature that gives the highest cross-validated accuracy (5-fold, using a random forest classifier). It then fits models with two features by combining the previously selected feature with each of the remaining features, and again keeps the combination with the highest cross-validated accuracy. It then does the same with three features, and so on, until all numerical features have been added. Finally, the feature subset with the best cross-validated score over all steps is selected.

In [None]:
# Display the per-step metrics table returned by the feature selection.
pd.DataFrame(fs)

The table above displays the average score for each step of the forward selection. The subset that contains only the 'Flight Distance' variable, without any other numerical variable, has the highest average score.
Therefore, out of all the numerical features, only the 'Flight Distance' variable is kept for the model.

In [None]:
# Preview the first rows of the frame returned by the feature selection.
pd.DataFrame(df.head())

# 10. Feature Engineering 

Label encoding is a very simple approach: each distinct value in a column is converted to a number.
In this notebook it is applied to every column of `object` data type; columns of any other type are left unchanged.

In [None]:
# Label-encode every object column; other columns pass through unchanged.
df = df.apply(AllObjLabelEncoder)

In [None]:
# Preview the first rows after label encoding.
pd.DataFrame(df.head())

# 11. Statistics 

## Correlation between IDV and DV

In [None]:
# Correlation of every column with the target, sorted from highest to lowest;
# reset_index() turns the column names into a regular column of the table.
pd.DataFrame(df.corr()[target].sort_values(ascending = False).reset_index())

## Correlation between all the Variables

In [None]:
# Full pairwise correlation matrix of the encoded frame.
pd.DataFrame(df.corr())

In [None]:
# Same correlation matrix, rendered with a "Blues" background gradient.
df.corr().style.background_gradient(cmap="Blues") # YlOrBr Greys GnBu

# 12. Resampling Data

The challenge of working with imbalanced datasets is that most machine learning techniques will ignore the minority class, and in turn perform poorly on it. This section develops an intuition for the SMOTE approaches by applying them to an imbalanced binary classification problem.
- Note: each approach is evaluated using the ROC area under the curve (AUC) metric.

In [None]:
# Define X, y: all columns except the target are predictors.
X = df.drop(target, axis=1)
y = df[target]

# Replace X, y with the resampled data of the best-scoring SMOTE strategy.
# NOTE(review): resampling happens before the train/test split in the later
# splitting cell, so the held-out split is drawn from resampled data too.
X, y = resampling_by_SMOTE(X, y)

As the output shows, the K neighbors SMOTE pipeline achieves the highest mean ROC AUC, so it is the SMOTE approach selected.

In [None]:
# Class counts of the target after resampling.
print('\nthe new target size now after resampling is: ', Counter(y))

# 13. Data Splitting 

Based on the dataset size, the dataset is split into 90% for training and 10% for testing.

In [None]:
# Hold out 10% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

# Report the shape of each part.
for part_name, part in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
    print(f'{part_name}: ', part.shape)

# 14. Machine Learning Models 

* This section fits (trains) 17 machine learning models:
    -        Random Forest Classifier
    -        Gradient Boosting Classifier
    -        Histogram-based Gradient Boosting Classification Tree
    -        AdaBoost Classifier
    -        Extra Trees Classifier
    -        K Neighbors Classifier
    -        Naive Bayes Classifiers
    -        Naive Bayes Classifier for Multivariate Bernoulli
    -        Decision Tree Classifier
    -        Logistic Regression Classifier
    -        Logistic Regression CV Classifier
    -        Stochastic Gradient Descent Classifier
    -        Linear Perceptron Classifier
    -        XGBoost Classifiers
    -        Support Vector Machines Classifiers
    -        Linear Support Vector Classification
    -        Multilayer Perceptron Classifier
* Following that it going to test the models.

* After that it going to evaluate each model using: 
    -   Accuracy Score
    -   Classification Report
    -   Confusion Matrix

## Build the Machine Learning Models on CPU

In [None]:
def _fit_on_train(estimator):
    """Fit ``estimator`` on the training split and return the fitted estimator."""
    return estimator.fit(X_train, y_train)


# Every model is trained with its default hyper-parameters.

# --- Tree ensembles and boosting ---
rf_m_cpu = _fit_on_train(RandomForestClassifier())             # Random Forest
gb_m_cpu = _fit_on_train(GradientBoostingClassifier())         # Gradient Boosting
hgb_m_cpu = _fit_on_train(HistGradientBoostingClassifier())    # Histogram-based Gradient Boosting
ad_m_cpu = _fit_on_train(AdaBoostClassifier())                 # AdaBoost
et_m_cpu = _fit_on_train(ExtraTreesClassifier())               # Extra Trees

# --- Neighbours and Naive Bayes ---
knn_m_cpu = _fit_on_train(KNeighborsClassifier())              # K Neighbors
nb_m_cpu = _fit_on_train(GaussianNB())                         # Gaussian Naive Bayes
bnb_m_cpu = _fit_on_train(BernoulliNB())                       # Multivariate Bernoulli Naive Bayes

# --- Single tree ---
dt_m_cpu = _fit_on_train(DecisionTreeClassifier())             # Decision Tree

# --- Linear models ---
lg_m_cpu = _fit_on_train(LogisticRegression())                 # Logistic Regression
lgcv_m_cpu = _fit_on_train(LogisticRegressionCV())             # Logistic Regression CV
sgdc_m_cpu = _fit_on_train(SGDClassifier())                    # Stochastic Gradient Descent
lpc_m_cpu = _fit_on_train(Perceptron())                        # Linear Perceptron

# --- Gradient-boosted trees (XGBoost) ---
xgb_m_cpu = _fit_on_train(XGBClassifier())                     # XGBoost

# --- Support vector machines ---
svm_m_cpu = _fit_on_train(SVC())                               # Support Vector Machine
lsvm_m_cpu = _fit_on_train(LinearSVC())                        # Linear Support Vector Classification

# --- Neural network ---
mlp_m_cpu = _fit_on_train(MLPClassifier())                     # Multilayer Perceptron

## Predict y_test Using CPU

In [None]:
# Random Forest Classifier
rf_m_pred_vaild_cpu = rf_m_cpu.predict(X_test)

# Gradient Boosting Classifier
gb_m_pred_vaild_cpu = gb_m_cpu.predict(X_test)

# Histogram-based Gradient Boosting Classification Tree
hgb_m_pred_vaild_cpu = hgb_m_cpu.predict(X_test)

# AdaBoost Classifier
ad_m_pred_vaild_cpu = ad_m_cpu.predict(X_test)

# Extra Trees Classifier
et_m_pred_vaild_cpu = et_m_cpu.predict(X_test)

# K Neighbors Classifier
knn_m_pred_vaild_cpu = knn_m_cpu.predict(X_test)

# Naive Bayes Classifiers
nb_m_pred_vaild_cpu = nb_m_cpu.predict(X_test)

# Naive Bayes Classifier for Multivariate Bernoulli
bnb_m_pred_vaild_cpu = bnb_m_cpu.predict(X_test)

# Decision Tree Classifier
dt_m_pred_vaild_cpu = dt_m_cpu.predict(X_test)

# Logistic Regression Classifier
lg_m_pred_vaild_cpu = lg_m_cpu.predict(X_test)

# Logistic Regression CV Classifier
lgcv_m_pred_vaild_cpu =  lgcv_m_cpu.predict(X_test)

# Stochastic Gradient Descent Classifier
sgdc_m_pred_vaild_cpu =  sgdc_m_cpu.predict(X_test)

# Linear Perceptron Classifier
lpc_m_pred_vaild_cpu =  lpc_m_cpu.predict(X_test)

# XGBoost Classifiers
xgb_m_pred_vaild_cpu = xgb_m_cpu.predict(X_test)

# Support Vector Machines Classifiers
svm_m_pred_vaild_cpu = svm_m_cpu.predict(X_test)

# Linear Support Vector Classification
lsvm_m_pred_vaild_cpu = lsvm_m_cpu.predict(X_test)

# Multilayer Perceptron Classifier
mlp_m_pred_vaild_cpu = mlp_m_cpu.predict(X_test)

## Evaluate the Models CPU

### Accuracy Score

In [None]:
# Random Forest Classifier
Valid_accuracy_rf_m_cpu = accuracy_score(y_test, rf_m_pred_vaild_cpu)

# Gradient Boosting Classifier
Valid_accuracy_gb_m_cpu = accuracy_score(y_test, gb_m_pred_vaild_cpu)

# Histogram-based Gradient Boosting Classification Tree
Valid_accuracy_hgb_m_cpu = accuracy_score(y_test, hgb_m_pred_vaild_cpu)

# AdaBoost Classifier
Valid_accuracy_ad_m_cpu = accuracy_score(y_test, ad_m_pred_vaild_cpu)

# Extra Trees Classifier
Valid_accuracy_et_m_cpu = accuracy_score(y_test, et_m_pred_vaild_cpu)

# K Neighbors Classifier
Valid_accuracy_knn_m_cpu = accuracy_score(y_test, knn_m_pred_vaild_cpu)

# Naive Bayes Classifiers
Valid_accuracy_nb_m_cpu = accuracy_score(y_test, nb_m_pred_vaild_cpu)

# Naive Bayes Classifier for Multivariate Bernoulli
Valid_accuracy_bnb_m_cpu = accuracy_score(y_test, bnb_m_pred_vaild_cpu)

# Decision Tree Classifier
Valid_accuracy_dt_m_cpu = accuracy_score(y_test, dt_m_pred_vaild_cpu)

# Logistic Regression Classifier
Valid_accuracy_lg_m_cpu = accuracy_score(y_test, lg_m_pred_vaild_cpu)

# Logistic Regression CV Classifier
Valid_accuracy_lgcv_m_cpu = accuracy_score(y_test, lgcv_m_pred_vaild_cpu)

# Stochastic Gradient Descent Classifier
Valid_accuracy_sgdc_m_cpu = accuracy_score(y_test, sgdc_m_pred_vaild_cpu)

# Linear Perceptron Classifier
Valid_accuracy_lpc_m_cpu = accuracy_score(y_test, lpc_m_pred_vaild_cpu)

# XGBoost Classifiers
Valid_accuracy_xgb_m_cpu = accuracy_score(y_test, xgb_m_pred_vaild_cpu)

# Support Vector Machines Classifiers
Valid_accuracy_svm_m_cpu = accuracy_score(y_test, svm_m_pred_vaild_cpu)

# Linear Support Vector Classification
Valid_accuracy_lsvm_m_cpu = accuracy_score(y_test, lsvm_m_pred_vaild_cpu)
        
# Multilayer Perceptron Classifier
Valid_accuracy_mlp_m_cpu = accuracy_score(y_test, mlp_m_pred_vaild_cpu)

cpu_ml_m_valid_compare = pd.DataFrame({"Standered CPU Models": [
                                                "Random Forest Classifier", 
                                                "Gradient Boosting Classifier",
                                                "Histogram-based Gradient Boosting Classification Tree",
                                                "AdaBoost Classifier",
                                                "Extra Trees Classifier",
                                                "K Neighbors Classifier",
                                                "Naive Bayes Classifiers",
                                                "Naive Bayes Classifier for Multivariate Bernoulli",
                                                "Decision Tree Classifier",
                                                "Logistic Regression Classifier",
                                                "Logistic Regression CV Classifier",
                                                "Stochastic Gradient Descent Classifier",
                                                "Linear Perceptron Classifier",
                                                "XGBoost Classifiers",
                                                "Support Vector Machines Classifiers",
                                                "Linear Support Vector Classification",
                                                "Multilayer Perceptron Classifier"
                                                ],

                                "Accuracy": [
                                                Valid_accuracy_rf_m_cpu, 
                                                Valid_accuracy_gb_m_cpu,
                                                Valid_accuracy_hgb_m_cpu,
                                                Valid_accuracy_ad_m_cpu,
                                                Valid_accuracy_et_m_cpu,
                                                Valid_accuracy_knn_m_cpu,
                                                Valid_accuracy_nb_m_cpu,
                                                Valid_accuracy_bnb_m_cpu,
                                                Valid_accuracy_dt_m_cpu,
                                                Valid_accuracy_lg_m_cpu,
                                                Valid_accuracy_lgcv_m_cpu,
                                                Valid_accuracy_sgdc_m_cpu,
                                                Valid_accuracy_lpc_m_cpu,
                                                Valid_accuracy_xgb_m_cpu,
                                                Valid_accuracy_svm_m_cpu,
                                                Valid_accuracy_lsvm_m_cpu,
                                                Valid_accuracy_mlp_m_cpu
                                                ],
                                        })  
                                              
# Table view: rows ordered from highest to lowest "Accuracy", printed in psql format.
cpu_valid_ranked_desc = cpu_ml_m_valid_compare.sort_values(by="Accuracy", ascending=False)
print(tabulate(cpu_valid_ranked_desc, headers="keys", tablefmt="psql"))

# Bar chart view: rows are sorted ascending by "Accuracy" before being plotted.
cpu_valid_ranked_asc = cpu_ml_m_valid_compare.sort_values(by="Accuracy", ascending=True)
fig = px.bar(
    cpu_valid_ranked_asc,
    x="Accuracy",
    y="Standered CPU Models",
    title="Accuracy Validation for Machines Learning Model on CPU",
)
fig.show()

### Classification Report and Confusion Matrix

#### Random Forest Classifier

In [None]:
# Text report comparing y_test with the Random Forest predictions (3 digits).
rf_m_report_valid_cpu = classification_report(y_test, rf_m_pred_vaild_cpu, digits=3)
print('Model Random Forest Classifier Validation Classification Report:\n ', rf_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
rf_m_cm_valid_cpu = confusion_matrix(y_test, rf_m_pred_vaild_cpu)
sns.heatmap(rf_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Random Forest Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Gradient Boosting Classifier

In [None]:
# Text report comparing y_test with the Gradient Boosting predictions (3 digits).
gb_m_report_valid_cpu = classification_report(y_test, gb_m_pred_vaild_cpu, digits=3)
print('Model Gradient Boosting Classifier Validation Classification Report:\n ', gb_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
gb_m_cm_valid_cpu = confusion_matrix(y_test, gb_m_pred_vaild_cpu)
sns.heatmap(gb_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Gradient Boosting Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Histogram-based Gradient Boosting Classification Tree

In [None]:
# Text report comparing y_test with the histogram-based gradient boosting predictions (3 digits).
hgb_m_report_valid_cpu = classification_report(y_test, hgb_m_pred_vaild_cpu, digits=3)
print(
    'Model Histogram-based Gradient Boosting Classification Tree Validation Classification Report:\n ',
    hgb_m_report_valid_cpu,
)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
hgb_m_cm_valid_cpu = confusion_matrix(y_test, hgb_m_pred_vaild_cpu)
sns.heatmap(hgb_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title(
    "Model Histogram-based Gradient Boosting Classification Tree Validation Confusion Matrix",
    fontsize=14,
)
plt.show()

#### AdaBoost Classifier

In [None]:
# Text report comparing y_test with the AdaBoost predictions (3 digits).
ad_m_report_valid_cpu = classification_report(y_test, ad_m_pred_vaild_cpu, digits=3)
print('Model AdaBoost Classifier Validation Classification Report:\n ', ad_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
ad_m_cm_valid_cpu = confusion_matrix(y_test, ad_m_pred_vaild_cpu)
sns.heatmap(ad_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model AdaBoost Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Extra Trees Classifier

In [None]:
# Text report comparing y_test with the Extra Trees predictions (3 digits).
et_m_report_valid_cpu = classification_report(y_test, et_m_pred_vaild_cpu, digits=3)
print('Model Extra Trees Classifier Validation Classification Report:\n ', et_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
et_m_cm_valid_cpu = confusion_matrix(y_test, et_m_pred_vaild_cpu)
sns.heatmap(et_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Extra Trees Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### K Neighbors Classifier

In [None]:
# Text report comparing y_test with the K Neighbors predictions (3 digits).
knn_m_report_valid_cpu = classification_report(y_test, knn_m_pred_vaild_cpu, digits=3)
print('Model K Neighbors Classifier Validation Classification Report:\n ', knn_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
knn_m_cm_valid_cpu = confusion_matrix(y_test, knn_m_pred_vaild_cpu)
sns.heatmap(knn_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model K Neighbors Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Naive Bayes Classifier 

In [None]:
# Text report comparing y_test with the Naive Bayes predictions (3 digits).
nb_m_report_valid_cpu = classification_report(y_test, nb_m_pred_vaild_cpu, digits=3)
print('Model Naive Bayes Classifier Validation Classification Report:\n ', nb_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
nb_m_cm_valid_cpu = confusion_matrix(y_test, nb_m_pred_vaild_cpu)
sns.heatmap(nb_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Naive Bayes Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Naive Bayes Classifier for Multivariate Bernoulli

In [None]:
# Text report comparing y_test with the Bernoulli Naive Bayes predictions (3 digits).
print('Model Naive Bayes Classifier for Multivariate Bernoulli Validation Classification Report:\n ', classification_report(y_test, bnb_m_pred_vaild_cpu, digits = 3))

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15,10))
sns.heatmap(confusion_matrix(y_test, bnb_m_pred_vaild_cpu),
                    annot=True,fmt = "d",linecolor="k",linewidths=3)

# Title now carries "Validation", matching the report header above and the
# titles of every other model's confusion-matrix plot in this section.
plt.title("Model Naive Bayes Classifier for Multivariate Bernoulli Validation Confusion Matrix",fontsize=14)
plt.show()

#### Decision Tree Classifier

In [None]:
# Text report comparing y_test with the Decision Tree predictions (3 digits).
dt_m_report_valid_cpu = classification_report(y_test, dt_m_pred_vaild_cpu, digits=3)
print('Model Decision Tree Classifier Validation Classification Report:\n ', dt_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
dt_m_cm_valid_cpu = confusion_matrix(y_test, dt_m_pred_vaild_cpu)
sns.heatmap(dt_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Decision Tree Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Logistic Regression Classifier

In [None]:
# Text report comparing y_test with the Logistic Regression predictions (3 digits).
lg_m_report_valid_cpu = classification_report(y_test, lg_m_pred_vaild_cpu, digits=3)
print('Model Logistic Regression Classifier Validation Classification Report:\n ', lg_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
lg_m_cm_valid_cpu = confusion_matrix(y_test, lg_m_pred_vaild_cpu)
sns.heatmap(lg_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Logistic Regression Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Logistic Regression CV Classifier

In [None]:
# Text report comparing y_test with the Logistic Regression CV predictions (3 digits).
lgcv_m_report_valid_cpu = classification_report(y_test, lgcv_m_pred_vaild_cpu, digits=3)
print('Model Logistic Regression CV Classifier Validation Classification Report:\n ', lgcv_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
lgcv_m_cm_valid_cpu = confusion_matrix(y_test, lgcv_m_pred_vaild_cpu)
sns.heatmap(lgcv_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Logistic Regression CV Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Stochastic Gradient Descent Classifier

In [None]:
# Text report comparing y_test with the Stochastic Gradient Descent predictions (3 digits).
sgdc_m_report_valid_cpu = classification_report(y_test, sgdc_m_pred_vaild_cpu, digits=3)
print('Model Stochastic Gradient Descent Classifier Validation Classification Report:\n ', sgdc_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
sgdc_m_cm_valid_cpu = confusion_matrix(y_test, sgdc_m_pred_vaild_cpu)
sns.heatmap(sgdc_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Stochastic Gradient Descent Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Linear Perceptron Classifier

In [None]:
# Text report comparing y_test with the Linear Perceptron predictions (3 digits).
lpc_m_report_valid_cpu = classification_report(y_test, lpc_m_pred_vaild_cpu, digits=3)
print('Model Linear Perceptron Classifier Validation Classification Report:\n ', lpc_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
lpc_m_cm_valid_cpu = confusion_matrix(y_test, lpc_m_pred_vaild_cpu)
sns.heatmap(lpc_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Linear Perceptron Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### XGBoost Classifier

In [None]:
# Text report comparing y_test with the XGBoost predictions (3 digits).
xgb_m_report_valid_cpu = classification_report(y_test, xgb_m_pred_vaild_cpu, digits=3)
print('Model XGBoost Classifier Validation Classification Report:\n ', xgb_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
xgb_m_cm_valid_cpu = confusion_matrix(y_test, xgb_m_pred_vaild_cpu)
sns.heatmap(xgb_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model XGBoost Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Support Vector Machines Classifier

In [None]:
# Text report comparing y_test with the Support Vector Machines predictions (3 digits).
svm_m_report_valid_cpu = classification_report(y_test, svm_m_pred_vaild_cpu, digits=3)
print('Model Support Vector Machines Classifier Validation Classification Report:\n ', svm_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
svm_m_cm_valid_cpu = confusion_matrix(y_test, svm_m_pred_vaild_cpu)
sns.heatmap(svm_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Support Vector Machines Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

#### Linear Support Vector Classification

In [None]:
# Text report comparing y_test with the Linear Support Vector Classification
# predictions (3 digits). The header previously read "Classificationr" (typo).
print('Model Linear Support Vector Classification Validation Classification Report:\n ', classification_report(y_test, lsvm_m_pred_vaild_cpu, digits = 3))

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15,10))
sns.heatmap(confusion_matrix(y_test, lsvm_m_pred_vaild_cpu),
                    annot=True,fmt = "d",linecolor="k",linewidths=3)

plt.title("Model Linear Support Vector Classification Validation Confusion Matrix",fontsize=14)
plt.show()

#### Multilayer Perceptron Classifier

In [None]:
# Text report comparing y_test with the Multilayer Perceptron predictions (3 digits).
mlp_m_report_valid_cpu = classification_report(y_test, mlp_m_pred_vaild_cpu, digits=3)
print('Model Multilayer Perceptron Classifier Validation Classification Report:\n ', mlp_m_report_valid_cpu)

# Confusion matrix drawn as a heatmap with integer cell annotations.
plt.figure(figsize=(15, 10))
mlp_m_cm_valid_cpu = confusion_matrix(y_test, mlp_m_pred_vaild_cpu)
sns.heatmap(mlp_m_cm_valid_cpu, annot=True, fmt="d", linecolor="k", linewidths=3)
plt.title("Model Multilayer Perceptron Classifier Validation Confusion Matrix", fontsize=14)
plt.show()

# 15. Accuracy Score Summary  

## Standard Machine Learning Models

### Prediction Using CPU

In [None]:
# Summary table: rows ordered from highest to lowest "Accuracy", printed in psql format.
summary_ranked_desc = cpu_ml_m_valid_compare.sort_values(by="Accuracy", ascending=False)
print(tabulate(summary_ranked_desc, headers="keys", tablefmt="psql"))

In [None]:
# Summary bar chart: rows are sorted ascending by "Accuracy" before being plotted.
summary_ranked_asc = cpu_ml_m_valid_compare.sort_values(by="Accuracy", ascending=True)
fig = px.bar(
    summary_ranked_asc,
    x="Accuracy",
    y="Standered CPU Models",
    title="Machines Learning CPU Accuracy Validation ",
)
fig.show()

The results of this project show that the Random Forest Classifier, Extra Trees Classifier, and XGBoost Classifier reach accuracies of 96.4, 96.4, and 95.8, respectively.