In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [53]:
airline = pd.read_csv("data/train_airline.csv")

In [54]:
airline.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [65]:
airline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  object 
 1   Customer Type                      103904 non-null  object 
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  object 
 4   Class                              103904 non-null  object 
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    1039

In [56]:
airline["satisfaction"] = airline["satisfaction"].map({"neutral or dissatisfied": 0, "satisfied":1})
airline = airline.drop(["id"], axis=1)

In [57]:
X = airline.drop(["satisfaction"], axis=1)
y = airline["satisfaction"]

In [58]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                    test_size=0.2, random_state=42)

In [59]:
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

In [61]:
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy= "most_frequent")),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [62]:
full_processor = ColumnTransformer(transformers=[
    ('number', numeric_pipeline, numerical_features),
    ('category', categorical_pipeline, categorical_features)
])

In [63]:
svm_clf = SVC()
svm_pipeline = Pipeline(steps=[
    ('preprocess', full_processor),
    ('model', svm_clf)
])

In [64]:
svm_pipeline.fit(X_train, y_train)
y_pred = svm_pipeline.predict(X_valid)

In [66]:
single_svm_acc = accuracy_score(y_pred, y_valid)
print("Single SVC accuracy: {}%".format(single_svm_acc * 100))

Single SVC accuracy: 94.57677686348106%


In [47]:
airline_test = pd.read_csv("data/test_airline.csv")

In [48]:
airline_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25976 entries, 0 to 25975
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 25976 non-null  int64  
 1   Gender                             25976 non-null  object 
 2   Customer Type                      25976 non-null  object 
 3   Age                                25976 non-null  int64  
 4   Type of Travel                     25976 non-null  object 
 5   Class                              25976 non-null  object 
 6   Flight Distance                    25976 non-null  int64  
 7   Inflight wifi service              25976 non-null  int64  
 8   Departure/Arrival time convenient  25976 non-null  int64  
 9   Ease of Online booking             25976 non-null  int64  
 10  Gate location                      25976 non-null  int64  
 11  Food and drink                     25976 non-null  int

In [72]:
airline_test["satisfaction"] = airline_test["satisfaction"].map({"neutral or dissatisfied": 0, "satisfied":1})
airline_test = airline_test.drop(["id"], axis=1)

KeyError: "['id'] not found in axis"

In [69]:
X = airline_test.drop(["satisfaction"], axis=1)
y = airline_test["satisfaction"]

In [70]:
y_pred = svm_pipeline.predict(X)

In [71]:
single_svm_acc = accuracy_score(y_pred, y)
print("Single SVC accuracy: {}%".format(single_svm_acc * 100))

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=[0 1] and y_pred=['neutral or dissatisfied' 'satisfied']. Make sure that the predictions provided by the classifier coincides with the true labels.