In [2]:
import pandas as pd

# Methods

In [3]:
def convert_data(df, label_name, expand=False):

    new_df = df.copy()

    categorical_cols = new_df.select_dtypes(include='object').columns
    numeric_cols = new_df.select_dtypes(include=['int64', 'float64']).columns

    # scale numeric values
    for col in numeric_cols:
        if col == label_name:
            continue
        # mix-max scaling
        new_df[col] = (new_df[col] - new_df[col].min()) / (new_df[col].max() - new_df[col].min())

    for col in categorical_cols:
        if col == label_name:
            continue

        # use one hot for categorical features with more than 2 values
        if expand is True:
            unique_values = len(df[col].unique())
            if unique_values == 2:
                new_df[col] = pd.Categorical(new_df[col]).codes
            elif unique_values > 2:
                new_df = pd.get_dummies(new_df, columns=[col], prefix=col)

        # use single column for categorical features
        else:
            new_df[col] = pd.Categorical(new_df[col]).codes

    # convert label to integers and rearrange to start from 0
    new_df[label_name] = pd.Categorical(new_df[label_name]).codes

    # reorder columns
    columns = [col for col in new_df.columns if col != label_name] + [label_name]
    new_df = new_df[columns]

    return new_df


In [4]:
def show_label_balance(df, label_name):
    label_counts = df[label_name].value_counts()
    print("Label balance:")
    for label, count in label_counts.items():
        print(f"Label {label}: {count} ({count / len(df) * 100:.2f}%)")

### Summary

In [None]:
datasets = []
sizes = []

# Data Convertion

### Apple Quality

In [120]:
apple_df = pd.read_csv("data/classification/apple_quality.csv")
apple_df.drop('A_id', axis=1, inplace=True)
apple_df.head()

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,-3.970049,-2.512336,5.34633,-1.012009,1.8449,0.32984,-0.49159,good
1,-1.195217,-2.839257,3.664059,1.588232,0.853286,0.86753,-0.722809,good
2,-0.292024,-1.351282,-1.738429,-0.342616,2.838636,-0.038033,2.621636,bad
3,-0.657196,-2.271627,1.324874,-0.097875,3.63797,-3.413761,0.790723,good
4,1.364217,-1.296612,-0.384658,-0.553006,3.030874,-1.303849,0.501984,good


In [121]:
ready_apple_df = convert_data(apple_df, label_name='Quality', expand=False)

In [122]:
ready_apple_df.head()

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Quality
0,0.234669,0.35837,0.922484,0.368781,0.585819,0.47277,0.452225,1
1,0.439331,0.333107,0.795706,0.558928,0.511408,0.513807,0.436185,1
2,0.505948,0.448092,0.388567,0.417732,0.660388,0.444693,0.668192,0
3,0.479014,0.376971,0.619422,0.435629,0.72037,0.187052,0.54118,1
4,0.628107,0.452317,0.490589,0.402347,0.674814,0.348084,0.52115,1


In [123]:
show_label_balance(ready_apple_df, label_name='Quality')

Label balance:
Label 1: 2004 (50.10%)
Label 0: 1996 (49.90%)


In [124]:
apple_size = len(ready_apple_df)
print(f"data size = {apple_size}")

data size = 4000


In [125]:
datasets.append('apple')
sizes.append(apple_size)

In [126]:
ready_apple_df.to_csv("data/apple_data.csv", index=False)

### Loan Status

In [127]:
loan_df = pd.read_csv("data/classification/loan_data.csv")
loan_df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [128]:
ready_loan_df = convert_data(loan_df, label_name='loan_status', expand=False)

In [129]:
ready_loan_df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,0.016129,0,4,0.008891,0.0,3,1.0,4,0.727023,0.742424,0.035714,0.371739,0,1
1,0.008065,0,3,0.000595,0.0,2,0.014493,1,0.392318,0.121212,0.0,0.247826,1,0
2,0.040323,0,3,0.000617,0.024,0,0.144928,3,0.510974,0.666667,0.035714,0.532609,0,1
3,0.024194,0,1,0.009976,0.0,3,1.0,3,0.67284,0.666667,0.0,0.619565,0,1
4,0.032258,1,4,0.008082,0.008,3,1.0,3,0.606996,0.80303,0.071429,0.426087,0,1


In [130]:
show_label_balance(ready_loan_df, label_name='loan_status')

Label balance:
Label 0: 35000 (77.78%)
Label 1: 10000 (22.22%)


In [131]:
loan_size = len(ready_loan_df)
print(f"data size = {loan_size}")

data size = 45000


In [132]:
datasets.append('loan')
sizes.append(loan_size)

In [133]:
ready_loan_df.to_csv("data/loan_data.csv", index=False)

### Wine Quality

In [134]:
wine_df = pd.read_csv("data/classification/wine_quality.csv")
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [135]:
# keep only certain qualities
labels_to_keep = [5, 6, 7]
wine_df = wine_df[wine_df['quality'].isin(labels_to_keep)]

In [136]:
ready_wine_df = convert_data(wine_df, label_name='quality', expand=False)

In [137]:
ready_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.241071,0.479339,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.565217,0.118012,0.153846,0
1,0.276786,0.628099,0.0,0.116438,0.143573,0.338028,0.215548,0.494126,0.295652,0.192547,0.215385,0
2,0.276786,0.528926,0.050633,0.09589,0.133556,0.197183,0.169611,0.508811,0.347826,0.173913,0.215385,0
3,0.580357,0.132231,0.708861,0.068493,0.105175,0.225352,0.190813,0.582232,0.26087,0.130435,0.215385,1
4,0.241071,0.479339,0.0,0.068493,0.106845,0.140845,0.09894,0.567548,0.565217,0.118012,0.153846,0


In [138]:
show_label_balance(ready_wine_df, label_name='quality')

Label balance:
Label 0: 681 (44.86%)
Label 1: 638 (42.03%)
Label 2: 199 (13.11%)


In [139]:
wine_size = len(ready_wine_df)
print(f"data size = {wine_size}")

data size = 1518


In [140]:
datasets.append('wine')
sizes.append(wine_size)

In [141]:
ready_wine_df.to_csv("data/wine_data.csv", index=False)

### Diabetes

In [142]:
diabetes_df = pd.read_csv(r"data/classification/diabetes.csv")
diabetes_df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


In [143]:
# keep only certain qualities
labels_to_keep = ['Yes', 'No']
diabetes_df = diabetes_df[diabetes_df['Diabetes'].isin(labels_to_keep)]

In [144]:
ready_diabetes_df = convert_data(diabetes_df, label_name='Diabetes', expand=False)

In [145]:
ready_diabetes_df.head()

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Diabetes
0,3,2,0,0,0,0,0,1,0,10,0.393333,0.028761,0.028863,1,0.0,0.25,0.125,0.09375,0
1,4,4,0,1,0,0,0,0,0,10,0.493333,0.194576,0.186347,0,0.0,0.25,0.0,0.03125,1
2,4,4,1,0,0,0,0,0,0,8,0.48,0.236878,0.245676,0,0.133333,0.1,0.023438,0.125,1
3,3,4,1,1,0,0,0,0,1,11,0.593333,0.255493,0.191387,0,0.0,0.25,0.234375,0.0625,1
4,2,4,0,0,0,0,0,0,1,12,0.666667,0.236878,0.14145,1,0.0,0.066667,0.03125,0.0,0


In [146]:
show_label_balance(ready_diabetes_df, label_name='Diabetes')

Label balance:
Label 0: 259141 (86.58%)
Label 1: 40171 (13.42%)


In [147]:
diabetes_size = len(ready_diabetes_df)
print(f"data size = {diabetes_size}")

data size = 299312


In [148]:
datasets.append('diabetes')
sizes.append(diabetes_size)

In [149]:
ready_diabetes_df.to_csv("data/diabetes_data.csv", index=False)

### Mountains VS Beaches

In [150]:
mb_df = pd.read_csv("data/classification/mountains_vs_beaches_preferences.csv")
mb_df.head()

Unnamed: 0,Age,Gender,Income,Education_Level,Travel_Frequency,Preferred_Activities,Vacation_Budget,Location,Proximity_to_Mountains,Proximity_to_Beaches,Favorite_Season,Pets,Environmental_Concerns,Preference
0,56,male,71477,bachelor,9,skiing,2477,urban,175,267,summer,0,1,1
1,69,male,88740,master,1,swimming,4777,suburban,228,190,fall,0,1,0
2,46,female,46562,master,0,skiing,1469,urban,71,280,winter,0,0,1
3,32,non-binary,99044,high school,6,hiking,1482,rural,31,255,summer,1,0,1
4,60,female,106583,high school,5,sunbathing,516,suburban,23,151,winter,1,1,0


In [151]:
ready_mb_df = convert_data(mb_df, label_name='Preference', expand=False)

In [152]:
ready_mb_df.head()

Unnamed: 0,Age,Gender,Income,Education_Level,Travel_Frequency,Preferred_Activities,Vacation_Budget,Location,Proximity_to_Mountains,Proximity_to_Beaches,Favorite_Season,Pets,Environmental_Concerns,Preference
0,0.745098,1,0.51477,0,1.0,1,0.439431,2,0.585284,0.892977,2,0.0,1.0,1
1,1.0,1,0.687404,3,0.111111,3,0.950656,1,0.762542,0.635452,0,0.0,1.0,0
2,0.54902,0,0.265615,3,0.0,1,0.215381,2,0.237458,0.936455,3,0.0,0.0,1
3,0.27451,2,0.790446,2,0.666667,0,0.218271,0,0.103679,0.852843,2,1.0,0.0,1
4,0.823529,0,0.865837,2,0.555556,2,0.003556,1,0.076923,0.505017,3,1.0,1.0,0


In [153]:
show_label_balance(ready_mb_df, label_name='Preference')

Label balance:
Label 0: 39296 (74.93%)
Label 1: 13148 (25.07%)


In [154]:
mb_size = len(ready_mb_df)
print(f"data size = {mb_size}")

data size = 52444


In [155]:
datasets.append('mb')
sizes.append(mb_size)

In [156]:
ready_mb_df.to_csv("data/mb_data.csv", index=False)

### Passenger Satisfaction

In [6]:
passenger_df = pd.read_csv("data/classification/passenger_satisfaction.csv")
passenger_df = passenger_df.loc[:, ~passenger_df.columns.str.contains('^Unnamed')]
passenger_df.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [7]:
passenger_df.drop(columns=['id'], inplace=True)

In [8]:
ready_passenger_df = convert_data(passenger_df, label_name='satisfaction', expand=False)

In [9]:
ready_passenger_df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,1,0,0.076923,1,2,0.086632,0.6,0.8,0.6,0.2,...,1.0,0.8,0.6,0.75,0.8,1.0,1.0,0.015704,0.011364,0
1,1,1,0.230769,0,0,0.041195,0.6,0.4,0.6,0.6,...,0.2,0.2,1.0,0.5,0.2,0.8,0.2,0.000628,0.003788,0
2,0,0,0.24359,0,0,0.224354,0.4,0.4,0.4,0.4,...,1.0,0.8,0.6,0.75,0.8,0.8,1.0,0.0,0.0,1
3,0,0,0.230769,0,0,0.107229,0.4,1.0,1.0,1.0,...,0.4,0.4,1.0,0.5,0.2,0.8,0.4,0.00691,0.005682,0
4,1,0,0.692308,0,0,0.036955,0.6,0.6,0.6,0.6,...,0.6,0.6,0.8,0.75,0.6,0.6,0.6,0.0,0.0,1


In [10]:
passenger_size = len(ready_passenger_df)
len(ready_passenger_df)

103904

In [11]:
ready_passenger_df.dropna(inplace=True)

In [12]:
show_label_balance(ready_passenger_df, label_name='satisfaction')

Label balance:
Label 0: 58697 (56.66%)
Label 1: 44897 (43.34%)


In [13]:
datasets.append('passenger')
sizes.append(passenger_size)

In [14]:
ready_passenger_df.to_csv("data/passenger_data.csv", index=False)

### Employee

In [166]:
employee_df = pd.read_csv("data/classification/Employee.csv")
employee_df = employee_df.loc[:, ~employee_df.columns.str.contains('^Unnamed')]
employee_df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [167]:
ready_employee_df = convert_data(employee_df, label_name='LeaveOrNot', expand=False)

In [168]:
ready_employee_df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,0,0.833333,0,1.0,0.631579,1,0,0.0,0
1,0,0.166667,2,0.0,0.315789,0,0,0.428571,1
2,0,0.333333,1,1.0,0.842105,0,0,0.285714,0
3,1,0.666667,0,1.0,0.263158,1,0,0.714286,1
4,1,0.833333,2,1.0,0.105263,1,1,0.285714,1


In [169]:
ready_employee_df.dropna(inplace=True)

In [170]:
ready_employee_df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,0,0.833333,0,1.0,0.631579,1,0,0.000000,0
1,0,0.166667,2,0.0,0.315789,0,0,0.428571,1
2,0,0.333333,1,1.0,0.842105,0,0,0.285714,0
3,1,0.666667,0,1.0,0.263158,1,0,0.714286,1
4,1,0.833333,2,1.0,0.105263,1,1,0.285714,1
...,...,...,...,...,...,...,...,...,...
4648,0,0.166667,0,1.0,0.210526,0,0,0.571429,0
4649,1,0.166667,2,0.5,0.789474,1,0,0.285714,1
4650,1,1.000000,1,1.0,0.263158,1,0,0.714286,1
4651,0,0.000000,0,1.0,0.421053,1,1,0.285714,0


In [171]:
employee_size = len(ready_employee_df)
employee_size

4653

In [172]:
show_label_balance(ready_employee_df, label_name='LeaveOrNot')

Label balance:
Label 0: 3053 (65.61%)
Label 1: 1600 (34.39%)


In [173]:
datasets.append('employee')
sizes.append(employee_size)

In [174]:
ready_employee_df.to_csv("data/employee_data.csv", index=False)

### shipping

In [175]:
shipping_df = pd.read_csv("data/classification/shipping.csv")
shipping_df = shipping_df.loc[:, ~shipping_df.columns.str.contains('^Unnamed')]
shipping_df.head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,D,Flight,4,2,177,3,low,F,44,1233,1
1,F,Flight,4,5,216,2,low,M,59,3088,1
2,A,Flight,2,2,183,4,low,M,48,3374,1
3,B,Flight,3,3,176,4,medium,M,10,1177,1
4,C,Flight,2,2,184,3,medium,F,46,2484,1


In [176]:
ready_shipping_df = convert_data(shipping_df, label_name='Reached.on.Time_Y.N', expand=False)

In [177]:
ready_shipping_df.dropna(inplace=True)

In [178]:
shipping_size = len(ready_shipping_df)
shipping_size

10999

In [179]:
show_label_balance(ready_shipping_df, label_name='Reached.on.Time_Y.N')

Label balance:
Label 1: 6563 (59.67%)
Label 0: 4436 (40.33%)


In [180]:
datasets.append('shipping')
sizes.append(shipping_size)

In [181]:
ready_shipping_df.to_csv("data/shipping_data.csv", index=False)

### hotel

In [182]:
hotel_df = pd.read_csv("data/classification/hotel_reservations.csv")
hotel_df = hotel_df.loc[:, ~hotel_df.columns.str.contains('^Unnamed')]
hotel_df.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [183]:
ready_hotel_df = convert_data(hotel_df, label_name='booking_status', expand=False)

In [184]:
ready_hotel_df.dropna(inplace=True)

In [185]:
hotel_size = len(ready_hotel_df)
hotel_size

36275

In [186]:
show_label_balance(ready_hotel_df, label_name='booking_status')

Label balance:
Label 1: 24390 (67.24%)
Label 0: 11885 (32.76%)


In [187]:
datasets.append('hotel')
sizes.append(hotel_size)

In [188]:
ready_hotel_df.to_csv("data/hotel_data.csv", index=False)

### overall sizes

In [189]:
for d, s in zip(datasets, sizes):
    print(f"{d} - size = {s}")

apple - size = 4000
loan - size = 45000
wine - size = 1518
diabetes - size = 299312
mb - size = 52444
passenger - size = 103904
employee - size = 4653
shipping - size = 10999
hotel - size = 36275
