# 1. Loading Libraries

In [4]:
import pandas as pd
import scipy
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

# 2. Loading and Understanding the Data 

In [6]:
df = pd.read_csv('In-vehicle Coupon Recommendation.csv')

In [9]:
df.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  car                   108 non-null    object
 15  Bar                   12577 non-null

In [13]:
df.dtypes

destination             object
passanger               object
weather                 object
temperature              int64
time                    object
coupon                  object
expiration              object
gender                  object
age                     object
maritalStatus           object
has_children             int64
education               object
occupation              object
income                  object
car                     object
Bar                     object
CoffeeHouse             object
CarryAway               object
RestaurantLessThan20    object
Restaurant20To50        object
toCoupon_GEQ5min         int64
toCoupon_GEQ15min        int64
toCoupon_GEQ25min        int64
direction_same           int64
direction_opp            int64
Y                        int64
dtype: object

# 3. Handling missing data

In [16]:
df.isnull().sum()

destination                 0
passanger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

In [18]:
# Car column has too many missing values. It is better to drop it (eliminate)
df = df.drop('car', axis=1)

### Impute Other Missing Values with Mode

In [21]:
# List of the categorical columns with missing values
columns_to_fill = ['Bar', 'CoffeeHouse', 'Restaurant20To50', 'CarryAway', 'RestaurantLessThan20']

# Fill missing values  of these columns with their mode
for column in columns_to_fill:
    df[column].fillna(df[column].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


###  Confirm no missing values remain


In [24]:
df.isnull().sum()

destination             0
passanger               0
weather                 0
temperature             0
time                    0
coupon                  0
expiration              0
gender                  0
age                     0
maritalStatus           0
has_children            0
education               0
occupation              0
income                  0
Bar                     0
CoffeeHouse             0
CarryAway               0
RestaurantLessThan20    0
Restaurant20To50        0
toCoupon_GEQ5min        0
toCoupon_GEQ15min       0
toCoupon_GEQ25min       0
direction_same          0
direction_opp           0
Y                       0
dtype: int64

# 4. Outlier Treatment

### Review Summary Stats for Numeric Columns

In [28]:
# Identify numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Show summary statistics
df[numeric_cols].describe()


Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
count,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0,12684.0
mean,63.301798,0.414144,1.0,0.561495,0.119126,0.214759,0.785241,0.568433
std,19.154486,0.492593,0.0,0.496224,0.32395,0.410671,0.410671,0.495314
min,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,55.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,80.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
75%,80.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# 5. Data Type Corrections

### Check Unique Values in Age 

In [32]:
# Look at unique values in 'age' column
print(df['age'].unique())

['21' '46' '26' '31' '41' '50plus' '36' 'below21']


#### Output shows values like '21', '26', '50plus', and 'below21',so age should stay as a category.

In [35]:
# Convert all object-type columns to 'category' type
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype('category')


In [37]:
# Confirm data types
df.dtypes

destination             category
passanger               category
weather                 category
temperature                int64
time                    category
coupon                  category
expiration              category
gender                  category
age                     category
maritalStatus           category
has_children               int64
education               category
occupation              category
income                  category
Bar                     category
CoffeeHouse             category
CarryAway               category
RestaurantLessThan20    category
Restaurant20To50        category
toCoupon_GEQ5min           int64
toCoupon_GEQ15min          int64
toCoupon_GEQ25min          int64
direction_same             int64
direction_opp              int64
Y                          int64
dtype: object

# 6. Encoding Categorical Variables

## Apply One-Hot Encoding to Categorical Columns

In [41]:
# Performing one-hot encoding on all categorical variables
df_encoded = pd.get_dummies(df, drop_first=True)

# 7.Feature Engineering (Optional)

#### Markdown Cell


#### In this step, we create or modify features to improve the model's ability to learn patterns. 
For this dataset, we will:
- Group income levels into Low, Medium, and High.
- Create a binary flag to show if someone is traveling alone.
- Create a new feature to group the time into morning, afternoon, and evening.


In [46]:
# Simplify income into 3 groups
def simplify_income(income):
    if income in ['Less than $12500', '$12500 - $24999', '$25000 - $37499']:
        return 'Low'
    elif income in ['$37500 - $49999', '$50000 - $62499']:
        return 'Medium'
    else:
        return 'High'

df['income_group'] = df['income'].apply(simplify_income)


#### Code cell: Create is_alone Feature

In [49]:
# Create binary column: 1 if alone, else 0
df['is_alone'] = df['passanger'].apply(lambda x: 1 if x == 'Alone' else 0)


#### Code Cell – Create part_of_day Feature

In [52]:
# Convert time into part of the day
def time_of_day(t):
    if t in ['7AM', '10AM']:
        return 'Morning'
    elif t in ['2PM', '6PM']:
        return 'Afternoon'
    else:
        return 'Evening'

df['part_of_day'] = df['time'].apply(time_of_day)


### Check New Features

In [55]:
# Preview the new columns
df[['income', 'income_group', 'passanger', 'is_alone', 'time', 'part_of_day']].head()


Unnamed: 0,income,income_group,passanger,is_alone,time,part_of_day
0,$37500 - $49999,Medium,Alone,1,2PM,Afternoon
1,$37500 - $49999,Medium,Friend(s),0,10AM,Morning
2,$37500 - $49999,Medium,Friend(s),0,10AM,Morning
3,$37500 - $49999,Medium,Friend(s),0,2PM,Afternoon
4,$37500 - $49999,Medium,Friend(s),0,2PM,Afternoon


# 8.Save Cleaned Dataset

In [60]:
# Save the final cleaned dataset to a CSV file
df_encoded = pd.get_dummies(df, drop_first=True)
df_encoded.to_csv('In-vehicle Coupon Recommendation-cleaned_dataset.csv', index=False)
