## Dealing with Categorical Features

### Objectives

1. Differentiate between numerical and categorical features.
2. Use the pandas `get_dummies` function.
3. Use scikit-learn's `OneHotEncoder`.

In [1]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
# Load the example "flights" dataset that ships with seaborn

flight = sns.load_dataset(name="flights")

In [5]:
# Peek at the first five rows

flight.head(n=5)

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [6]:
# Summarize the columns: dtypes, non-null counts and memory usage
flight.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   year        144 non-null    int64   
 1   month       144 non-null    category
 2   passengers  144 non-null    int64   
dtypes: category(1), int64(2)
memory usage: 2.9 KB


In [7]:
# Split the frame: `passengers` is the target, every other column is a predictor

y = flight["passengers"]
X = flight.drop("passengers", axis=1)

In [8]:
# Initialize the model: an unfitted LinearRegression estimator with default settings

model = LinearRegression()

In [9]:
# Fitting on the raw frame fails because `month` holds strings such as 'Jan'.
# The failure is the point of this cell, so catch it and show the message
# instead of letting the exception halt a Restart & Run All.
try:
    model.fit(X, y)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: 'Jan'

### Use get_dummies to handle the categorical feature (the month column)

In [12]:
# Use get_dummies
# `drop_first=True` leaves out the first month level so it acts as the baseline.
# `dtype=int` forces 0/1 integer columns: pandas >= 2.0 would otherwise return
# booleans, and the mixed int/bool frame is then rejected by statsmodels' OLS
# (it is cast to an object array).

flights_dummies = pd.get_dummies(
    flight,
    columns=['month'],
    drop_first=True,
    dtype=int,
)

# Inspect

flights_dummies.head()

Unnamed: 0,year,passengers,month_Feb,month_Mar,month_Apr,month_May,month_Jun,month_Jul,month_Aug,month_Sep,month_Oct,month_Nov,month_Dec
0,1949,112,0,0,0,0,0,0,0,0,0,0,0
1,1949,118,1,0,0,0,0,0,0,0,0,0,0
2,1949,132,0,1,0,0,0,0,0,0,0,0,0
3,1949,129,0,0,1,0,0,0,0,0,0,0,0
4,1949,121,0,0,0,1,0,0,0,0,0,0,0


In [14]:
# Separate the response from the dummy-encoded design matrix

y_dummies = flights_dummies['passengers']
X_dummies = flights_dummies.drop('passengers', axis=1)

# Prepend the intercept column (reported as `const` in the summary)

X_constant_dummies = sm.add_constant(X_dummies)

# Specify the OLS model, then estimate it

model_ols = sm.OLS(endog=y_dummies, exog=X_constant_dummies)
results = model_ols.fit()

# Show the regression table

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             passengers   R-squared:                       0.956
Model:                            OLS   Adj. R-squared:                  0.952
Method:                 Least Squares   F-statistic:                     236.5
Date:                Wed, 14 May 2025   Prob (F-statistic):           1.71e-82
Time:                        11:33:04   Log-Likelihood:                -668.50
No. Observations:                 144   AIC:                             1363.
Df Residuals:                     131   BIC:                             1402.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.215e+04   1242.339    -50.029      0.0

### Using One Hot Encoding to handle Categorical Features

In [18]:
# Initialize the OneHotEncoder
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. To run unchanged on old and new releases, keep the encoder's
# default sparse output and densify the result with `.toarray()` below.
# (Any later `ohe.transform(...)` call therefore also needs `.toarray()`.)
# NOTE(review): the encoder orders the month labels alphabetically, so
# drop='first' removes 'Apr' here, whereas get_dummies above dropped 'Jan'.
# The model fit is the same; only the baseline month differs.

ohe = OneHotEncoder(drop='first')

# Fit the encoder on the month column and convert the result to a dense array

flights_categorical_encoded = ohe.fit_transform(flight[['month']]).toarray()

# Inspecting the output

flights_categorical_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [20]:
# Convert to dataframe
# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out`
# (available since 1.0) yields the same `month_<level>` column labels.
# Reuse the original index so the column-wise concat that follows lines up
# row by row with `flight`.

flights_categorical_encoded_df = pd.DataFrame(
    flights_categorical_encoded,
    columns=ohe.get_feature_names_out(['month']),
    index=flight.index,
)

# Preview

flights_categorical_encoded_df.head()

Unnamed: 0,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [22]:
# Build the final frame: the non-categorical columns side by side with the
# one-hot encoded month columns

flights_ohe = pd.concat(
    objs=[flight.drop('month', axis=1), flights_categorical_encoded_df],
    axis='columns',
)

# Preview

flights_ohe.head()

Unnamed: 0,year,passengers,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
0,1949,112,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1949,118,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1949,132,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1949,129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1949,121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [23]:
# Separate the response from the one-hot encoded design matrix

y_ohe = flights_ohe['passengers']
X_ohe = flights_ohe.drop('passengers', axis=1)

# Prepend the intercept column (reported as `const` in the summary)

X_constant_ohe = sm.add_constant(X_ohe)

# Specify the OLS model, then estimate it

model_ols = sm.OLS(endog=y_ohe, exog=X_constant_ohe)
results_ohe = model_ols.fit()

# Show the regression table

print(results_ohe.summary())

                            OLS Regression Results                            
Dep. Variable:             passengers   R-squared:                       0.956
Model:                            OLS   Adj. R-squared:                  0.952
Method:                 Least Squares   F-statistic:                     236.5
Date:                Wed, 14 May 2025   Prob (F-statistic):           1.71e-82
Time:                        11:59:04   Log-Likelihood:                -668.50
No. Observations:                 144   AIC:                             1363.
Df Residuals:                     131   BIC:                             1402.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.213e+04   1242.339    -50.009      0.0