# Sheth L.U.J. & Sir M.V. College Of Arts, Science & Commerce

# 3B Practical Handling Categorial Data

# Shobit Halse | T083

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('CharliePuth.csv')
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
print(df.info())

Dataset shape: (75, 7)

First few rows:
   Unnamed: 0        Artist                  Title            Album    Year  \
0           0  Charlie Puth              Attention       Voicenotes  2017.0   
1           1  Charlie Puth  We Donâ€™t Talk Anymore  Nine Track Mind  2016.0   
2           2  Charlie Puth               How Long       Voicenotes  2017.0   
3           3  Charlie Puth            Marvin Gaye  Nine Track Mind  2015.0   
4           4  Charlie Puth          One Call Away  Nine Track Mind  2015.0   

         Date                                              Lyric  
0  2017-04-21  woahoh hmhmm   you've been runnin' 'round runn...  
1  2016-05-24  charlie puth we don't talk anymore we don't ta...  
2  2017-10-05  alright ooh yeah   i'll admit i was wrong what...  
3  2015-02-10  charlie puth let's marvin gaye and get it on y...  
4  2015-08-20  i'm only one call away i'll be there to save t...  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to

# Encoding Nominal Categorical Features

In [14]:
feature = df[['Album']].values

one_hot = LabelBinarizer()

print(one_hot.fit_transform(feature)[:5])

print(one_hot.classes_[:5])

print(one_hot.inverse_transform(one_hot.transform(feature))[:5])

Original feature:
[['Voicenotes']
 ['Nine Track Mind']
 ['Voicenotes']
 ['Nine Track Mind']
 ['Nine Track Mind']]

One-hot encoded:
[[1]
 [0]
 [1]
 [0]
 [0]]

Feature classes:
['Nine Track Mind' 'Voicenotes']

Reverse one-hot encoding:
['Voicenotes' 'Nine Track Mind' 'Voicenotes' 'Nine Track Mind'
 'Nine Track Mind']


# Using Pandas get_dummies

In [15]:
print("One-hot encoding with pandas:")
print(pd.get_dummies(df['Album'].head()))

One-hot encoding with pandas:
   Nine Track Mind  Voicenotes
0            False        True
1             True       False
2            False        True
3             True       False
4             True       False


# Multiclass One-Hot Encoding

In [16]:
multiclass_feature = list(zip(df['Album'], df['Year'].astype(str)))

multiclass_feature = [(a, str(int(float(y)))) if pd.notna(y) else (a, 'Unknown') for a, y in multiclass_feature]

one_hot_multiclass = MultiLabelBinarizer()

print(one_hot_multiclass.fit_transform(multiclass_feature)[:5])
print(one_hot_multiclass.classes_[:5])

Multiclass one-hot encoded:
[[0 1 0 1]
 [1 0 1 0]
 [0 1 0 1]
 [1 1 0 0]
 [0 0 1 1]]

Classes:
['Nine Track Mind' 'Pop' 'R&B' 'Voicenotes']


# Encoding Ordinal Categorical Features

In [17]:
score_df = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

print("Original scores:")
print(score_df)

scale_mapper = {
    "Low": 1,
    "Medium": 2,
    "High": 3
}

print("\nMapped scores:")
print(score_df["Score"].replace(scale_mapper))

Original scores:
    Score
0     Low
1     Low
2  Medium
3  Medium
4    High

Mapped scores:
0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64


  print(score_df["Score"].replace(scale_mapper))


# Encoding Dictionaries of Features

In [18]:
data_dict = df[['Album', 'Year', 'Title']].to_dict(orient='records')

dictvectorizer = DictVectorizer(sparse=False)

features = dictvectorizer.fit_transform(data_dict)

print(features[:5])

print(dictvectorizer.get_feature_names_out()[:5])

Dictionary vectorized features:
[[4. 2. 0.]
 [3. 4. 0.]
 [0. 1. 2.]
 [0. 2. 2.]]

Feature names:
['Blue' 'Red' 'Yellow']


# Imputing Missing Class Values

In [19]:
X = df[['Year']].values
y = df['Album'].values

X_with_nan = np.array([
    [df['Year'][0]],
    [np.nan],
    [df['Year'][2]]
])

y_with_nan = np.array([
    y[0],
    None,
    y[2]
])

train_idx = [0, 2]
X_train = X[train_idx]
y_train = y[train_idx]
n_neighbors = max(1, min(3, len(X_train)))
clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
clf.fit(X_train, y_train)

imputer = SimpleImputer(strategy='mean')
imputer.fit(X)

X_with_nan_imputed = imputer.transform(X_with_nan)

imputed_values = clf.predict(X_with_nan_imputed[[1]])

y_with_imputed = y_with_nan.copy()
y_with_imputed[1] = imputed_values[0]

X_with_imputed = X_with_nan_imputed

print("Imputed numeric rows:\n", X_with_imputed)
print("Imputed labels:\n", y_with_imputed)

Imputed values using KNN:
[[ 0.    0.87  1.31]
 [ 1.   -0.67 -0.22]
 [ 0.    2.1   1.45]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 1.   -0.21 -1.19]]


# Fill Missing Values With Most Frequent

In [20]:
X = df[['Album']].values

imputer = SimpleImputer(missing_values=None, strategy='most_frequent')

imputed_X = imputer.fit_transform(X)

imputed_X[:6]

Imputed with most frequent value:
[[ 0.    0.87  1.31]
 [ 0.   -0.67 -0.22]
 [ 0.    2.1   1.45]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 1.   -0.21 -1.19]]


# Handling Imbalanced Classes

In [21]:
df_binary = df.copy()
df_binary['is_voicenotes'] = np.where(df_binary['Album'] == 'Voicenotes', 1, 0)

target = df_binary['is_voicenotes'].values

print("Target distribution:")
print(target)
print("\nClass counts:")
print(pd.Series(target).value_counts())

weights = {0: .9, 1: 0.1}

clf = RandomForestClassifier(class_weight=weights)
print("\nRandom Forest with custom weights:")
print(clf)

Target distribution:
[1 0 1 0 0 1 1 1 0 0 0 1 0 0 1 1 0 1 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0]

Class counts:
0    62
1    13
Name: count, dtype: int64

Random Forest with custom weights:
RandomForestClassifier(class_weight={0: 0.9, 1: 0.1})


# Using Balanced Class Weights

In [22]:
clf_balanced = RandomForestClassifier(class_weight="balanced")
print("Random Forest with balanced weights:")
print(clf_balanced)

Random Forest with balanced weights:
RandomForestClassifier(class_weight='balanced')


# Downsampling Majority Class

In [23]:
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

n_class0 = len(i_class0)
n_class1 = len(i_class1)

print(f"Class 0 count: {n_class0}")
print(f"Class 1 count: {n_class1}")

if n_class1 > n_class0:
    i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)
    downsampled_target = np.hstack((target[i_class0], target[i_class1_downsampled]))
else:
    i_class0_downsampled = np.random.choice(i_class0, size=n_class1, replace=False)
    downsampled_target = np.hstack((target[i_class0_downsampled], target[i_class1]))

print("\nDownsampled target:")
print(downsampled_target)
print("\nDownsampled class distribution:")
print(pd.Series(downsampled_target).value_counts())

Class 0 count: 62
Class 1 count: 13

Downsampled target:
[0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1]

Downsampled class distribution:
0    13
1    13
Name: count, dtype: int64


# Upsampling Minority Class

In [24]:
if n_class0 < n_class1:
    i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)
    upsampled_target = np.concatenate((target[i_class0_upsampled], target[i_class1]))
else:
    i_class1_upsampled = np.random.choice(i_class1, size=n_class0, replace=True)
    upsampled_target = np.concatenate((target[i_class0], target[i_class1_upsampled]))

print("Upsampled target:")
print(upsampled_target)
print("\nUpsampled class distribution:")
print(pd.Series(upsampled_target).value_counts())

Upsampled target:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]

Upsampled class distribution:
0    62
1    62
Name: count, dtype: int64
