### One-Hot / Dummy Encoding

In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Sample data: a single categorical column with three distinct colours.
df = pd.DataFrame({'color': ['red', 'blue', 'green', 'red', 'blue']})
color_frame = df[['color']]  # keep it two-dimensional (a one-column frame)

# Learn the categories first, then encode in a separate step.
encoder = OneHotEncoder()
encoder.fit(color_frame)

# The result is densified with .toarray() so it can be wrapped in a DataFrame.
encoded_data = encoder.transform(color_frame).toarray()

# One indicator column per category, named after the source column.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(['color']),
)

# Show the indicator matrix.
print(encoded_df)


   color_blue  color_green  color_red
0         0.0          0.0        1.0
1         1.0          0.0        0.0
2         0.0          1.0        0.0
3         0.0          0.0        1.0
4         1.0          0.0        0.0


In [1]:
!pip install scikit-learn



### Label / Ordinal encoding

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Sample data: a single categorical column.
df = pd.DataFrame({'color': ['red', 'blue', 'green', 'red', 'blue']})

# Fit the encoder on the colour values so each distinct value gets an integer code.
encoder = LabelEncoder()
encoder.fit(df['color'])

# Attach the integer codes next to the original values
# (the printed output shows blue -> 0, green -> 1, red -> 2).
df = df.assign(encoded_color=encoder.transform(df['color']))

# Show the original column together with its codes.
print(df)


   color  encoded_color
0    red              2
1   blue              0
2  green              1
3    red              2
4   blue              0


### Target encoding

In [1]:
import pandas as pd
from category_encoders import TargetEncoder

# Sample data: a categorical feature plus the binary target it is encoded against.
samples = {
    'color': ['red', 'blue', 'green', 'red', 'blue'],
    'label': [1, 0, 1, 1, 0],
}
df = pd.DataFrame(samples)
target = df['label']

# Only the 'color' column is selected for encoding.
encoder = TargetEncoder(cols=['color'])

# Fit on the whole frame against the target and replace the frame with the
# encoded version; 'label' is not listed in cols and appears unchanged in the output.
df = encoder.fit_transform(df, target)

# Show the encoded frame.
print(df)


      color  label
0  0.656740      1
1  0.514889      0
2  0.652043      1
3  0.656740      1
4  0.514889      0


### Frequency / Count Encoding

In [4]:
import pandas as pd

# Sample data: a single categorical column.
colors = ['red', 'blue', 'green', 'red', 'blue']
df = pd.DataFrame({'color': colors})

# Count how often each category occurs and keep the result as a plain
# {category: count} dictionary.
color_counts = df['color'].value_counts()
count_dict = color_counts.to_dict()

# Replace every category by its number of occurrences in a new column.
df = df.assign(count_encoded=df['color'].map(count_dict))

# Show the original column together with its counts.
print(df)


   color  count_encoded
0    red              2
1   blue              2
2  green              1
3    red              2
4   blue              2


### Binary Encoding

In [5]:
import pandas as pd
from category_encoders import BinaryEncoder

# Sample data: a single categorical column.
df = pd.DataFrame({'color': ['red', 'blue', 'green', 'red', 'blue']})
color_column = df['color']

# Encoder with default settings.
encoder = BinaryEncoder()

# Fit and encode in one step; the printed output shows the result as the
# bit columns color_0 and color_1.
encoded_data = encoder.fit_transform(color_column)

# Place the bit columns side by side with the original column.
encoded_df = pd.concat([df, encoded_data], axis='columns')

# Show the original values next to their binary codes.
print(encoded_df)


   color  color_0  color_1
0    red        0        1
1   blue        1        0
2  green        1        1
3    red        0        1
4   blue        1        0


### Feature Hashing

In [5]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

# Sample data: a single categorical column.
df = pd.DataFrame({'color': ['red', 'blue', 'green', 'red', 'blue']})

# Hash string tokens into a fixed number of output columns.
hasher = FeatureHasher(n_features=3, input_type='string')

# Each row is handed over as its own one-element list of strings, so the
# colour name is hashed as a whole token.
samples = [[color] for color in df['color']]
hashed_data = hasher.transform(samples)

# Densify the result and label the three hash buckets.
hashed_df = pd.DataFrame(
    hashed_data.toarray(),
    columns=[f'hash_{bucket}' for bucket in range(1, 4)],
)

# Show the hashed representation.
print(hashed_df)


   hash_1  hash_2  hash_3
0     0.0    -1.0     0.0
1     0.0     0.0    -1.0
2     1.0     0.0     0.0
3     0.0    -1.0     0.0
4     0.0     0.0    -1.0


### Pearson's Correlation Coefficient

In [7]:
import pandas as pd
from scipy.stats import pearsonr

# Sample data: y is exactly twice x, i.e. a perfect linear relationship.
samples = {'x': [1, 2, 3, 4, 5], 'y': [2, 4, 6, 8, 10]}
df = pd.DataFrame(samples)

# pearsonr yields the coefficient together with its p-value.
result = pearsonr(df['x'], df['y'])
pearson_coeff, p_value = result

# Report both numbers, one per line.
for caption, number in (("Pearson's coefficient:", pearson_coeff),
                        ("p-value:", p_value)):
    print(caption, number)


Pearson's coefficient: 1.0
p-value: 0.0


### Correlation Matrix.

In [8]:
import pandas as pd

# Sample data: three columns that are all exact linear functions of one another.
columns = {
    'x': [1, 2, 3, 4, 5],
    'y': [2, 4, 6, 8, 10],
    'z': [1, 3, 5, 7, 9],
}
df = pd.DataFrame(columns)

# Pairwise Pearson correlation between every pair of columns.
correlation_matrix = df.corr(method='pearson')

# Show the matrix.
print(correlation_matrix)


     x    y    z
x  1.0  1.0  1.0
y  1.0  1.0  1.0
z  1.0  1.0  1.0


### Positive, Neutral, Negative Correlation.

In [9]:
import pandas as pd

def classify_correlation(coefficient, tolerance=1e-12):
    """Name the direction of a correlation coefficient.

    Parameters
    ----------
    coefficient : float
        Correlation coefficient to classify.
    tolerance : float, optional
        Magnitudes up to this value are treated as zero, so floating-point
        rounding noise is not reported as a positive or negative correlation.

    Returns
    -------
    str
        'positive', 'neutral' or 'negative'; 'undefined' when the coefficient
        is NaN (for example when one of the columns is constant).
    """
    # NaN compares False against everything, so without this guard it would
    # fall through to the final branch and be reported as 'negative'.
    if pd.isna(coefficient):
        return 'undefined'
    if abs(coefficient) <= tolerance:
        return 'neutral'
    return 'positive' if coefficient > 0 else 'negative'


# create sample data
df = pd.DataFrame({'x': [1, 2, 3, 4, 5], 'y': [2, 4, 6, 8, 10]})

# calculate Pearson's coefficient
pearson_coeff = df['x'].corr(df['y'])

# identify the type of correlation
correlation_type = classify_correlation(pearson_coeff)

# print the type of correlation
print('The correlation between x and y is', correlation_type)


The correlation between x and y is positive
