## One hot Encoding:

In [13]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# sample data
data = {'Color':['Red','Green','Blue']}
df = pd.DataFrame(data)

#Intialize Onehotencoder
encoder = OneHotEncoder()

# Fit and transform the data
encoded_data = encoder.fit_transform(df[['Color']])

#Convert to a Dataframe for better visualization
# encoder_df = pd.DataFrame(encoded_data.toarray(),columns=encoder.get_feature_names_out())
encoder_df = pd.DataFrame(encoded_data.toarray(),)

print(encoder_df)

     0    1    2
0  0.0  0.0  1.0
1  0.0  1.0  0.0
2  1.0  0.0  0.0


## Bag of words

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

# step 1: Create a Corpus

corpus = [
    "I love NLP",
    "NLP is fun",
    "I love machine learning"

]

# step 2: Intialize countvectorizer
vecotrizer = CountVectorizer()

# step 3: Fit the vectorizer to the corpus
X = vecotrizer.fit_transform(corpus)

# step 4: Convert the result into matrix
print(X.toarray())
# print(vecotrizer.get_feature_names_out())

# Get the words in the Bag of Words model
print("Vocabulary:", vecotrizer.get_feature_names())

[[0 0 0 1 0 1]
 [1 1 0 0 0 1]
 [0 0 1 1 1 0]]


## N-Grams

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Create a corpus
corpus = [
    "I love NLP",
    "NLP is fun",
    "I love machine learning"
]

# step 2: Intialize countvectorizer with n-grams
vecotrizer = CountVectorizer(ngram_range=(1,2)) # unigrams and bigrams
X = vecotrizer.fit_transform(corpus)

# step 3: Print the n-grams
# print(vecotrizer.get_feature_names_out())
print(X.toarray())

vecotrizer.get_feature_names()

[[0 0 0 0 1 0 1 0 0 1 0]
 [1 1 1 0 0 0 0 0 0 1 1]
 [0 0 0 1 1 1 0 1 1 0 0]]


## TF - IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "NLP is fun",
    "NLP is interesting",
    "Machine learning is amazing"
]

# Create TF - IDF Vectorizer
vecotrizer = TfidfVectorizer()
tfidf_matrix = vecotrizer.fit_transform(documents)

# Get the TF - IDF matrix
print(tfidf_matrix.toarray())

# Get feature names
print(vecotrizer.get_feature_names_out())

[[0.         0.72033345 0.         0.42544054 0.         0.
  0.54783215]
 [0.         0.         0.72033345 0.42544054 0.         0.
  0.54783215]
 [0.54645401 0.         0.         0.32274454 0.54645401 0.54645401
  0.        ]]


## Custom Features

In [25]:
!pip3 install emoji

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Defaulting to user installation because normal site-packages is not writeable
Collecting emoji
  Using cached emoji-2.11.1-py2.py3-none-any.whl (433 kB)
Installing collected packages: emoji
Successfully installed emoji-2.11.1


In [26]:
# Sample text dataset
documents = [
    "I love NLP! 😍 It's amazing to learn machine learning.",
    "Get a free offer now!!! Win exciting prizes. 🎉",
    "Machine learning is the future of AI. It's so interesting!"
]

import re
import emoji

# Function to extract text length
def get_text_length(text):
    return len(text)

# Function to extract word count
def get_word_count(text):
    return len(text.split())

# Function to count special characters
def get_special_char_count(text):
    return len(re.findall(r'[@#]', text))

# Function to count exclamation marks
def get_exclamation_mark_count(text):
    return text.count('!')

# Function to count emojis
def get_emoji_count(text):
    return sum(1 for char in text if char in emoji.EMOJI_DATA)

import pandas as pd

# Create a DataFrame to store the features
features = pd.DataFrame(documents, columns=['Text'])

# Apply custom feature functions
features['Text_Length'] = features['Text'].apply(get_text_length)
features['Word_Count'] = features['Text'].apply(get_word_count)
features['Special_Char_Count'] = features['Text'].apply(get_special_char_count)
features['Exclamation_Mark_Count'] = features['Text'].apply(get_exclamation_mark_count)
features['Emoji_Count'] = features['Text'].apply(get_emoji_count)

# Display the feature matrix
print(features)



                                                Text  Text_Length  Word_Count  \
0  I love NLP! 😍 It's amazing to learn machine le...           53          10   
1     Get a free offer now!!! Win exciting prizes. 🎉           46           9   
2  Machine learning is the future of AI. It's so ...           58          10   

   Special_Char_Count  Exclamation_Mark_Count  Emoji_Count  
0                   0                       1            1  
1                   0                       3            1  
2                   0                       1            0  


In [None]:
#Step 4: Combine Custom Features with Standard NLP Features (e.g., TF-IDF)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import FeatureUnion

# Apply TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(features['Text'])

# Normalize custom features
scaler = MinMaxScaler()
custom_features = scaler.fit_transform(features[['Text_Length', 'Word_Count', 'Special_Char_Count', 'Exclamation_Mark_Count', 'Emoji_Count']])

# Combine TF-IDF with custom features
import numpy as np
final_features = np.hstack((tfidf_matrix.toarray(), custom_features))

print(final_features.shape)  # Check final feature matrix shape


(3, 27)


In [None]:
#Step 5: Train a Model Using Custom Features

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Sample labels (spam detection example)
labels = [0, 1, 0]  # 0 = Not Spam, 1 = Spam

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(final_features, labels, test_size=0.3, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print("Model Accuracy:", accuracy)


Model Accuracy: 1.0
