<a href="https://colab.research.google.com/github/Tahira2910/NLP/blob/main/Feature_Engineering_Technique.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Text to Numerical Representation**

In [None]:
!pip install spacy -q
!python -m spacy download en_core_web_sm


In [None]:
!pip install numpy -q
!pip install pandas -q

In [None]:
import string

import numpy as np
import pandas as pd
import spacy  # for tokenization purpose
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Seed NumPy's global random number generator so that any later step drawing
# from it without an explicit random_state is repeatable on a fresh run.
np.random.seed(42)


In [None]:
# Load the training CSV, skipping malformed rows instead of failing on them.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (where passing it raises TypeError).
data = pd.read_csv("/content/train.csv", on_bad_lines="skip", engine="python")

In [None]:
# Work on the first 500 rows only, then show how many rows fall in each
# `toxic` class.
data = data.iloc[:500]
data["toxic"].value_counts()

In [None]:
# Preview the first few rows; rendering all 500 rows bloats the notebook
# output without adding information.
data.head()

In [None]:
# Load spaCy's small ("sm") English pipeline and take its default stop-word
# collection; both are used by the tokenizer function defined below.
nlp = spacy.load("en_core_web_sm")  # spaCy small (sm) English model
stop_words = nlp.Defaults.stop_words
print(stop_words)


In [None]:
# The standard library's string of ASCII punctuation characters; the tokenizer
# uses it to discard punctuation tokens.
punctuations = string.punctuation
print(punctuations)

In [None]:
# Creating tokenizer function
def spacy_tokenizer(sentence):
    """Turn raw text into a list of cleaned, lemmatized, lower-cased tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    token is replaced by its lemma (root form), lower-cased and stripped of
    surrounding whitespace. A token is then dropped when it is

    * empty after stripping (e.g. a pure-whitespace token such as a newline),
    * present in the module-level ``stop_words`` collection, or
    * made up only of characters from the module-level ``punctuations``
      string, so runs such as "..." or "!!!" are removed as well as single
      punctuation marks.

    Args:
        sentence: The text to tokenize.

    Returns:
        list[str]: The surviving tokens, in document order.
    """
    # spaCy Doc: a sequence of tokens carrying linguistic annotations
    doc = nlp(sentence)

    # Per-character lookup. Testing ``token in punctuations`` directly would be
    # a substring search on a str, which misses repeated marks such as "...".
    punctuation_chars = set(punctuations)

    mytokens = []
    for word in doc:
        lemma = word.lemma_.lower().strip()
        if not lemma:
            continue  # whitespace-only token
        if lemma in stop_words:
            continue
        if all(char in punctuation_chars for char in lemma):
            continue  # token consists solely of punctuation
        mytokens.append(lemma)

    # return preprocessed list of tokens
    return mytokens



In [None]:
# Try the tokenizer on a single sentence.
sentence = "I am eating apple?"
spacy_tokenizer(sentence)
# "eating" is reduced to its lemma (root form): "eat"

**CountVectorizer**

In [None]:
# Build a CountVectorizer that delegates tokenization to spacy_tokenizer
# (defined earlier). token_pattern=None because the default regex pattern is
# not used once a custom tokenizer is supplied; leaving it at its default makes
# scikit-learn emit a UserWarning when the vectorizer is fitted.
count_vector = CountVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)


In [None]:

# Fit the vectorizer on a two-sentence toy corpus and show the resulting
# matrix as a dense array (one row per sentence).
count_vector.fit_transform(["I am eating apple","I am playing cricket"]).toarray()


In [None]:

# Terms the fitted vectorizer uses as output features.
count_vector.get_feature_names_out()

In [None]:
# Mapping from each learned term to its column index in the matrix.
count_vector.vocabulary_

In [None]:
X = data['comment_text']  # features we want to analyze
ylabels = data['toxic']   # the labels, or answers, we want to test against
# stratify=ylabels keeps the toxic / non-toxic ratio the same in both splits.
# random_state pins the shuffle so that re-running this cell reproduces the
# same split instead of drawing fresh numbers from the global RNG each time.
X_train, X_test, y_train, y_test = train_test_split(
    X, ylabels, test_size=0.2, stratify=ylabels, random_state=42
)


In [None]:
# Logistic-regression classifier with scikit-learn's default settings
# (LogisticRegression is imported in the notebook's import cell).
classifier = LogisticRegression()

In [None]:
# Any transformation that is applied to the training data
# also needs to be applied to the test data.

In [None]:
# Learn the vocabulary from the training text only and encode it, then encode
# the test text with that same fitted vocabulary (transform, not fit_transform).
X_train_vectors = count_vector.fit_transform(X_train)
X_test_vectors = count_vector.transform(X_test)

In [None]:
# (rows, columns): the first value is the number of training examples, the
# second is the number of vocabulary terms, i.e. the length of each example's
# vector.
X_train_vectors.shape

In [None]:
# Shape of the encoded test set; it was produced by the same fitted vectorizer
# as the training matrix.
X_test_vectors.shape

In [None]:
# Peek at the first few encoded rows only; converting the whole matrix to a
# dense array just to display it wastes memory.
X_train_vectors[:5].toarray()

In [None]:
# Train the classifier on the count-vector features of the training split.
classifier.fit(X_train_vectors, y_train)

In [None]:
# Predict on the held-out split and report accuracy, precision and recall.
# zero_division=0 makes the "class never predicted / never present" case an
# explicit 0.0 instead of a 0.0 accompanied by an UndefinedMetricWarning.
predicted = classifier.predict(X_test_vectors)
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted, zero_division=0))
print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted, zero_division=0))


**TF-IDF Vectorizer**

In [None]:
# Same pipeline as before, but with TF-IDF weights instead of raw counts.
# token_pattern=None because the default regex pattern is not used once a
# custom tokenizer is supplied; leaving it at its default makes scikit-learn
# emit a UserWarning when the vectorizer is fitted.
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)
# Fit on the training text only, then reuse the fitted vectorizer on the test
# text. Note: these names now hold the TF-IDF features, replacing the
# count-vector matrices assigned earlier.
X_train_vectors = tfidf_vector.fit_transform(X_train)
X_test_vectors = tfidf_vector.transform(X_test)

In [None]:
# Fresh, unfitted classifier for the TF-IDF features.
classifier = LogisticRegression()

In [None]:
# Train the classifier on the TF-IDF features of the training split.
classifier.fit(X_train_vectors, y_train)

In [None]:
# Predict on the held-out split (TF-IDF features) and report the same metrics.
# zero_division=0 makes the "class never predicted / never present" case an
# explicit 0.0 instead of a 0.0 accompanied by an UndefinedMetricWarning.
predicted = classifier.predict(X_test_vectors)
print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted, zero_division=0))
print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted, zero_division=0))
