<a href="https://colab.research.google.com/github/ShivaShirsath/nlp/blob/master/2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

In [2]:
# Fetch the NLTK resources this notebook relies on: the stopword lists and
# the 'punkt' tokenizer models used by nltk.word_tokenize.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Load the dataset from the CSV file in the working directory
df = pd.read_csv('data.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Build one free-text document per row by joining the descriptive columns
# with single spaces.
# NOTE(review): astype(str) stringifies missing values, so a NaN cell would
# contribute the literal token 'nan' — confirm that is intended.
text_columns = ['Make', 'Model', 'Engine Fuel Type', 'Transmission Type', 'Driven_Wheels', 'Market Category', 'Vehicle Size', 'Vehicle Style']
text = df[text_columns].apply(lambda row: ' '.join(row.astype(str)), axis=1)

# Clean the text by removing special characters, digits, and stopwords.
stop_words = stopwords.words('english')
# Frozen copy for O(1) membership tests; `stop_words` itself stays a list.
stop_word_set = frozenset(stop_words)
# Raw string avoids the invalid "\s" escape warning. The pattern keeps only
# ASCII letters and whitespace, so digits are stripped by this one pass and a
# separate r'\d+' substitution is unnecessary.
non_alpha = re.compile(r'[^a-zA-Z\s]')
text = text.apply(lambda doc: non_alpha.sub('', doc))
# Stopword comparison is case-insensitive; surviving tokens keep their case.
text = text.apply(lambda doc: ' '.join(word for word in doc.split() if word.lower() not in stop_word_set))
print("Text : ", text, sep="\n")

Text : 
0        BMW Series premium unleaded required MANUAL re...
1        BMW Series premium unleaded required MANUAL re...
2        BMW Series premium unleaded required MANUAL re...
3        BMW Series premium unleaded required MANUAL re...
4        BMW Series premium unleaded required MANUAL re...
                               ...                        
11909    Acura ZDX premium unleaded required AUTOMATIC ...
11910    Acura ZDX premium unleaded required AUTOMATIC ...
11911    Acura ZDX premium unleaded required AUTOMATIC ...
11912    Acura ZDX premium unleaded recommended AUTOMAT...
11913    Lincoln Zephyr regular unleaded AUTOMATIC fron...
Length: 11914, dtype: object


In [4]:
# Create bag-of-words features: raw term counts (sparse) and the same counts
# normalized by each document's total token count.
count_vec = CountVectorizer()
X_count = count_vec.fit_transform(text)
# Per-document totals, clamped to at least 1 so that a document with no
# vocabulary tokens produces an all-zero row instead of 0/0 = NaN. Counts are
# non-negative integers, so rows with any tokens are unaffected.
row_totals = np.maximum(np.sum(X_count, axis=1), 1)
# Dividing the sparse counts by the dense column of totals yields a dense
# result, as before.
X_count_norm = X_count / row_totals
print("\nCount Occurrence :", X_count, sep="\n")
print("\nNormalized Count Occurrence :", X_count_norm, sep="\n")


Count Occurrence :
  (0, 51)	1
  (0, 559)	1
  (0, 484)	1
  (0, 673)	1
  (0, 521)	1
  (0, 402)	1
  (0, 512)	1
  (0, 704)	1
  (0, 185)	1
  (0, 245)	1
  (0, 667)	1
  (0, 115)	1
  (0, 127)	1
  (1, 51)	1
  (1, 559)	1
  (1, 484)	1
  (1, 673)	1
  (1, 521)	1
  (1, 402)	1
  (1, 512)	1
  (1, 704)	1
  (1, 185)	1
  (1, 115)	1
  (1, 395)	1
  (1, 120)	1
  :	:
  (11911, 724)	1
  (11911, 141)	1
  (11912, 484)	1
  (11912, 673)	1
  (11912, 704)	1
  (11912, 185)	1
  (11912, 420)	1
  (11912, 29)	1
  (11912, 514)	1
  (11912, 314)	1
  (11912, 184)	1
  (11912, 6)	1
  (11912, 724)	1
  (11912, 141)	1
  (11913, 673)	1
  (11913, 704)	1
  (11913, 185)	1
  (11913, 391)	1
  (11913, 518)	1
  (11913, 271)	1
  (11913, 420)	1
  (11913, 552)	1
  (11913, 29)	1
  (11913, 381)	1
  (11913, 725)	1

Normalized Count Occurrence :
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0. 

In [5]:
# TF-IDF weighted bag-of-words: fit the vectorizer on the cleaned documents
# and transform them in a single call.
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(text)

# Header line first, then the sparse matrix's own string representation.
print("\nTF-IDF :")
print(X_tfidf)


TF-IDF :
  (0, 127)	0.29155512347301105
  (0, 115)	0.17180852763247648
  (0, 667)	0.4173474512280893
  (0, 245)	0.38395763600012056
  (0, 185)	0.08964700037823825
  (0, 704)	0.08964700037823825
  (0, 512)	0.20280769809182894
  (0, 402)	0.2152200361399859
  (0, 521)	0.24918848483841655
  (0, 673)	0.09857959161354882
  (0, 484)	0.1966191494248329
  (0, 559)	0.4265343565320581
  (0, 51)	0.40981434665401595
  (1, 120)	0.36307337671360584
  (1, 395)	0.3828183207136555
  (1, 115)	0.18946546514426285
  (1, 185)	0.0988601139856354
  (1, 704)	0.0988601139856354
  (1, 512)	0.22365045194964012
  (1, 402)	0.2373384186310876
  (1, 521)	0.27479783942680275
  (1, 673)	0.10871071672732242
  (1, 484)	0.21682589982805686
  (1, 559)	0.4703697271256941
  (1, 51)	0.4519313847895299
  :	:
  (11911, 673)	0.0780478668206422
  (11911, 484)	0.1556681757097905
  (11912, 141)	0.5871480489107743
  (11912, 724)	0.5871480489107743
  (11912, 6)	0.3429930416948985
  (11912, 184)	0.15082338608041232
  (11912, 314)	0.2

In [6]:
# Tokenize every cleaned document, then learn 100-dimensional word embeddings
# over the token lists. min_count=1 keeps words that occur only once.
# NOTE(review): `size` is the keyword this notebook was run with; gensim 4
# renamed it to `vector_size` — confirm against the installed gensim version.
sentences = list(map(nltk.word_tokenize, text))
w2v_model = Word2Vec(sentences, size=100, min_count=1)

print("\nWord2Vec :")
print(w2v_model)


Word2Vec :
Word2Vec(vocab=752, size=100, alpha=0.025)


In [7]:
# Get the embedding vector for a specific word.
# Look the word up through the model's KeyedVectors (`.wv`): indexing the
# Word2Vec model object directly is deprecated and was removed in gensim 4.
print("Embedding Vector for Audi : ", w2v_model.wv['Audi'], sep="\n")

Embedding Vector for Audi : 
[ 0.5044263   0.34770858 -0.0638372  -0.3189772  -0.08491277 -0.45134172
 -0.23854408 -0.33530536  0.3076679  -0.13442138  0.00591657  0.3686675
 -0.16924813 -0.13772874 -0.35768571 -0.15862294 -0.3893453   0.37283438
  0.53814644  0.15523675  0.02774514  0.06630558 -0.15259778 -0.3319281
  0.9990962   0.40906322 -0.39458808 -0.32587063  0.86114115 -0.3133111
 -0.19215992 -0.5381166  -0.06661527 -0.23726499  0.02297745  0.2006813
 -0.35913298 -0.36772344 -0.14318204 -0.27630192  0.49682653  0.75089025
  0.15991527  0.48038432 -0.5071131   0.54681826 -0.14039695  0.04161979
  0.32185617  0.3189142   0.0144504  -0.12769185 -0.10586888  0.19949545
  0.16830006  0.2852649   0.15174691  0.16381378  0.2603552  -0.64330894
 -0.22704309 -0.59486026  0.13079247  0.40255865 -0.24674995 -0.40300277
 -0.05535044 -0.7643973   0.4182064  -0.5971203  -0.02811981  0.51519084
  0.5162952   0.46689016 -0.14160074  0.06851733 -0.02190312  0.1501793
  0.17352587 -0.436448   -0

  print("Embedding Vector for Audi : ", w2v_model['Audi'], sep="\n")
