In [18]:
import numpy as np
import pandas as pd
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [11]:
# Load spam.csv (latin-1 encoded) and keep only the label column (v1) and the
# message column (v2), renamed to 'class' and 'text'.
df = (
    pd.read_csv('spam.csv', encoding='latin1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'class', 'v2': 'text'})
)

In [44]:
# Preview the raw message column (pandas elides the middle rows).
df.loc[:, 'text']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [13]:
# Text representations: TF-IDF, one-hot encoding (OHE), Word2Vec (w2v)

In [19]:
# Tokenize every message with gensim's simple_preprocess: one token list per row
# of df['text'] (a message that yields no tokens gets an empty list).
tokenized_docs = [simple_preprocess(doc) for doc in df['text']]

# Train Word2Vec on this corpus. Training is stochastic, so pin the seed and use a
# single worker thread: with several workers the OS thread scheduling reorders the
# updates and the vectors differ from run to run. (gensim notes that reproducibility
# across interpreter launches additionally needs PYTHONHASHSEED to be set.)
w2v = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, so each lookup below finds a vector
    workers=1,
    seed=42,
)

# Per message: a list holding one 100-dimensional vector per token, in token order.
df['text_w2v'] = [[w2v.wv[word] for word in doc] for doc in tokenized_docs]

In [36]:
# Fit a TF-IDF model on the raw messages (lowercased, English stop words removed)
# and keep the resulting document-term matrix, one row per message.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
tfidf_vectors = vectorizer.fit_transform(df['text'])


In [37]:
# Vocabulary terms learned by the fitted vectorizer, in TF-IDF column order.
feature_names = vectorizer.get_feature_names_out()


In [45]:
# Wrap the TF-IDF matrix in a DataFrame with one column per vocabulary term.
# Build it through pandas' sparse accessor instead of tfidf_vectors.toarray():
# the matrix is almost entirely zeros, and densifying it allocates
# n_messages x n_terms float64 values (thousands of rows by thousands of columns here).
# The columns get a sparse dtype; the values are the same as the dense version.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_vectors, columns=feature_names)

In [46]:
# Show the TF-IDF frame; pandas truncates both rows and columns in the rendered table.
tfidf_df

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,ó_,û_,û_thanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Spot check: vectorize only the message at index label 0 with the already-fitted
# vectorizer; the result is a single sparse row.
vectorizer.transform([df.loc[0, 'text']])

<1x8404 sparse matrix of type '<class 'numpy.float64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [52]:
# Attach each message's TF-IDF vector to df as a 1 x n_terms sparse row.
# tfidf_vectors came from fit_transform on this same df['text'], so row i already is
# the vector for message i; slicing it out avoids calling vectorizer.transform once
# per message.
assert tfidf_vectors.shape[0] == len(df), "tfidf_vectors is out of sync with df"

# Object array so pandas stores the sparse rows as-is instead of trying to unpack them.
tfidf_rows = np.empty(len(df), dtype=object)
for row_pos in range(len(df)):
    tfidf_rows[row_pos] = tfidf_vectors[row_pos]

df['text_tfidf'] = tfidf_rows

In [54]:
# Inspect df now that it carries the 'text_w2v' and 'text_tfidf' columns.
df

Unnamed: 0,class,text,text_w2v,text_tfidf
0,ham,"Go until jurong point, crazy.. Available only ...","[[-0.0777484, 0.61794955, 0.16475247, 0.004982...","(0, 8227)\t0.23740046706740076\n (0, 8026)\..."
1,ham,Ok lar... Joking wif u oni...,"[[-0.049168333, 0.45805022, 0.13132215, 0.0022...","(0, 8134)\t0.4316010362639011\n (0, 5369)\t..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[[-0.115629375, 0.5827952, 0.15182507, -0.0166...","(0, 8185)\t0.1927520210056736\n (0, 8146)\t..."
3,ham,U dun say so early hor... U c already then say...,"[[-0.03326846, 0.33185422, 0.08792593, 0.01984...","(0, 6450)\t0.6518642650180976\n (0, 3815)\t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[[-0.0048587094, 0.05938003, 0.021728095, -0.0...","(0, 7837)\t0.4493939296226912\n (0, 7443)\t..."
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[[-0.08221098, 0.6105551, 0.18085748, 0.008498...","(0, 8202)\t0.19074118816829963\n (0, 7643)\..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[[-0.08088673, 0.58774805, 0.1817025, 0.015562...","(0, 8390)\t0.37764633472218584\n (0, 3789)\..."
5569,ham,"Pity, * was in mood for that. So...any other s...","[[-0.003369402, -0.0015769876, 0.006742084, 0....","(0, 7168)\t0.6095307789831879\n (0, 5673)\t..."
5570,ham,The guy did some bitching but I acted like i'd...,"[[-0.08141691, 0.76171845, 0.21405223, 0.02883...","(0, 8071)\t0.23479081568562485\n (0, 4485)\..."


In [63]:
# Display df again; the rendered table is the same as in the previous display of df.
df

Unnamed: 0,class,text,text_w2v,text_tfidf
0,ham,"Go until jurong point, crazy.. Available only ...","[[-0.0777484, 0.61794955, 0.16475247, 0.004982...","(0, 8227)\t0.23740046706740076\n (0, 8026)\..."
1,ham,Ok lar... Joking wif u oni...,"[[-0.049168333, 0.45805022, 0.13132215, 0.0022...","(0, 8134)\t0.4316010362639011\n (0, 5369)\t..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[[-0.115629375, 0.5827952, 0.15182507, -0.0166...","(0, 8185)\t0.1927520210056736\n (0, 8146)\t..."
3,ham,U dun say so early hor... U c already then say...,"[[-0.03326846, 0.33185422, 0.08792593, 0.01984...","(0, 6450)\t0.6518642650180976\n (0, 3815)\t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[[-0.0048587094, 0.05938003, 0.021728095, -0.0...","(0, 7837)\t0.4493939296226912\n (0, 7443)\t..."
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[[-0.08221098, 0.6105551, 0.18085748, 0.008498...","(0, 8202)\t0.19074118816829963\n (0, 7643)\..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[[-0.08088673, 0.58774805, 0.1817025, 0.015562...","(0, 8390)\t0.37764633472218584\n (0, 3789)\..."
5569,ham,"Pity, * was in mood for that. So...any other s...","[[-0.003369402, -0.0015769876, 0.006742084, 0....","(0, 7168)\t0.6095307789831879\n (0, 5673)\t..."
5570,ham,The guy did some bitching but I acted like i'd...,"[[-0.08141691, 0.76171845, 0.21405223, 0.02883...","(0, 8071)\t0.23479081568562485\n (0, 4485)\..."


In [64]:
# Split df into one table per representation for export. The two feature tables
# keep the 'class' label next to the features; df_text is the message column alone.
df_text = df.loc[:, 'text']
df_tfidf = df.loc[:, ['class', 'text_tfidf']]
df_w2v = df.loc[:, ['class', 'text_w2v']]

In [95]:
# Persist the Word2Vec features. Each 'text_w2v' cell is a Python list of NumPy
# arrays, which CSV can only hold as text, so read_csv returns strings rather than
# arrays. Keep the CSV for anything that already reads it, and also write a pickle
# that restores the arrays unchanged via pd.read_pickle.
df_w2v.to_csv('df_w2v.csv')
df_w2v.to_pickle('df_w2v.pkl')

In [96]:
# Persist the TF-IDF features. Each 'text_tfidf' cell is a SciPy sparse row, which
# CSV can only hold as its printed "(row, col)\tvalue" text, so read_csv returns
# strings rather than matrices. Keep the CSV for anything that already reads it, and
# also write a pickle that restores the sparse rows unchanged via pd.read_pickle.
df_tfidf.to_csv('df_tfidf.csv')
df_tfidf.to_pickle('df_tfidf.pkl')

In [99]:
# Persist the raw message column; the index is written alongside the text.
df_text.to_csv('df_text.csv')