https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#TF-IDF_vs_Word2Vec

# 1. Install the required packages
You will need scikit-learn, Gensim, and NLTK packages. You can install them using pip as follows:

In [89]:
!pip install scikit-learn gensim nltk



#2. Load the data
Load the text data into Python, and split it into training and testing sets.

In [90]:
import pandas as pd
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
df = pd.read_csv('/content/test.csv', encoding='latin-1')

In [92]:
df.head(2)

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0


In [93]:
# Mapping sentiment to numerical labels and removing neutral
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1, 'neutral': None})

# Drop rows with neutral sentiment
df = df.dropna()

In [94]:
df=df[['text','sentiment']]

In [95]:
df.head(3)

Unnamed: 0,text,sentiment
1,Shanghai is also really exciting (precisely -...,1.0
2,"Recession hit Veronique Branquinho, she has to...",0.0
3,happy bday!,1.0


#3. Preprocess the text data


In [96]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [97]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2, random_state=42)
# Checking types
print(type(X_train))  # pandas Series or DataFrame
print(type(X_test))   # pandas Series or DataFrame
print(type(y_train))  # pandas Series or NumPy array
print(type(y_test))   # pandas Series or NumPy array

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [98]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [99]:
def preprocess(text):
    text = text.lower()
    text = ''.join([word for word in text if word not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [100]:
X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

#4. Train the Word2Vec model

Train a Word2Vec model on the preprocessed training data using Gensim package.

In [101]:
from gensim.models import Word2Vec
sentences = [sentence.split() for sentence in X_train]
print(sentences)



In [102]:
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count,epochs=w2v_model.epochs)



(27426, 63550)

In [103]:
import numpy as np

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

In [104]:
X_train = np.array([vectorize(sentence) for sentence in X_train])
X_test = np.array([vectorize(sentence) for sentence in X_test])

#6. Train a classification model

Train a classification model such as logistic regression, random forests, or support vector machines using the vectorised training data and the sentiment labels.

In [105]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)

#7. Evaluate the model

Evaluate the performance of the classification model on the testing set with the accuracy, precision, recall and F1 score.


In [108]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, pos_label=1.0))
print('Recall:', recall_score(y_test, y_pred, pos_label=1.0))
print('F1 score:', f1_score(y_test, y_pred, pos_label=1.0))

Accuracy: 0.5629453681710214
Precision: 0.5792880258899676
Recall: 0.7682403433476395
F1 score: 0.6605166051660516
