In [3]:
import pandas as pd
# Read the two provided splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
# Stack both splits row-wise into one frame with a fresh 0..n-1 index.
df = pd.concat((train, test), ignore_index=True)

In [5]:
# Preview the first five rows of the combined frame.
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [6]:
# (rows, columns) of the combined frame.
df.shape

(127600, 3)

In [7]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
Class Index,0
Title,0
Description,0


In [8]:
# Number of rows per class label, to check class balance.
df['Class Index'].value_counts()

Unnamed: 0_level_0,count
Class Index,Unnamed: 1_level_1
3,31900
4,31900
2,31900
1,31900


In [9]:
# Merge headline and body into one text field, separated by a single space.
df['text'] = df['Title'] + ' ' + df['Description']

In [10]:
# Confirm the new `text` column was added.
df.head()

Unnamed: 0,Class Index,Title,Description,text
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."


In [11]:
# Keep only the label and the merged text. The explicit .copy() makes `df` an
# independent frame, so the column assignments made later in the notebook do
# not raise SettingWithCopyWarning.
df = df[['Class Index','text']].copy()

In [12]:
# Confirm only `Class Index` and `text` remain.
df.head()

Unnamed: 0,Class Index,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


In [19]:
# Lowercase the text column.
df['text'] = df['text'].str.lower()

In [37]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used further down: the WordNet corpus (for the
# WordNet lemmatizer) and the Punkt tokenizer models (for word_tokenize).
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
# NOTE(review): `stemmer` is not referenced anywhere else in the visible
# notebook (preprocess_text lemmatizes instead) -- confirm it is still needed.
stemmer  = PorterStemmer()

In [22]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by preprocess_text.
lemmatizer = WordNetLemmatizer()

In [23]:
# Fetch the stopword lists read via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
# Built once at definition time. Calling stopwords.words('english') inside the
# comprehension rebuilt the list for every single token, and list membership
# is a linear scan; a frozenset gives O(1) lookups with identical results.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Clean one document and return its lemmatized, stopword-free tokens.

    Steps, in order: delete every character that is not an ASCII letter or
    whitespace, lowercase, tokenize with ``word_tokenize``, drop English
    stopwords, and lemmatize each remaining token as a verb (``pos='v'``).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    # NOTE(review): removed characters are deleted, not replaced by a space,
    # so text such as "worries\about" or "all-time" collapses into one token.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    # Filter stopwords first, then lemmatize (this is lemmatization, not stemming).
    return [
        lemmatizer.lemmatize(word, pos='v')
        for word in tokens
        if word not in _ENGLISH_STOP_WORDS
    ]

In [40]:
# Tokenize every document; each cell of `tokens` holds a list of strings.
df['tokens'] = df['text'].apply(preprocess_text)

In [54]:
# Sanity check: one token list per row.
df['tokens'].shape

(127600,)

In [41]:
# Inspect the label column.
df['Class Index']

Unnamed: 0,Class Index
0,3
1,3
2,3
3,3
4,3
...,...
127595,1
127596,2
127597,2
127598,3


In [42]:
# Re-check the per-class row counts after preprocessing.
df['Class Index'].value_counts()

Unnamed: 0_level_0,count
Class Index,Unnamed: 1_level_1
3,31900
4,31900
2,31900
1,31900


In [43]:
import gensim
from gensim.models import Word2Vec

In [44]:
# Train 100-dimensional embeddings on the tokenized corpus.
# min_count=1 keeps every token, so each token seen here gets a vector.
# seed fixes the initial weights so repeated runs start from the same state.
# NOTE(review): gensim documents that fully deterministic training also needs
# workers=1 (and a fixed PYTHONHASHSEED); workers=4 is kept here for speed.
words2vec_model = Word2Vec(
    sentences=df['tokens'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    seed=42,
)

In [53]:
# Word2Vec model's keyed vectors (the object holding the vocabulary and embeddings)
words2vec_model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7c78053c7790>

In [57]:
import numpy as np
def document_vector(tokens):
    """Return the mean Word2Vec embedding of a token list.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document. Tokens missing from the model vocabulary
        are skipped.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``words2vec_model.vector_size``. All zeros when
        no token has an embedding (e.g. an empty token list).
    """
    wv = words2vec_model.wv
    vectors = [wv[word] for word in tokens if word in wv]
    if not vectors:
        # Use the embedding dtype for the fallback: a float64 zero vector
        # would upcast the whole matrix when the rows are stacked later.
        return np.zeros(words2vec_model.vector_size, dtype=wv.vectors.dtype)
    return np.mean(vectors, axis=0)

In [58]:
# Embed every document as the mean of its token vectors.
df['vectors'] = df['tokens'].apply(document_vector)

In [62]:
# Sanity check: one vector entry per row.
df['vectors'].shape

(127600,)

In [63]:
# Targets: the class labels, kept as a pandas Series.
y_vectors = df['Class Index']
# Features: stack the per-document vectors into one 2-D array (one row each).
x_vectors = np.vstack(df['vectors'])

In [66]:
# Inspect the stacked feature matrix.
x_vectors

array([[-0.8750112 ,  0.72615504, -0.3115762 , ..., -0.34584612,
         0.15036756, -0.40160763],
       [-0.05344198,  0.41425377,  0.24119267, ..., -0.0365239 ,
         0.18032113, -0.16295998],
       [ 0.6121694 ,  1.3684978 , -0.41290748, ..., -0.28187716,
        -0.34207165, -0.16697323],
       ...,
       [-0.17162871,  0.10034728, -0.04066098, ..., -0.92556   ,
         0.2926494 , -0.3756822 ],
       [ 0.61971956,  0.13616231, -0.0525265 , ..., -0.39066893,
        -0.168624  , -0.16779372],
       [ 0.03954275,  0.22472464,  0.04867087, ..., -0.0841822 ,
         0.05908742,  0.32509398]], dtype=float32)

In [67]:
#Train Test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_vectors,
    y_vectors,
    test_size=0.2,
    random_state=42,
)

In [70]:
#Apply a machine learning model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report

In [71]:
# random_state makes the forest (bootstrap samples and feature subsets)
# reproducible, so the reported metrics can be regenerated; n_jobs=-1 builds
# the trees on all available cores without changing the fitted model.
classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
classifier.fit(x_train,y_train)

In [72]:
# Predict class labels for the held-out rows.
y_pred = classifier.predict(x_test)

In [73]:
# Emit the header, overall accuracy and per-class report, one item per line.
print(
    "Word2Vec Classifier Performance",
    f"Accuracy: {accuracy_score(y_test, y_pred)}",
    "Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Word2Vec Classifier Performance
Accuracy: 0.8890282131661442
Classification Report:
              precision    recall  f1-score   support

           1       0.91      0.88      0.89      6283
           2       0.94      0.97      0.95      6466
           3       0.85      0.86      0.85      6370
           4       0.86      0.85      0.86      6401

    accuracy                           0.89     25520
   macro avg       0.89      0.89      0.89     25520
weighted avg       0.89      0.89      0.89     25520

