# Word2Vec

IMPORTING LIBRARIES

In [64]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB , GaussianNB

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV


from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score , roc_auc_score , confusion_matrix 

In [7]:
df = pd.read_csv('/Users/sarthaksharna/Text_Classification/data/cleaned_bbc_text')

df.head()

Unnamed: 0,category,text
0,4,tv future hand viewer home theatre system plas...
1,0,worldcom bos leave book alone former worldcom ...
2,3,tiger wary farrell gamble leicester say rush m...
3,3,yeading face newcastle fa cup premiership side...
4,1,ocean twelve raid box office ocean twelve crim...


BUILDING THE CORPUS TO TRAIN CUSTOM WORD2VEC MODEL

In [11]:
corpus = [text.split() for text in df['text']]

TRAINING WORD2VEC MODEL ON CORPUS

In [23]:
model = (Word2Vec(corpus , window = 5 , vector_size = 100 , min_count = 2 , epochs = 80 , workers = 4))

EXAMPLE OF PERFORMANCE OF WORD2VEC EMBEDINGS

In [31]:
model.wv.most_similar('economy')

[('economic', 0.6848961710929871),
 ('export', 0.5342537760734558),
 ('growth', 0.501365602016449),
 ('recovery', 0.46114271879196167),
 ('manufacturing', 0.44830459356307983),
 ('eurozone', 0.44820621609687805),
 ('reconstruction', 0.419453501701355),
 ('tourism', 0.4194468557834625),
 ('spending', 0.4189000725746155),
 ('prosperity', 0.4171815514564514)]

In [17]:
len(model.wv.index_to_key)

16316

AVERAGE Word2Vec

In [36]:
def avg_word2vec(text):
    text = [word for word in text if word in model.wv.index_to_key]
    if len(text) >= 1:
        return np.mean(model.wv[text], axis=0)
    else:
        return np.zeros(model.vector_size)

In [37]:
avg_word2vec(df['text'][0])

array([-0.06186217,  0.47195077,  0.46774596,  0.22015978, -0.01070364,
       -0.73100764,  0.6814023 ,  1.1034652 , -0.81723744, -0.65028024,
       -0.50911295, -0.90445036, -0.02309715, -0.24053508,  0.8122594 ,
       -0.27319038, -0.7675168 , -1.2137283 ,  0.26681498, -0.95029753,
        0.16705774,  0.54618144,  0.16522108, -1.0695883 , -0.4250347 ,
        0.98700804, -1.2009306 ,  0.44621125, -0.6188722 , -0.56393147,
        1.3343891 , -0.15970287,  0.0276325 ,  0.60533893,  0.4096876 ,
        0.30672482,  0.9676891 , -1.1721765 , -0.4365567 , -0.9912437 ,
        0.19740902, -0.48940924, -0.34490502,  0.11961739, -0.06073605,
       -1.1481203 , -0.53604704,  1.014435  ,  0.10032601, -0.14339465,
        1.0143349 , -0.45714152, -0.04736331,  0.28960285, -0.5851607 ,
        0.20471282,  1.0107883 ,  0.0042961 , -0.05704316,  0.66864914,
        0.16763914,  1.8738737 , -0.6079127 ,  0.13694963, -1.4475131 ,
        0.20606405,  0.979309  , -0.05338528, -1.0411755 ,  1.68

In [38]:
from tqdm import tqdm

In [39]:
X = []

for text in tqdm(df['text'].values):
    X.append(avg_word2vec(text))
    


100%|██████████| 2118/2118 [03:13<00:00, 10.93it/s]


In [40]:
X = np.array(X)

In [41]:
X.shape

(2118, 100)

In [42]:
X[0]

array([-0.06186217,  0.47195077,  0.46774596,  0.22015978, -0.01070364,
       -0.73100764,  0.6814023 ,  1.1034652 , -0.81723744, -0.65028024,
       -0.50911295, -0.90445036, -0.02309715, -0.24053508,  0.8122594 ,
       -0.27319038, -0.7675168 , -1.2137283 ,  0.26681498, -0.95029753,
        0.16705774,  0.54618144,  0.16522108, -1.0695883 , -0.4250347 ,
        0.98700804, -1.2009306 ,  0.44621125, -0.6188722 , -0.56393147,
        1.3343891 , -0.15970287,  0.0276325 ,  0.60533893,  0.4096876 ,
        0.30672482,  0.9676891 , -1.1721765 , -0.4365567 , -0.9912437 ,
        0.19740902, -0.48940924, -0.34490502,  0.11961739, -0.06073605,
       -1.1481203 , -0.53604704,  1.014435  ,  0.10032601, -0.14339465,
        1.0143349 , -0.45714152, -0.04736331,  0.28960285, -0.5851607 ,
        0.20471282,  1.0107883 ,  0.0042961 , -0.05704316,  0.66864914,
        0.16763914,  1.8738737 , -0.6079127 ,  0.13694963, -1.4475131 ,
        0.20606405,  0.979309  , -0.05338528, -1.0411755 ,  1.68

In [None]:
# INDEPENDENT VARIABLE
y = df['category']

TRAIN TEST SPLIT

In [44]:
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.2 , random_state = 42)

X_train.shape , X_test.shape , y_train.shape , y_test.shape

((1694, 100), (424, 100), (1694,), (424,))

TRAINING MODELS IN PIPELINE

In [65]:
models = {
    
    'Logistic Regression' : LogisticRegression(),
    'Random Forest' : RandomForestClassifier(),
    'GradientBoostingClassifier' : GradientBoostingClassifier(),
    'XGBClassifier' : XGBClassifier()
    
}

In [66]:
for model_name , clf in models.items() :

    pipe = Pipeline(
        [
            ('classifier' , clf)
        ]
    )

    pipe.fit(X_train , y_train)

    y_pred_train = pipe.predict(X_train)
    y_pred_test = pipe.predict(X_test)

    print(f'Model : {model_name} \n ')

    print(f'Train Accuracy : {accuracy_score(y_train , y_pred_train)}')
    print(f'Train Precision : {precision_score(y_train , y_pred_train , average = "macro")}')
    print(f'Train Recall : {recall_score(y_train , y_pred_train , average = "macro")}')
    print(f'Train F1-Score : {f1_score(y_train , y_pred_train , average = "macro")}')

    print('\n')

    print(f'Test Accuracy : {accuracy_score(y_test , y_pred_test)}')
    print(f'Test Precision : {precision_score(y_test , y_pred_test , average = "macro")}')
    print(f'Test Recall : {recall_score(y_test , y_pred_test , average = "macro")}')
    print(f'Test F1-Score : {f1_score(y_test , y_pred_test , average = "macro")}' , '\n')

    print(f'confusion_matrix : {confusion_matrix(y_test , y_pred_test)}' , '\n')


    print('\n')

    print('=='*50)

    print('\n')      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model : Logistic Regression 
 
Train Accuracy : 0.5867768595041323
Train Precision : 0.5749562953418123
Train Recall : 0.5672787995254541
Train F1-Score : 0.5671068656176864


Test Accuracy : 0.5825471698113207
Test Precision : 0.5820957911867003
Test Recall : 0.5693087798739623
Test F1-Score : 0.5624900406109541 

confusion_matrix : [[73  5  8  6  5]
 [16 25  6 24 10]
 [12  4 43 14  5]
 [ 8  5  3 69  9]
 [12  3 14  8 37]] 





Model : Random Forest 
 
Train Accuracy : 1.0
Train Precision : 1.0
Train Recall : 1.0
Train F1-Score : 1.0


Test Accuracy : 0.5683962264150944
Test Precision : 0.5661136656715604
Test Recall : 0.5549799916620125
Test F1-Score : 0.5479176566123474 

confusion_matrix : [[71  9 10  4  3]
 [18 23 15 20  5]
 [11  5 42 15  5]
 [11  2  4 69  8]
 [14  5 10  9 36]] 





Model : GradientBoostingClassifier 
 
Train Accuracy : 0.9569067296340024
Train Precision : 0.9586711063302598
Train Recall : 0.9546354598302618
Train F1-Score : 0.9564484531404307


Test Accuracy : 0