In [3]:
import pandas as pd
import spacy
import warnings
warnings.filterwarnings("ignore")

In [4]:
data = pd.read_csv("spam_ham_dataset.csv")
data.columns, data.label.value_counts()

(Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object'),
 ham     3672
 spam    1499
 Name: label, dtype: int64)

In [5]:
data["label_num"] = data.label.map({"spam" : 1,"ham":0})
data["label_num"].value_counts()

0    3672
1    1499
Name: label_num, dtype: int64

In [6]:
data.drop('label',axis='columns', inplace = True)
data.columns

Index(['Unnamed: 0', 'text', 'label_num'], dtype='object')

## This is clearly a class imbalance problem now we have to make 2 class ham and spam has same number of samples. This we can do in many ways
1. Under Sampling:-  where we take minority class number of samples for majority class "df_class_0_under = df_class_0.sample(count_class_1)"
2. Over Sampling:- Where we duplicate the minority class sample equal to majority class"df_class_1_over = df_class_1.sample(count_class_0, replace=True)"
3. SMOTE:- SMOTE stands for Synthetic Minority Oversampling Technique: Using imbalance learn it uses knn to generate nearest sample

In [7]:
## SMOTE Example works at integer or float columns
# from imblearn.over_sampling import SMOTE #pip install imbalanced-learn
# X = data.drop('label_num',axis='columns')
# y = data['label_num']
# smote = SMOTE(sampling_strategy='minority')
# X_sm, y_sm = smote.fit_resample(X, y)

# y_sm.value_counts()

In [8]:
## DO Over sampling
# Class count
count_class_0, count_class_1 = data.label_num.value_counts()
# Divide by class
df_class_0 = data[data['label_num'] == 0]
df_class_1 = data[data['label_num'] == 1]

df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_class_1_over.shape,df_class_0.shape
data_balanced = pd.concat([df_class_0,df_class_1_over])
data_balanced.shape,data_balanced.label_num.value_counts()
data_balanced["Pre_processed_text"] = data_balanced["text"]

## Using Spacy do preprocessing and Do some general processing

In [9]:
import re
bindings = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    "i'd" : "I would",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "let's":"let us",
    "wasn't": "was not",
    "we'll":" will",
    "didn't": "did not",
    "wasnt": "was not"
}
regex={
    re.compile('|'.join([r'(\$\d*\.\d{1,2}\+?)',r'(\$\d+\+?)',r'(\$\d+\.?\+?)',r'(\d+(?:[\.,]\d+)?dollars)',r'(\d+(?:[\.,]\d+)? usd)', r'(\d+(?:[\.,]\d+)?dollar)'])) : r"dollar ||curr||",

    re.compile(r' \$+ ', re.UNICODE) : r" dollar ",
    re.compile('|'.join([r'\b\d+ hours\b', r'\b\d+ hrs\b',r'\b\d+hour\b', r'\b\d+ hr\b',r'\b\d+-hours\b', r'\b\d+-hrs\b',r'\b\d+ -hours\b', r'\b\d+ - hrs\b'])) : r"hours ||duration||",

    re.compile('|'.join([r'\b\d+ minutes\b', r'\b\d+ mins\b', r'\b\d+min\b',r'\b\d+-minutes\b', r'\b\d+-mins\b', r'\b\d+-min\b',r'\b\d+ -minutes\b', r'\b\d+ - mins\b', r'\b\d+ - min\b'])) : r"minutes||duration||",

    re.compile('|'.join([r'\b\d+ seconds\b', r'\b\d+ secs\b', r'\b\d+sec\b',r'\b\d+-seconds\b', r'\b\d+-secs\b', r'\b\d+-sec\b',r'\b\d+ -seconds\b', r'\b\d+ - secs\b', r'\b\d+ - sec\b'])) : r"seconds||duration||",

    re.compile(r'\b\d+:\d+') : r"||abstime||"
}

def replaceBindings(text_df,col,bi_dict):
    #Input: Data frame and text column and dict for replacement
    #Output: Text column converted by replacing bindings
    text_df[col]=text_df[col].replace(bi_dict,regex=True)
    return text_df


def replaceRegex(text_df,col,reg_dict):
    #Input: Data frame and text column and dict for replacement
    #Output: Text column converted by replacing regex
    text_df[col]=text_df[col].replace(reg_dict,regex=True)
    return text_df


## Step1 : Remove unwanted text and special characters
data_balanced['Pre_processed_text'] = data_balanced['Pre_processed_text'].apply(lambda x: x.replace("Subject:",""))
data_balanced['Pre_processed_text'] = data_balanced['Pre_processed_text'].apply(lambda x: x.replace("\r\n"," "))
special_list = ["%","*",'+','[',']',"-","=","_","|","<",">","@",'/',":",";","#"]
for sc in special_list:
    data_balanced['Pre_processed_text'] = data_balanced['Pre_processed_text'].apply(lambda x: x.replace(sc,""))

## Step2: Lower the text
data_balanced['Pre_processed_text'] = data_balanced['Pre_processed_text'].apply(lambda x: x.lower())

## Step3: Replace the Bindings
data_balanced = replaceBindings(data_balanced,'Pre_processed_text',bindings)

## Step4: Replace the Regex
data_balanced = replaceRegex(data_balanced,'Pre_processed_text',regex)

## Step 5 using spacy remove stop words punctuatutions and get lemmatization of words

import spacy
nlp = spacy.load("en_core_web_lg")
def preprocessing(text):
    doc = nlp(text) ### getting doc tokenize
    nlp.vocab['not'].is_stop = "False"
    req_token = []
    for token in doc:
        if token.is_stop or token.is_punct: ## Get only token which are not stop words and punctuation
            continue
        req_token.append(token.lemma_) ## Get the lemmatization of a token
    return " ".join(req_token)

data_balanced["Pre_processed_text"] = data_balanced['Pre_processed_text'].apply(lambda x: preprocessing(x))
data_balanced.to_csv("Pre_Processed_text.csv")


## Try to develop a model which identify a news is fake or real. 

### In general text is converted to number vector in 5 ways 
1. label Encoding (Give the unique number to all the words in vocabulary and tag the documnet) not famous has lot of drawbacks. No meaning captured, data is sparse, for new word in test need to be treated as same.
2. One Hot Encoding, one hot for all the words in vocab, not famous has lot of drawbacks. No meaning captured, data is sparse, for new word in test need to be treated as same.
3. Bag of Words, Count the number of times word present in a documnet. less sparse dataset, still meaning between the words are not captured.
4. TF-IDF: tf(total number of time term t present in doc A/ total number of terms in doc A) * idf(log(total number of doc present/number of documnets term t present)) ex:- 48/1000 * 4/3,4/1. As n increases dimensionlity increases, Doesn't capture relation between words, doesn't address out of vocabulary problem. 
5. Word embeddings(GLove(method -Continuos bag of words), Gensim(method -Continuos bag of words),FastText(method - Character n gram), Bert(Transformers).


## Using Bag of words and build a classification model With Preprocessing txt

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

In [11]:
X_train,X_test,y_train,y_test = train_test_split(data_balanced.Pre_processed_text,data_balanced.label_num, test_size = 0.2,random_state=2023, stratify=data_balanced.label_num)
X_train.shape,y_test.value_counts()

((5875,),
 1    735
 0    734
 Name: label_num, dtype: int64)

In [12]:
## example of CountVectorizer
document = ["One praneeth helps Two friends"]

# Create a Vectorizer Object where ngram = 1
vectorizer = CountVectorizer()
 
vectorizer.fit(document)
# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)

vector = vectorizer.transform(document)
vector.toarray()

# n-gram =2 
vectorizer = CountVectorizer(ngram_range = (1,2))
vectorizer.fit(document)
# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)

vector = vectorizer.transform(document)
vector.toarray()



Vocabulary:  {'one': 2, 'praneeth': 3, 'helps': 1, 'two': 4, 'friends': 0}
Vocabulary:  {'one': 3, 'praneeth': 5, 'helps': 1, 'two': 7, 'friends': 0, 'one praneeth': 4, 'praneeth helps': 6, 'helps two': 2, 'two friends': 8}


array([[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [13]:
## Build a pipeline wich will do vectorizer with ngram = 1 and model
Kn_pipeline= Pipeline([
    ("vectorizer",CountVectorizer()),
    ("model",KNeighborsClassifier(n_neighbors = 2 , metric = "cosine"))
])
Kn_pipeline.fit(X_train,y_train)
y_pred = Kn_pipeline.predict(X_test)
print("KNN Classifier with bag of words ngram = 1: \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 1,2,3 and model
Kn_pipeline= Pipeline([
    ("vectorizer",CountVectorizer(ngram_range = (1,3))),
    ("model",KNeighborsClassifier(n_neighbors = 2 , metric = "cosine"))
])
Kn_pipeline.fit(X_train,y_train)
y_pred = Kn_pipeline.predict(X_test)
print("KNN Classifier with bag of words ngram = 1,2,3 : \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 3 and model
Kn_pipeline= Pipeline([
    ("vectorizer",CountVectorizer(ngram_range = (3,3))),
    ("model",KNeighborsClassifier(n_neighbors = 2 , metric = "cosine"))
])
Kn_pipeline.fit(X_train,y_train)
y_pred = Kn_pipeline.predict(X_test)
print("KNN Classifier with bag of words ngram = 3 : \n {}".format(classification_report(y_test,y_pred)))


KNN Classifier with bag of words ngram = 1: 
               precision    recall  f1-score   support

           0       0.96      0.99      0.97       734
           1       0.99      0.96      0.97       735

    accuracy                           0.97      1469
   macro avg       0.97      0.97      0.97      1469
weighted avg       0.97      0.97      0.97      1469

KNN Classifier with bag of words ngram = 1,2,3 : 
               precision    recall  f1-score   support

           0       0.96      0.99      0.97       734
           1       0.99      0.95      0.97       735

    accuracy                           0.97      1469
   macro avg       0.97      0.97      0.97      1469
weighted avg       0.97      0.97      0.97      1469

KNN Classifier with bag of words ngram = 3 : 
               precision    recall  f1-score   support

           0       1.00      0.92      0.96       734
           1       0.93      1.00      0.96       735

    accuracy                          

In [14]:
## Build a pipeline wich will do vectorizer with ngram = 1,2,3 and model Random Forest

RF_pipeline= Pipeline([
    ("vectorizer",CountVectorizer()),
    ("model",RandomForestClassifier(n_estimators = 100, criterion = "entropy"))
])
RF_pipeline.fit(X_train,y_train)
y_pred = RF_pipeline.predict(X_test)
print("RF Classifier with bag of words ngram = 1: \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 1,2,3 and model
RF_pipeline= Pipeline([
    ("vectorizer",CountVectorizer(ngram_range = (1,3))),
    ("model",RandomForestClassifier(n_estimators = 100, criterion = "entropy"))
])
RF_pipeline.fit(X_train,y_train)
y_pred = RF_pipeline.predict(X_test)
print("RF Classifier with bag of words ngram = 1,2,3 : \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 3 and model
RF_pipeline= Pipeline([
    ("vectorizer",CountVectorizer(ngram_range = (3,3))),
    ("model",RandomForestClassifier(n_estimators = 100, criterion = "entropy"))
])
RF_pipeline.fit(X_train,y_train)
y_pred = RF_pipeline.predict(X_test)
print("RF Classifier with bag of words ngram = 3 : \n {}".format(classification_report(y_test,y_pred)))



RF Classifier with bag of words ngram = 1: 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       734
           1       0.98      1.00      0.99       735

    accuracy                           0.99      1469
   macro avg       0.99      0.99      0.99      1469
weighted avg       0.99      0.99      0.99      1469

RF Classifier with bag of words ngram = 1,2,3 : 
               precision    recall  f1-score   support

           0       1.00      0.96      0.98       734
           1       0.96      1.00      0.98       735

    accuracy                           0.98      1469
   macro avg       0.98      0.98      0.98      1469
weighted avg       0.98      0.98      0.98      1469

RF Classifier with bag of words ngram = 3 : 
               precision    recall  f1-score   support

           0       1.00      0.69      0.82       734
           1       0.76      1.00      0.87       735

    accuracy                           0.

In [15]:
from sklearn.naive_bayes import MultinomialNB
## Build a pipeline wich will do vectorizer with ngram = 1 and model
NB_pipeline= Pipeline([
    ("vectorizer",CountVectorizer()),
    ("model",MultinomialNB(alpha = 0.75))
])
NB_pipeline.fit(X_train,y_train)
y_pred = NB_pipeline.predict(X_test)
print("NB Classifier with bag of words ngram = 1 : \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 1,2,3 and model
NB_pipeline= Pipeline([
    ("vectorizer",CountVectorizer(ngram_range = (1,3))),
    ("model",MultinomialNB(alpha = 0.75))
])
NB_pipeline.fit(X_train,y_train)
y_pred = NB_pipeline.predict(X_test)
print("NB Classifier with bag of words ngram = 1,2,3 : \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 3 and model
NB_pipeline= Pipeline([
    ("vectorizer",CountVectorizer(ngram_range = (3,3))),
    ("model",MultinomialNB(alpha = 0.75))
])
NB_pipeline.fit(X_train,y_train)
y_pred = NB_pipeline.predict(X_test)
print("NB Classifier with bag of words ngram = 3 : \n {}".format(classification_report(y_test,y_pred)))

NB Classifier with bag of words ngram = 1 : 
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       734
           1       0.97      0.99      0.98       735

    accuracy                           0.98      1469
   macro avg       0.98      0.98      0.98      1469
weighted avg       0.98      0.98      0.98      1469

NB Classifier with bag of words ngram = 1,2,3 : 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       734
           1       0.99      0.99      0.99       735

    accuracy                           0.99      1469
   macro avg       0.99      0.99      0.99      1469
weighted avg       0.99      0.99      0.99      1469

NB Classifier with bag of words ngram = 3 : 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       734
           1       1.00      0.94      0.97       735

    accuracy                           0

## Now try to generate word 2 vector using TFIDF Method

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

## Build a pipeline wich will do vectorizer and model  KNNN
KN_pipeline= Pipeline([
    ("vectorizer",TfidfVectorizer()),
    ("model",KNeighborsClassifier(n_neighbors = 2 , metric = "cosine"))
])
KN_pipeline.fit(X_train,y_train)
y_pred = KN_pipeline.predict(X_test)
print("KNN Classifier with TFIDF: \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 1,2,3 and model RF
RF_pipeline= Pipeline([
    ("vectorizer",TfidfVectorizer()),
    ("model",RandomForestClassifier(n_estimators = 100, criterion = "entropy"))
])
RF_pipeline.fit(X_train,y_train)
y_pred = RF_pipeline.predict(X_test)
print("RF Classifier with TFIDF: \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 3 and model Navie Bayes
NB_pipeline= Pipeline([
    ("vectorizer",TfidfVectorizer()),
    ("model",MultinomialNB(alpha = 0.75))
])
NB_pipeline.fit(X_train,y_train)
y_pred = NB_pipeline.predict(X_test)
print("NB Classifier with TFIDF: \n {}".format(classification_report(y_test,y_pred)))

KNN Classifier with TFIDF: 
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       734
           1       0.99      0.96      0.98       735

    accuracy                           0.98      1469
   macro avg       0.98      0.98      0.98      1469
weighted avg       0.98      0.98      0.98      1469

RF Classifier with TFIDF: 
               precision    recall  f1-score   support

           0       1.00      0.97      0.99       734
           1       0.97      1.00      0.99       735

    accuracy                           0.99      1469
   macro avg       0.99      0.99      0.99      1469
weighted avg       0.99      0.99      0.99      1469

NB Classifier with TFIDF: 
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       734
           1       0.97      0.98      0.97       735

    accuracy                           0.97      1469
   macro avg       0.97      0.97      0.97

## Word to Vectors Using Spacy pre trained model (GLOVE)

In [17]:
data_balanced["Text_vectors"] = data_balanced["Pre_processed_text"].apply(lambda x: nlp(x).vector)
data_balanced.head()

Unnamed: 0.1,Unnamed: 0,text,label_num,Pre_processed_text,Text_vectors
0,605,Subject: enron methanol ; meter # : 988291\r\n...,0,enron methanol meter 988291 follow note...,"[-0.116396755, -0.01640397, -1.3500654, 1.1144..."
1,2349,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,hpl nom january 9 2001 attached file hplno...,"[0.4305962, -3.5029912, -0.22538322, 0.4564125..."
2,3624,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,neon retreat ho ho ho wonderful time year ...,"[0.15420337, 0.414349, -1.8427932, -0.23462386..."
4,2030,Subject: re : indian springs\r\nthis deal is t...,0,indian spring deal book teco pvr revenue u...,"[0.40145546, 0.8645684, -2.441826, -0.648139, ..."
5,2949,Subject: ehronline web address change\r\nthis ...,0,ehronline web address change message intend ...,"[1.1535546, 0.14871399, 1.3144618, -0.7234356,..."


In [18]:
X_train,X_test,y_train,y_test = train_test_split(data_balanced.Text_vectors,data_balanced.label_num, test_size = 0.2,random_state=2023, stratify=data_balanced.label_num)
X_train.shape,y_test.value_counts()

((5875,),
 1    735
 0    734
 Name: label_num, dtype: int64)

In [19]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [20]:
X_train[0].shape
X_train_stack = np.stack(X_train)
X_train_stack[0].shape

(300,)

In [21]:
X_train_stack = np.stack(X_train)
X_test_stack = np.stack(X_test)
scaler = MinMaxScaler()
X_train_stack_scald = scaler.fit_transform(X_train_stack)
X_test_stack_scald = scaler.transform(X_test_stack)
X_test_stack_scald

array([[0.44210124, 0.46387514, 0.45495138, ..., 0.3466497 , 0.4272204 ,
        0.5262414 ],
       [0.28936383, 0.5348895 , 0.4673059 , ..., 0.37713078, 0.56919205,
        0.5240829 ],
       [0.3544381 , 0.40016073, 0.49442762, ..., 0.31481326, 0.56105137,
        0.47268748],
       ...,
       [0.31870535, 0.53391814, 0.34457618, ..., 0.4225276 , 0.45700148,
        0.52994436],
       [0.41558844, 0.5996464 , 0.3784262 , ..., 0.266191  , 0.48799622,
        0.49995637],
       [0.25866765, 0.7273883 , 0.41262874, ..., 0.51113445, 0.4243504 ,
        0.4788338 ]], dtype=float32)

In [22]:

## Build a pipeline wich will do vectorizer and model  KNNN
KN_pipeline= Pipeline([
    
    ("model",KNeighborsClassifier(n_neighbors = 2 , metric = "cosine"))
])
KN_pipeline.fit(X_train_stack_scald,y_train)
y_pred = KN_pipeline.predict(X_test_stack_scald)
print("KNN Classifier with sapcy w2v: \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 1,2,3 and model RF
RF_pipeline= Pipeline([

    ("model",RandomForestClassifier(n_estimators = 100, criterion = "entropy"))
])
RF_pipeline.fit(X_train_stack_scald,y_train)
y_pred = RF_pipeline.predict(X_test_stack_scald)
print("RF Classifier with sapcy w2v: \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 3 and model Navie Bayes
NB_pipeline= Pipeline([

    ("model",MultinomialNB(alpha = 0.75))
])
NB_pipeline.fit(X_train_stack_scald,y_train)
y_pred = NB_pipeline.predict(X_test_stack_scald)
print("NB Classifier with sapcy w2v: \n {}".format(classification_report(y_test,y_pred)))

KNN Classifier with sapcy w2v: 
               precision    recall  f1-score   support

           0       0.97      0.96      0.97       734
           1       0.96      0.97      0.97       735

    accuracy                           0.97      1469
   macro avg       0.97      0.97      0.97      1469
weighted avg       0.97      0.97      0.97      1469

RF Classifier with sapcy w2v: 
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       734
           1       0.96      0.99      0.97       735

    accuracy                           0.97      1469
   macro avg       0.97      0.97      0.97      1469
weighted avg       0.97      0.97      0.97      1469

NB Classifier with sapcy w2v: 
               precision    recall  f1-score   support

           0       0.78      0.71      0.74       734
           1       0.73      0.80      0.77       735

    accuracy                           0.75      1469
   macro avg       0.76      0.

## Using Gensim Get Word2Vec

In [2]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')



In [1]:
import spacy
nlp = spacy.load("en_core_web_lg") # if this fails then run "python -m spacy download en_core_web_lg" to download that model

def preprocess_and_vectorize(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
        
    return wv.get_mean_vector(filtered_tokens)

In [24]:
data_balanced["Gensim_vectors"] = data_balanced["Pre_processed_text"].apply(lambda x: preprocess_and_vectorize(x))

In [27]:

#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test = train_test_split(
    data_balanced.Gensim_vectors.values, 
    data_balanced.label_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2023,
    stratify=data_balanced.label_num
)

print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)


X_train_2d = np.stack(X_train)
X_test_2d =  np.stack(X_test)

print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

Shape of X_train before reshaping:  (5875,)
Shape of X_test before reshaping:  (1469,)
Shape of X_train after reshaping:  (5875, 300)
Shape of X_test after reshaping:  (1469, 300)


In [30]:

## Build a pipeline wich will do vectorizer and model  KNNN
KN_pipeline= Pipeline([
    
    ("model",KNeighborsClassifier(n_neighbors = 2 , metric = "cosine"))
])
KN_pipeline.fit(X_train_2d,y_train)
y_pred = KN_pipeline.predict(X_test_2d)
print("KNN Classifier with genism w2v: \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 1,2,3 and model RF
RF_pipeline= Pipeline([

    ("model",RandomForestClassifier(n_estimators = 100, criterion = "entropy"))
])
RF_pipeline.fit(X_train_2d,y_train)
y_pred = RF_pipeline.predict(X_test_2d)
print("RF Classifier with genism w2v: \n {}".format(classification_report(y_test,y_pred)))

## Build a pipeline wich will do vectorizer with ngram = 3 and model Navie Bayes
# NB_pipeline= Pipeline([

#     ("model",MultinomialNB(alpha = 0.75))
# ])
# NB_pipeline.fit(X_train_2d,y_train)
# y_pred = NB_pipeline.predict(X_test_2d)
# print("NB Classifier with genism w2v: \n {}".format(classification_report(y_test,y_pred)))

KNN Classifier with genism w2v: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98       734
           1       0.98      0.97      0.98       735

    accuracy                           0.98      1469
   macro avg       0.98      0.98      0.98      1469
weighted avg       0.98      0.98      0.98      1469

RF Classifier with genism w2v: 
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       734
           1       0.98      0.99      0.98       735

    accuracy                           0.98      1469
   macro avg       0.98      0.98      0.98      1469
weighted avg       0.98      0.98      0.98      1469

