<a href="https://colab.research.google.com/github/SriRamK345/NLP_dataset/blob/main/NLP_ML_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [199]:
# ! git clone https://github.com/krishnaik06/NLP-Live.git

In [200]:
# %cd NLP-Live

In [201]:
import pandas as pd

In [202]:
df = pd.read_csv("/content/NLP-Live/smsspamcollection/SMSSpamCollection", sep = "\t", names = ["label", "message"] )

In [203]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [204]:
df.shape

(5572, 2)

### data cleaning and pre processing

In [205]:
import re
import nltk
import string

In [206]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [207]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [208]:
ps = PorterStemmer()

### Preprocessing

In [209]:
corpus = []
for i in range(0, len(df.message)):
  text = re.sub(r"\d+", "" , df.message[i]) # Remove number
  text = text.translate(str.maketrans("", "", string.punctuation)) # Remove punctuation
  text = text.lower() # Convert test to lowercase
  text = text.split()
  text = ([word for word in text if word not in stopwords.words("english")]) # Remove stopwords
  text = ([ps.stem(word) for word in text]) # Lemmatization
  text = " ".join(text)
  corpus.append(text)

In [210]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri questionstd txt ratetc appli over',
 'u dun say earli hor u c alreadi say',
 'nah dont think goe usf live around though',
 'freemsg hey darl week word back id like fun still tb ok xxx std chg send £ rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea £ prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'im gonna home soon dont want talk stuff anymor tonight k ive cri enough today',
 'six chanc win cash pound txt csh send cost pday day tsandc appli repli hl info',
 'urgent week free membership £ prize jackpot txt word claim tc wwwdbuknet lccltd pobox ldnwarw',
 'ive 

## Creating BOW model

In [211]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True)

X = cv.fit_transform(corpus).toarray()

In [212]:
X[1]

array([0, 0, 0, ..., 0, 0, 0])

In [213]:
X.shape

(5572, 7162)

In [214]:
Y = pd.get_dummies(df.label).astype(int)
Y = Y.iloc[:,1].values

In [215]:
Y

array([0, 0, 1, ..., 0, 0, 0])

## Train Test Split

In [216]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.4,random_state=42)


In [217]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [218]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [219]:
y_pred = clf.predict(X_test)

In [220]:
from sklearn.metrics import accuracy_score, classification_report

In [221]:
score = accuracy_score(y_test, y_pred)
score

0.9739793629430238

In [222]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1938
           1       0.87      0.93      0.90       291

    accuracy                           0.97      2229
   macro avg       0.93      0.96      0.94      2229
weighted avg       0.98      0.97      0.97      2229



## TF-IDF

In [223]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [224]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(corpus)

In [225]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2,random_state=42)

In [226]:
X_train

<4457x37191 sparse matrix of type '<class 'numpy.float64'>'
	with 69950 stored elements in Compressed Sparse Row format>

In [227]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [228]:
y_pred = clf.predict(X_test)

In [229]:
from sklearn.metrics import accuracy_score, classification_report

In [230]:
score = accuracy_score(y_test, y_pred)
score

0.9542600896860987

In [231]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       966
           1       1.00      0.66      0.79       149

    accuracy                           0.95      1115
   macro avg       0.97      0.83      0.88      1115
weighted avg       0.96      0.95      0.95      1115



In [232]:
from sklearn.ensemble import RandomForestClassifier

In [233]:
rf= RandomForestClassifier()
rf.fit(X_train, y_train)

In [234]:
y_pred = clf.predict(X_test)

In [235]:
from sklearn.metrics import accuracy_score, classification_report

In [236]:
score = accuracy_score(y_test, y_pred)
score

0.9542600896860987

In [237]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       966
           1       1.00      0.66      0.79       149

    accuracy                           0.95      1115
   macro avg       0.97      0.83      0.88      1115
weighted avg       0.96      0.95      0.95      1115



## Word2Vec

In [238]:
! pip install gensim



In [239]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [240]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [241]:
corpus = []
for i in range(0, len(df.message)):
  text = re.sub(r"\d+", "" , df.message[i]) # Remove number
  text = text.translate(str.maketrans("", "", string.punctuation)) # Remove punctuation
  text = text.lower() # Convert test to lowercase
  text = text.split()
  text = ([word for word in text if word not in stopwords.words("english")]) # Remove stopwords
  text = ([wnl.lemmatize(word) for word in text]) # Lemmatization
  text = " ".join(text)
  corpus.append(text)

In [242]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply over',
 'u dun say early hor u c already say',
 'nah dont think go usf life around though',
 'freemsg hey darling week word back id like fun still tb ok xxx std chgs send £ rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea £ prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'im gonna home soon dont want talk stuff anymore tonight k ive cried enough today',
 'six chance win cash pound txt csh send cost pday day tsandcs apply reply hl info',
 'urgent week free membership £ prize jackpot txt word claim tc www

In [243]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [244]:
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [245]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [246]:
from gensim.utils import simple_preprocess

words = []

for i in corpus:
  sentence = sent_tokenize(i) # calls the sent_tokenize function on the string i
  for sent in sentence:
    words.append(simple_preprocess(sent))

In [247]:
words # unique word on each sentence

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni'],
 ['free',
  'entry',
  'wkly',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'questionstd',
  'txt',
  'ratetcs',
  'apply',
  'over'],
 ['dun', 'say', 'early', 'hor', 'already', 'say'],
 ['nah', 'dont', 'think', 'go', 'usf', 'life', 'around', 'though'],
 ['freemsg',
  'hey',
  'darling',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'melle',
  'melle',
  'oru',
  'minnaminunginte',
  'nurungu',
  'vettam',
  'set',
  'callertune',
  'caller',
  'press',
  'copy',
  'friend',
  'callertune'],
 ['winner',
  'valued',
  'network',
  'customer',
  'sel

## Train Word2Vec model form Scratch

In [248]:
import gensim

In [249]:
model_ = gensim.models.Word2Vec(words, window = 5, min_count=2)

In [250]:
model_.wv.index_to_key

['call',
 'im',
 'get',
 'ur',
 'go',
 'dont',
 'ok',
 'free',
 'ltgt',
 'know',
 'day',
 'come',
 'like',
 'got',
 'ill',
 'good',
 'time',
 'text',
 'want',
 'love',
 'send',
 'need',
 'one',
 'going',
 'today',
 'txt',
 'home',
 'lor',
 'stop',
 'see',
 'sorry',
 'still',
 'back',
 'mobile',
 'think',
 'reply',
 'take',
 'tell',
 'phone',
 'new',
 'week',
 'well',
 'hi',
 'later',
 'da',
 'please',
 'make',
 'cant',
 'night',
 'say',
 'claim',
 'thing',
 'dear',
 'much',
 'oh',
 'hey',
 'great',
 'give',
 'pls',
 'number',
 'happy',
 'work',
 'friend',
 'hope',
 'message',
 'way',
 'msg',
 'wat',
 'thats',
 'prize',
 'right',
 'min',
 'yes',
 'let',
 'tomorrow',
 'already',
 'ask',
 'said',
 'yeah',
 'really',
 'amp',
 'co',
 'babe',
 'life',
 'miss',
 'meet',
 'didnt',
 'morning',
 'year',
 'win',
 'last',
 'service',
 'thanks',
 'would',
 'anything',
 'ive',
 'find',
 'cash',
 'tone',
 'lol',
 'feel',
 'every',
 'nokia',
 'also',
 'care',
 'sure',
 'pick',
 'contact',
 'keep',
 's

In [251]:
model_.corpus_count

5566

In [252]:
model_.epochs

5

In [253]:
model_.wv.similar_by_word("free")

[('text', 0.9996860027313232),
 ('mobile', 0.9996843934059143),
 ('reply', 0.9996693134307861),
 ('txt', 0.9996500611305237),
 ('call', 0.9996094703674316),
 ('phone', 0.9995978474617004),
 ('per', 0.9995406270027161),
 ('stop', 0.9995403289794922),
 ('msg', 0.9995290637016296),
 ('ur', 0.9995157718658447)]

In [254]:
model_.wv.similar_by_word("good")

[('one', 0.9996905326843262),
 ('give', 0.9996766448020935),
 ('really', 0.9996604919433594),
 ('much', 0.9996599555015564),
 ('love', 0.9996538758277893),
 ('day', 0.9996525049209595),
 ('amp', 0.9996351599693298),
 ('think', 0.9996333718299866),
 ('im', 0.9996315836906433),
 ('go', 0.9996269941329956)]

## Avg Word2Vec

In [255]:
def avg_word2vec(doc):
  return np.mean([model_.wv[word] for word in doc if word in model_.wv.index_to_key], axis=0)

In [256]:
# ! pip install tqdm

In [257]:
from tqdm import tqdm
import numpy as np

In [258]:
words[33]

['fear', 'fainting', 'housework', 'quick', 'cuppa']

In [259]:
X = []

for i in tqdm(range(len(words))):
  X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 5566/5566 [00:02<00:00, 2746.45it/s]


In [260]:
X[0].shape

(100,)

In [261]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [262]:
type(X)

list

In [263]:
X_ = np.array(X)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5566,) + inhomogeneous part.

In [None]:
# Check shapes of elements in X
for i, element in enumerate(X):
  print(f"Shape of element {i}: {np.shape(element)}")

# If different shapes are found, investigate further why avg_word2vec returns different shapes
# Potential solutions include padding shorter arrays or investigating why avg_word2vec is producing varying lengths

In [None]:
Y.shape

In [None]:
len(X)

In [None]:
if X.shape[0] != Y.shape[0]:
    # Handle the mismatch in number of samples
    # Example: If Y has extra samples, truncate to match X
    Y = Y[:X.shape[0]]

In [None]:
len(Y)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.4,random_state=42)


In [None]:
X_train

In [None]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
score = accuracy_score(y_test, y_pred)
score

In [None]:
print(classification_report(y_test, y_pred))