# NLP Tutorial

## 1. Importing necessary dependencies

In [None]:
import numpy as np
import pandas as pd
import nltk
# nltk.download("stopwords")

In [None]:
##  2. Importing the dataset
- The dataset is a tab-separated text file of SMS messages labelled for spam classification, on which we will apply NLP methods such as BoW, TF-IDF, and word-embedding techniques.
- It has two columns: `label` (the output) and `message` (the input).

In [None]:
import csv  # stdlib; only needed for the QUOTE_NONE constant below

# Load the SMS spam dataset from a tab-separated text file.
# - sep='\t' : "separator" - the delimiter between columns; \t denotes a tab character.
# - names    : column names supplied explicitly, so the first line is read as data.
# - quoting  : csv.QUOTE_NONE makes double quotes ordinary text. With the default
#              quoting, a message that starts with an unmatched '"' is parsed as a
#              quoted field that runs on until the next '"', silently merging rows.
messages = pd.read_csv(
    "SMSSpamCollection.txt",
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Quick look at the loaded data. Only the last expression of a cell is rendered, so a
# bare `messages` on a non-final line shows nothing: print the shape explicitly and
# let a short preview of the frame be the cell output.
print(messages.shape)
messages.head()

In [None]:
# Inspect one raw message (row label 120) before any preprocessing.
messages.loc[120, 'message']

In [None]:
### Steps to be followed
1. Text Preprocessing 1 - Tokenisation, stopword removal, stemming, and lemmatization using NLTK.
2. Text Preprocessing 2 - BoW, TF-IDF, Word2Vec, and AvgWord2Vec using the gensim module.

#### re - regular expression module
- `re` is Python's built-in module for working with regular expressions.
- Regular expressions are sequences of characters that define a search pattern, mainly used for pattern matching within strings.
- The re module provides functions to perform various operations using regular expressions in Python.

In [None]:
import re #regular expression
import nltk
# nltk.download("stopwords")

In [None]:
# Stemming setup
from nltk.corpus import stopwords  # source of the English stop-word list used during preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()  # one stemmer instance, reused for every word in the preprocessing loop
# Alternative import path: from nltk.stem import PorterStemmer

In [None]:
# Build the cleaned corpus: one normalised, stemmed string per message.
# The stop-word list is loaded once into a set; calling stopwords.words("english")
# inside the comprehension would rebuild the list for every single word.
stop_words = set(stopwords.words("english"))

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for message in messages['message']:
    # Replace every character that is not a letter or a digit with a space.
    review = re.sub('[^a-zA-Z0-9]', ' ', message)
    # Lower-case and split on whitespace to get the individual tokens.
    review = review.lower().split()
    # Drop stop words, then reduce each remaining token to its stem.
    review = [ps.stem(word) for word in review if word not in stop_words]
    corpus.append(' '.join(review))

# Preview only the first few entries instead of dumping the whole corpus.
corpus[:10]


In [None]:
- **re.sub(pattern, replacement, string)** is the syntax for using re.sub() to **substitute patterns in a string.**
- Here the pattern `[^a-zA-Z0-9]` matches every character that is not a letter (lowercase or uppercase) or a digit, and each match in the message string is replaced with a space.

### Creating a Bag of Words model

In [None]:
# Bag of Words: binary presence/absence features over unigrams and bigrams,
# limited to 2500 features.
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test
# split further down, so the vocabulary is learned from test messages as well.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(
    max_features=2500,
    binary=True,
    ngram_range=(1, 2),
)
X = cv.fit_transform(corpus).toarray()
X

In [None]:
# Encode the labels as a 0/1 vector: one-hot encode `label`, cast to int, and keep
# the second dummy column (position 1) as a NumPy array.
# NOTE(review): get_dummies orders its columns by sorted label value, so position 1
# is presumably 'spam' (after 'ham') - confirm against the data.
y = (
    pd.get_dummies(messages['label'])
    .astype(int)
    .iloc[:, 1]
    .values
)
y

In [None]:
### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and the rest for evaluation; the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=69,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the Bag-of-Words training split.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [None]:
# Predict labels for the held-out split with the model fitted on the training split.
y_pred = spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the Bag-of-Words model on the held-out split.
# Both metrics take (y_true, y_pred) in that order. Accuracy is symmetric, so the
# value is unaffected, but using the documented order keeps this cell consistent
# with the order-sensitive classification report below it.
score = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {score}")
print(classification_report(y_test, y_pred))

In [None]:
## Creating a TF-IDF Model

In [None]:
# TF-IDF features over unigrams and bigrams, limited to 2500 features.
# This rebinds `X`, replacing the Bag-of-Words matrix built earlier.
# NOTE(review): as with the Bag-of-Words step, the vectorizer is fitted on the full
# corpus before the train/test split.
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(
    max_features=2500,
    ngram_range=(1, 2),
)
X = tv.fit_transform(corpus).toarray()
X

In [None]:
# Train-test split of the TF-IDF features: 20% held out, same seed as the
# Bag-of-Words split. This rebinds X_train / X_test / y_train / y_test.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=69,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a fresh multinomial Naive Bayes classifier on the TF-IDF training split;
# this rebinds `spam_detect_model`.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [None]:
# Predict labels for the held-out split; this rebinds `y_pred` from the earlier model's predictions.
y_pred = spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the TF-IDF model on the held-out split.
# Both metrics take (y_true, y_pred) in that order. Passing the predictions first to
# classification_report swaps precision with recall for every class and computes
# `support` from the predicted labels instead of the true ones.
score = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {score}")
print(classification_report(y_test, y_pred))

In [None]:
## Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the same train/test split as the preceding model.
# A random forest is stochastic (bootstrap samples, random feature subsets), so the
# seed is fixed to make the reported scores reproducible across runs; 69 matches
# the seed used for the train/test split.
rf = RandomForestClassifier(random_state=69)
rf.fit(X_train, y_train)

# Score the forest on the held-out split, passing (y_true, y_pred) in that order.
y_pred_rf = rf.predict(X_test)
score = accuracy_score(y_test, y_pred_rf)
print(score)
print(classification_report(y_test, y_pred_rf))