<a href="https://colab.research.google.com/github/SumathiGit/NLP-2/blob/main/Feature_Extraction_from_Text_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import pandas as pd
import numpy as np

In [36]:
# Load the tab-separated SMS spam collection, then preview its first five rows.
df = pd.read_csv(
    'smsspamcollection.tsv',
    sep='\t',
)
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [37]:
# Count the missing values in every column (a zero means the column is complete).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [38]:
# Class distribution of the target column: how many messages carry each label.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [39]:
from sklearn.model_selection import train_test_split

# Features are the raw message text; targets are the ham/spam labels.
X, y = df['message'], df['label']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [40]:
#We are going to use CountVectorizer, which includes text preprocessing tasks such as tokenizing and the ability to filter stop words.
#It builds a dictionary of features and transforms the documents into feature vectors [....]

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit on the training messages only: this builds the vocabulary and returns
# the per-message word counts in the same call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

# (number of training messages, vocabulary size)
X_train_counts.shape

(3733, 7082)

In [42]:
# Inspect the container type returned by the vectorizer's fit_transform call.
type(X_train_counts) # sparse matrix: one column per vocabulary word (7082), mostly zeros

scipy.sparse.csr.csr_matrix

In [43]:
# Number of training messages; it matches the row count of X_train_counts.
X_train.shape # training split only, not the full dataset

(3733,)

In [44]:
"""Transform the counts into frequencies using tfidf
Tfidf - Giving more important words >> more weights """

from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with tf-idf; the IDF statistics are learned from the
# training counts in the same call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

# The shape is unchanged from the count matrix; only the values differ.
X_train_tfidf.shape

(3733, 7082)

In [None]:
"""As you can see this have same shape.But its no longer just counts..
Instead, we've taken the term frequency and multiplied it by its inverse document frequency."""

In [46]:
#We can combine the count vectorization and the tf-idf transformation (the two steps above) into one step

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer does the counting and the tf-idf weighting in one step, so it
# is fitted on the raw training text (X_train), not on the count matrix.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)

# (number of training messages, vocabulary size)
X_train_tfidf.shape

(3733, 7082)

In [None]:
"""TRAIN THE CLASSIFIER

Here we'll introduce an SVM classifier that's similar to SVC,
called LinearSVC. LinearSVC handles sparse input better, and scales well to large numbers of samples.

"""

In [49]:
from sklearn.svm import LinearSVC

# Train a linear SVM on the tf-idf features of the training messages.
# The fit call stays as the final expression so the cell still displays the estimator.
clf = LinearSVC()
clf.fit(X=X_train_tfidf, y=y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [None]:
""" we only did the count vectorization and the tfidef transformation for thr training dataset only not for the testing dataset 
For that, the sklearn Pipeline class handles both the vectorization and the classification."""

In [51]:
from sklearn.pipeline import Pipeline

# Chain the vectorizer and the classifier so that raw text can be passed straight in.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

# Fit both steps on the raw training messages.
text_clf.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [52]:
#Test the classifier and display results

In [53]:
# Predict labels for the raw test messages; the pipeline vectorizes them itself.
predictions = text_clf.predict(X=X_test)

In [54]:
from sklearn import metrics

# Rows are the true labels and columns the predicted labels (scikit-learn's convention).
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1586    7]
 [  12  234]]


In [55]:
# Fraction of test messages classified correctly (about 99% in the run shown).
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.989668297988037


In [56]:
#Making prediction

In [57]:
# Classify one unseen, conversational message with the fitted pipeline.
text_clf.predict(X=[
    "Hello, my dear how are you?",
])

array(['ham'], dtype=object)

In [58]:
# Classify one unseen, promotional message with the fitted pipeline.
text_clf.predict(X=[
    "Congratulations! You won a car worth $100 , Get your free gift by clicking the link below www.carwon.com ",
])

array(['spam'], dtype=object)