## A Two-stage Feature Selection method for Text Categorization

In [1]:
# Mount Google Drive inside the Colab runtime; the data file read below lives under
# this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
# Load the message corpus (CSV) from the mounted Drive folder into a DataFrame.
# NOTE(review): the path is hard-coded to one Drive layout; adjust it before running elsewhere.
Corpus = pd.read_csv('/content/drive/MyDrive/ML_C_Proj/messages.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
Corpus.head()

Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, f1_score


# Import seaborn and apply its default plot styling.
import seaborn as sns; 
sns.set()

#Import matplot library
import matplotlib.pyplot as plt

In [5]:
# Download the NLTK resources this notebook relies on (each call is a network/disk operation).
nltk.download('punkt')                        # tokenizer models
nltk.download('wordnet')                      # WordNet corpus
nltk.download('averaged_perceptron_tagger')   # part-of-speech tagger
nltk.download('stopwords')                    # stop-word lists
nltk.download('words')                        # word list used later as a vocabulary filter

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

# Step 1. Remove stop words, punctuation, and non-alphabetic tokens.

In [6]:
# Step - a : Remove rows whose message is missing.
# Calling dropna(inplace=True) on the selected column does not remove anything from
# Corpus itself, so the rows are dropped on the frame. The index is then reset because
# later cells address rows by position (0..n-1).
# Corpus = Corpus[:1000]
Corpus = Corpus.dropna(subset=['message']).reset_index(drop=True)
# Step - b : Change all the text to lower case, so that 'dog' and 'DOG' are treated as
# the same word. astype(str) keeps a non-string cell from turning into NaN here.
Corpus['message'] = Corpus['message'].astype(str).str.lower()
# Step - c : Tokenization : each message is broken into a list of word/punctuation tokens.
Corpus['message'] = Corpus['message'].apply(word_tokenize)

In [7]:
# Show the token list produced for the message stored under index label 1.
print(Corpus['message'][1])

['lang', 'classification', 'grimes', ',', 'joseph', 'e', '.', 'and', 'barbara', 'f', '.', 'grimes', ';', 'ethnologue', 'language', 'family', 'index', ';', 'pb', '.', 'isbn', ':', '0-88312', '-', '708', '-', '3', ';', 'vi', ',', '116', 'pp', '.', ';', '$', '14', '.', '00', '.', 'summer', 'institute', 'of', 'linguistics', '.', 'this', 'companion', 'volume', 'to', 'ethnologue', ':', 'languages', 'of', 'the', 'world', ',', 'twelfth', 'edition', 'lists', 'language', 'families', 'of', 'the', 'world', 'with', 'sub-groups', 'shown', 'in', 'a', 'tree', 'arrangement', 'under', 'the', 'broadest', 'classification', 'of', 'language', 'family', '.', 'the', 'language', 'family', 'index', 'facilitates', 'locating', 'language', 'names', 'in', 'the', 'ethnologue', ',', 'making', 'the', 'data', 'there', 'more', 'accessible', '.', 'internet', ':', 'academic', '.', 'books', '@', 'sil', '.', 'org', 'languages', ',', 'reference', 'lang', '&', 'culture', 'gregerson', ',', 'marilyn', ';', 'ritual', ',', 'belie

In [None]:
# Step - d : Remove stop words and non-alphabetic tokens, then lemmatize what is left.
# WordNetLemmatizer needs a part-of-speech hint to choose the right lemma. The first
# letter of each POS tag is mapped to a WordNet POS; anything unmapped defaults to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant helpers are built once. Asking for stopwords.words('english') per token
# rebuilds the list every time and scans it linearly; a set gives constant-time lookups.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

processed_messages = []
for entry in Corpus['message']:
    # pos_tag yields (word, tag) pairs; keep alphabetic, non-stop-word tokens only.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in stop_words
    ]
    # Stored as the string form of the list, which is what the following cells consume.
    processed_messages.append(str(Final_words))

# Assign the whole column at once. This is positional, so it does not depend on the
# DataFrame index being 0..n-1 the way a per-row .loc[index, ...] write does.
Corpus['text_final0'] = processed_messages
    

In [None]:
# Display the lemmatized token lists (each stored as the string form of a Python list).
print(Corpus['text_final0'])

In [None]:
# Vocabulary taken from the NLTK `words` corpus, held in a set for fast membership tests.
words = set(nltk.corpus.words.words())


def clean_sent(sent):
    """Drop alphabetic tokens that are not in the `words` vocabulary.

    The text is split with nltk.wordpunct_tokenize. A token is kept when it is
    not purely alphabetic, or when its lower-cased form is in `words`.
    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in nltk.wordpunct_tokenize(sent):
        if not token.isalpha() or token.lower() in words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)


Corpus['text_final0'] = Corpus['text_final0'].apply(clean_sent)

print(Corpus['text_final0'])

# Splitting the data into training and testing sets


In [None]:
# Hold out 30% of the cleaned messages (and their labels) for testing; the fixed seed
# makes the partition repeatable.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final0'],
    Corpus['label'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Pie chart of the class balance in the held-out labels.
df = pd.DataFrame(Test_Y)

# Count per class, ordered by class value. value_counts() alone orders by frequency,
# which would make a fixed legend wrong whenever class 1 outnumbers class 0.
class_counts = df.iloc[:, 0].value_counts().sort_index()
class_names = {0: 'NO SPAM', 1: 'SPAM'}
lb = class_counts.index.tolist()
val = class_counts.values.tolist()

# Pull only the first wedge out slightly; sized to however many classes are present.
exp = [0.025 if position == 0 else 0 for position in range(len(val))]
clr = ('orange', 'blue')[:len(val)]

plt.figure(figsize=(10,5),dpi=140)
plt.pie(x=val,explode=exp,labels=lb,colors=clr,autopct='%2.0f%%',pctdistance=0.5, shadow=True,radius=0.9)
# Legend text is derived from the classes actually plotted, in wedge order.
plt.legend(["%s = %s" % (label, class_names.get(label, label)) for label in lb])
plt.show()

In [None]:
# Dimensions of the working DataFrame (rows, columns).
print(Corpus.shape)

# Cleaned text of the messages that were assigned to the test split.
print(Test_X)

# Step 2. Calculate the normalized TFIDF in the corresponding element of the weight matrix.
Variables: max_features

In [None]:
# Tfidf_vect = TfidfVectorizer(max_features=15000) #Tfidfvectorizer makes
# X = Tfidf_vect.fit_transform(Corpus['text_final0'])
# Train_X_Tfidf = Tfidf_vect.transform(Train_X)   
# Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [None]:
# Upper bound on the vocabulary size kept by the vectorizer.
MAX_FEATURES = 15000

# Build the TF-IDF weight matrix for every cleaned message in the corpus.
# NOTE(review): the vectorizer is fitted on all rows, i.e. before any train/test
# separation - confirm this is intended.
Tfidf_vect = TfidfVectorizer(max_features=MAX_FEATURES)
X = Tfidf_vect.fit_transform(Corpus['text_final0'])

In [None]:
# Shape of the TF-IDF matrix returned by fit_transform.
print(X.shape)
# print(Train_X.shape)

# Step 3. Select the features according to the FCD method and get a new vector space model.

In [None]:
print(type(X))
import scipy
import scipy.sparse  # import the submodule explicitly rather than relying on it being loaded already

# COO form exposes parallel row/col/data arrays, one position per stored
# (document, term) weight.
cx = scipy.sparse.coo_matrix(X)

# cx.row holds positional row numbers, so the labels are read positionally as well.
# A label-based Corpus['label'][i] lookup only agrees with that when the index is
# exactly 0..n-1, and it is very slow when repeated for every stored entry.
# NOTE(review): labels of all rows are used here, including rows that later end up in
# the test split - confirm this is acceptable for the evaluation.
labels = Corpus['label'].to_numpy()
entry_in_class0 = labels[cx.row] == 0
n_terms = cx.shape[1]

# For each term (column) count the stored entries coming from label-0 rows and from
# all other rows.
counts0 = np.bincount(cx.col[entry_in_class0], minlength=n_terms)
counts1 = np.bincount(cx.col[~entry_in_class0], minlength=n_terms)

# a0 / a1 map column index -> count, containing only the columns that occur at least
# once in that class (the same contents the incremental dict counting produced).
a0 = {j: int(c) for j, c in enumerate(counts0) if c}
a1 = {j: int(c) for j, c in enumerate(counts1) if c}

# print(max_term)
# print(min_term)

In [None]:
# Score each term by how unevenly it is spread over the two classes:
#   fc[j] = |a0_j - a1_j| / (a0_j + a1_j)
# The score is 1.0 when the term occurs in one class only and 0.0 when it occurs
# equally often in both.
(l,m) = X.shape
fc = np.zeros(m)

for j in range(m):
  a0_j = a0.get(j, 0)
  a1_j = a1.get(j, 0)
  total = a0_j + a1_j
  # A column with no counted occurrence keeps the score 0.0 instead of dividing by zero.
  if total:
    fc[j] = abs(a0_j - a1_j) / total

# Show a small sample rather than one line per feature.
print(fc[:10])

In [None]:
# Number of features whose score is exactly 1.0 among the first m entries of fc.
count = int(np.count_nonzero(fc[:m] == 1.0))

print(count)

In [None]:
import numpy as np
from scipy.sparse import csr_matrix
import multiprocessing

# Walk over the stored entries of X and keep only those whose column has a score of
# exactly 1.0. Kept columns are renumbered 0, 1, 2, ... in order of first appearance.
ct = scipy.sparse.coo_matrix(X)
n_row, n_col, n_data = [], [], []

# r_m: original column index -> new (compacted) column index
r_m = {}

for row_idx, col_idx, value in zip(ct.row, ct.col, ct.data):
  if fc[col_idx] != 1.0:
    continue
  # setdefault hands out the next free index the first time a column is seen
  new_col = r_m.setdefault(col_idx, len(r_m))
  n_row.append(row_idx)
  n_col.append(new_col)
  n_data.append(value)

# j is the number of distinct columns kept (the width of the reduced matrix).
j = len(r_m)


In [None]:
# Sanity checks on the renumbered columns: the smallest new column index in use,
# and j, the count of distinct columns that were kept.
print(min((n_col)))
print(j)

In [None]:
# Assemble the reduced document-term matrix from the kept (row, column, value) triplets.
# The row count is taken from the source matrix itself. max(ct.row)+1 would come up
# short whenever the last document(s) have no stored term at all, leaving X1 with fewer
# rows than there are labels.
X1 = csr_matrix((n_data, (n_row, n_col)), shape=(ct.shape[0], j)).toarray()

print(X1.shape)

In [None]:
# Re-split using the reduced feature matrix. The test fraction and seed are unchanged,
# and the previous Train_*/Test_* variables are overwritten.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    X1,
    Corpus['label'],
    test_size=0.3,
    random_state=42,
)

# Step 4. Construct the new semantic space model by means of LSI. 
Variables: n_components

In [None]:
import sklearn
from sklearn.decomposition import TruncatedSVD

# Latent semantic indexing: project the selected features onto 20 components.
# The decomposition is learned from the training rows only, so nothing from the test
# rows shapes the projection. The seed is fixed (same value as the train/test split)
# so that repeated runs give the same components.
SVD = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=5, random_state=42, tol=0.0)
SVD.fit(Train_X)
Train_X_SVD = SVD.transform(Train_X)
Test_X_SVD = SVD.transform(Test_X)

In [None]:
# from sklearn.utils.extmath import randomized_svd

# U, Sigma, VT = randomized_svd(Train_X_Tfidf, 
#                               n_components=15,
#                               n_iter=5,
#                               random_state=None)
# Train_X_SVD = U

In [None]:
# Shapes of the projected training and test matrices.
print(Train_X_SVD.shape)
print(Test_X_SVD.shape)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# vectorizer = TfidfVectorizer(max_df=0.5, stop_words='english', use_idf=True)
# lsa = TruncatedSVD(n_components=20)
# Tfidf_vect.fit(Corpus['text_final0'])

# train_text = vectorizer.fit_transform(Train_X)
# test_text = vectorizer.fit_transform(Test_X)

# train_text = lsa.fit_transform(train_text)
# test_text = lsa.fit_transform(test_text)


# clf = RandomForestClassifier(max_depth=20, random_state=0)
# clf.fit(train_text, Train_Y)
# clf.score(test_text,Test_Y)

# Step 5. Use the SVM classifier on the semantic space model.

# 5.1 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the SVD-projected features. The seed is fixed (same value
# as the other random-forest cell) so the reported score is repeatable between runs.
clf = RandomForestClassifier(random_state=0)
clf.fit(Train_X_SVD, Train_Y)
# Mean accuracy on the held-out rows (displayed as the cell result).
clf.score(Test_X_SVD,Test_Y)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited, seeded random forest trained directly on the selected features
# (no SVD projection). fit() returns the estimator, so it can be chained.
clf = RandomForestClassifier(max_depth=20, random_state=0).fit(Train_X, Train_Y)
# Mean accuracy on the held-out rows (displayed as the cell result).
clf.score(Test_X, Test_Y)

# 5.2 SVM

In [None]:
from sklearn import svm

# Support vector classifier with default settings, trained on the SVD-projected features.
clf = svm.SVC().fit(Train_X_SVD, Train_Y)
# Mean accuracy on the held-out rows (displayed as the cell result).
clf.score(Test_X_SVD, Test_Y)

In [None]:
from sklearn import svm

# Support vector classifier with default settings, trained directly on the selected
# features (no SVD projection).
clf = svm.SVC().fit(Train_X, Train_Y)
# Mean accuracy on the held-out rows (displayed as the cell result).
clf.score(Test_X, Test_Y)

# Step 6. Obtain the categorization performance over the data set.