In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
#---------------------------------------Text Processing------------------------------------------------------------#
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
#------------------------------------Metrics and Validation---------------------------------------------------------#
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score
#-------------------------------------Models to be trained----------------------------------------------------------#
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans


In [3]:
names = []
base = 'C:/Users/nomesh.palakaluri.EMEA/OneDrive - Drilling Info/Desktop/Text model/Data/'
with os.scandir(base) as entries:
    for entry in entries:
        if(entry.is_file() == False):
            names.append(entry.name)
print(names)

['Crime', 'Entertainment', 'Politics', 'Science']


In [4]:
files = {}
unique = []
for name in names:
    path = base + name+'/'
    x = []
    with os.scandir(path) as entries:
        for entry in entries:
            if(entry.is_file()):
                x.append(entry.name)
    files[name] = x
    files[name].sort()

In [5]:
for i in range(len(names)):
    x = files[names[i]]
    for j in x:
        for k in range(i+1, len(names)):
            key = names[k]
            if j in files[key]:
                files[key].remove(j)

In [6]:
data = {}
i = 0

for genre in files.keys() :
    texts = files[genre]
    for text in texts:
        if text in files[genre]:
            path = base + genre + '/' + text
            with open(path, "r", encoding = "latin1") as file:
                data[i] = file.readlines()
                i = i+1
            data[i-1] = [" ".join(data[i-1]), genre] 

data = pd.DataFrame(data).T
print(data.shape)
data.columns = ['Text', 'Class']

(6734, 2)


In [7]:
unique = list(data.Text.unique())
len(unique)

6581

In [8]:
dic = dict(data)

In [9]:
#unique values in the documents
uni = {}
i = 0
for k in range(len(list(dic['Text']))):
    if dic['Text'][k] in unique:
        uni[i] = [dic['Text'][k], dic['Class'][k]]
        unique.remove(dic['Text'][k])
        i += 1

In [10]:
#classification into text and type of class
data = pd.DataFrame(uni).T
print(data.shape)
data.columns = ['Text', 'Class']

(6581, 2)


In [11]:
#cleaning text
import nltk.corpus
import regex
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer
from string import punctuation
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

stop = stopwords.words('english')

for punct in punctuation:
    stop.append(punct)

def filter_text(text, stop_words):
    word_tokens = WordPunctTokenizer().tokenize(text.lower())
    filtered_text = [regex.sub(u'\p{^Latin}', u'', w) for w in word_tokens if w.isalpha() and len(w) > 3]
    filtered_text = [wordnet_lemmatizer.lemmatize(w, pos="v") for w in filtered_text if not w in stop_words] 
    return " ".join(filtered_text)

[nltk_data] Downloading package stopwords to C:\Users\nomesh.palakalur
[nltk_data]     i.EMEA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
nltk.download('wordnet')
nltk.download('omw-1.4')
data["filtered_text"] = data.Text.apply(lambda x : filter_text(x, stop)) 
data.head()

[nltk_data] Downloading package wordnet to C:\Users\nomesh.palakaluri.
[nltk_data]     EMEA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\nomesh.palakaluri.
[nltk_data]     EMEA\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Text,Class,filtered_text
0,"\n Archive-name: ripem/faq\n Last-update: Sun,...",Crime,archive name ripem last update post still rath...
1,Approved: news-answers-request@MIT.EDU\n Conte...,Crime,approve news answer request content type text ...
2,Approved: news-answers-request@MIT.EDU\n Conte...,Crime,approve news answer request content type text ...
3,Message-ID: <1ppvai$l79@bilbo.suite.com>\n Rep...,Crime,message bilbo suite reply miller suite nntp po...
4,\n Some sick part of me really liked that p...,Crime,sick part really like phrase actually merely t...


In [13]:
#top 10 words to claasify the text into crime class
all_text = " ".join(data[data.Class == "Crime"].filtered_text) 
count = pd.DataFrame(all_text.split(), columns = ['words'])
top_10 = count[count['words'].isin(list(count.words.value_counts()[:10].index[:10]))]
print(top_10)

             words
17      encryption
19           write
53            know
79             use
96           write
...            ...
154707         use
154734       would
154762       would
154765       would
154767        know

[8874 rows x 1 columns]


In [14]:
#top 10 words to claasify the text into politics class
all_text = " ".join(data[data.Class == "Politics"].filtered_text)
count = pd.DataFrame(all_text.split(), columns = ['words'])
top_10 = count[count['words'].isin(list(count.words.value_counts()[:10].index[:10]))]
print(top_10)

         words
10       write
13       state
27       would
38         say
45       state
...        ...
537437   state
537444   state
537516  people
537544   would
537556   right

[29328 rows x 1 columns]


In [15]:
#top 10 words to claasify the text into science class
all_text = " ".join(data[data.Class == "Science"].filtered_text)
count = pd.DataFrame(all_text.split(), columns = ['words'])
top_10 = count[count['words'].isin(list(count.words.value_counts()[:10].index[:10]))]
print(top_10)

          words
1          post
13      article
43        would
52         know
54          use
...         ...
327941    write
327997     know
328006    space
328017    space
328021    space

[15315 rows x 1 columns]


In [16]:
tfidf = TfidfVectorizer(lowercase=False)
train_vec = tfidf.fit_transform(data['filtered_text'])
train_vec.shape

(6581, 43129)

In [17]:
data['classification'] = data['Class'].replace(['Crime','Politics','Science'],[0,1,2])

In [18]:
x_train, x_val, y_train, y_val = train_test_split(train_vec,data['classification'], stratify=data['classification'], test_size=0.2)

In [19]:
C = np.arange(0, 1, 0.001)
max_iter = range(100, 500)
warm_start = [True, False]
solver = ['lbfgs', 'newton-cg', 'liblinear']
penalty = ['l2', 'l1']

params = {
    'C' : C,
    'max_iter' : max_iter,
    'warm_start' : warm_start,
    'solver' : solver,
    'penalty' : penalty
}

random_search = RandomizedSearchCV(
    estimator = LogisticRegression(random_state = 1),
    param_distributions = params,
    n_iter = 100,
    cv = 3,
    n_jobs = -1,
    random_state = 1,
    verbose = 1
).fit(x_train, y_train)

random_search.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


117 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\nomesh.palakaluri.EMEA\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\nomesh.palakaluri.EMEA\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\nomesh.palakaluri.EMEA\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _ch

{'warm_start': False,
 'solver': 'lbfgs',
 'penalty': 'l2',
 'max_iter': 486,
 'C': 0.982}

In [21]:
model_lr = random_search.best_estimator_
model_lr.score(x_train, y_train)

0.9775835866261399

In [22]:
predicted = model_lr.predict(x_val)

lr_acc = accuracy_score(y_val,predicted)
lr_cop = cohen_kappa_score(y_val,predicted)
lr = pd.DataFrame([lr_acc, lr_cop], columns = ['Logistic Regression with RandomizedSearchCV'])

print("Test score: {:.2f}".format(lr_acc))
print("Cohen Kappa score: {:.2f}".format(lr_cop))

Test score: 0.95
Cohen Kappa score: 0.92


In [23]:
alpha = np.arange(0, 1, 0.001)
fit_prior = [True, False]

params = {
    'alpha' : alpha,
    'fit_prior' : fit_prior
}

random_search = RandomizedSearchCV(
    estimator = MultinomialNB(),
    param_distributions = params,
    n_iter = 100,
    cv = 3,
    n_jobs = -1,
    random_state = 1,
    verbose = 1
).fit(x_train, y_train)

random_search.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'fit_prior': True, 'alpha': 0.024}

In [24]:
model_mnb = random_search.best_estimator_
model_mnb.score(x_train, y_train)

0.9946808510638298

In [25]:
model_mnb.predict(x_val)

array([1, 1, 2, ..., 2, 1, 1], dtype=int64)