## Importing all the required model

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
#---------------------------------------Text Processing------------------------------------------------------------#
from sklearn.feature_extraction.text import TfidfVectorizer
from string import punctuation
#------------------------------------Metrics and Validation---------------------------------------------------------#
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score
#-------------------------------------Models to be trained----------------------------------------------------------#
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# Loading all the files and converting every .txt file to one column value

In [2]:
names = []
base = 'C:/Users/nomesh.palakaluri.EMEA/OneDrive - Drilling Info/Desktop/Text model/text_classification_email/Training_data/'
with os.scandir(base) as entries:
    for entry in entries:
        if(entry.is_file() == False):
            names.append(entry.name)
print(names)

['Crime', 'Entertainment', 'Politics', 'Science']


In [3]:
files = {}
unique = []
for name in names:
    path = base + name+'/'
    x = []
    with os.scandir(path) as entries:
        for entry in entries:
            if(entry.is_file()):
                x.append(entry.name)
    files[name] = x
    files[name].sort()

In [4]:
for i in range(len(names)):
    x = files[names[i]]
    for j in x:
        for k in range(i+1, len(names)):
            key = names[k]
            if j in files[key]:
                files[key].remove(j)

In [6]:
data = {}
i = 0

for genre in files.keys() :
    texts = files[genre]
    for text in texts:
        if text in files[genre]:
            path = base + genre + '/' + text
            with open(path, "r", encoding = "latin1") as file:
                data[i] = file.readlines()
                i = i+1
            data[i-1] = [" ".join(data[i-1]), genre] 

data = pd.DataFrame(data).T
print(data.shape)
data.columns = ['Text', 'Class']

(6734, 2)


In [7]:
unique = list(data.Text.unique())
len(unique)
dic = dict(data)

In [8]:
#unique values in the documents
uni = {}
i = 0
for k in range(len(list(dic['Text']))):
    if dic['Text'][k] in unique:
        uni[i] = [dic['Text'][k], dic['Class'][k]]
        unique.remove(dic['Text'][k])
        i += 1

# Changing the dataset and classifying the data to classnames

In [9]:
#classification into text and type of class
data = pd.DataFrame(uni).T
print(data.shape)
data.columns = ['Text', 'Class']

(6581, 2)


# Cleaning the data

In [10]:
#cleaning text
import nltk.corpus
import regex
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer
from string import punctuation
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

stop = stopwords.words('english')

for punct in punctuation:
    stop.append(punct)

def filter_text(text, stop_words):
    word_tokens = WordPunctTokenizer().tokenize(text.lower())
    filtered_text = [regex.sub(u'\p{^Latin}', u'', w) for w in word_tokens if w.isalpha() and len(w) > 3]
    filtered_text = [wordnet_lemmatizer.lemmatize(w, pos="v") for w in filtered_text if not w in stop_words] 
    return " ".join(filtered_text)

[nltk_data] Downloading package stopwords to C:\Users\nomesh.palakalur
[nltk_data]     i.EMEA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
#applying the filter method on origibal text
nltk.download('wordnet')
nltk.download('omw-1.4')
data["filtered_text"] = data.Text.apply(lambda x : filter_text(x, stop)) 

[nltk_data] Downloading package wordnet to C:\Users\nomesh.palakaluri.
[nltk_data]     EMEA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\nomesh.palakaluri.
[nltk_data]     EMEA\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# S aving the vectorizer in a separate file

In [32]:
import pickle
#vectorizing the data 
tfidf=TfidfVectorizer()
filename='tfidf.pick'
pickle.dump(tfidf, open(filename, 'wb'))

#train_vec = tfidf.fit_transform(data['filtered_text'])


# loading the vectorizer from the saved file

In [34]:
loaded_vec = pickle.load(open(filename, 'rb'))
train_vec = loaded_vec.fit_transform(data['filtered_text'])
print(train_vec)

  (0, 2176)	0.010470275703028841
  (0, 22982)	0.01202214230128693
  (0, 32697)	0.011832176202765212
  (0, 24510)	0.01719194997996771
  (0, 30202)	0.015817794021327238
  (0, 26902)	0.01194453878780885
  (0, 2758)	0.01981421845998446
  (0, 7414)	0.020147292375635215
  (0, 6275)	0.010661140738426644
  (0, 42262)	0.011301265790630042
  (0, 9555)	0.015239129658539886
  (0, 34954)	0.011024874268906236
  (0, 23487)	0.009761664697026284
  (0, 11121)	0.015465319009974733
  (0, 11993)	0.015561403101600771
  (0, 17590)	0.00994928362133883
  (0, 27191)	0.011164287255451507
  (0, 27395)	0.008993449465875736
  (0, 2004)	0.013847674163902018
  (0, 39533)	0.009943218075191454
  (0, 35537)	0.012688838577964096
  (0, 36479)	0.023386061783693467
  (0, 6807)	0.010411952148509002
  (0, 12316)	0.013186029660684427
  (0, 19263)	0.018566105938608184
  :	:
  (6580, 1237)	0.059367537143380084
  (6580, 16832)	0.039859353677237634
  (6580, 9565)	0.05723606283997494
  (6580, 3648)	0.04486931631270501
  (6580, 3556

In [14]:
#classifying the classes and assigning some values
data['classification'] = data['Class'].replace(['Crime','Politics','Science'],[0,1,2])

In [15]:
#splitting and train of the data
x_train, x_val, y_train, y_val = train_test_split(train_vec,data['classification'], stratify=data['classification'], test_size=0.2)
 

# model

In [None]:
#model
C = np.arange(0, 1, 0.001)
max_iter = range(100, 500)
warm_start = [True, False]
solver = ['lbfgs', 'newton-cg', 'liblinear']
penalty = ['l2', 'l1']

params = {
    'C' : C,
    'max_iter' : max_iter,
    'warm_start' : warm_start,
    'solver' : solver,
    'penalty' : penalty
}
#log='C:/Users/nomesh.palakaluri.EMEA/OneDrive - Drilling Info/Desktop/Text model/Text model/finalized_model.sav'

random_search = RandomizedSearchCV(
    estimator =LogisticRegression(random_state=1),
    param_distributions = params,
    n_iter = 100,
    cv = 3,
    n_jobs = -1,
    random_state = 1,
    verbose = 1
).fit(x_train, y_train)

random_search.best_params_

# Accuracy

In [17]:
#accuracy
model_lr = random_search.best_estimator_
model_lr.score(x_train, y_train)

0.9791033434650456

# testing data

In [18]:
#testing data
test_data=pd.read_csv(f'C:/Users/nomesh.palakaluri.EMEA/OneDrive - Drilling Info/Desktop/Text model/text_classification_email/Testing_data/SMS_test.csv',encoding='unicode_escape')
test_data["filtered_text"] = test_data['Message'].apply(lambda x : filter_text(x, stop)) 
test_data.head()

Unnamed: 0,S. No.,Message,Label,filtered_text
0,1,"UpgrdCentre Orange customer, you may now claim...",Spam,upgrdcentre orange customer claim free camera ...
1,2,"Loan for any purpose £500 - £75,000. Homeowner...",Spam,loan purpose homeowners tenant welcome previou...
2,3,Congrats! Nokia 3650 video camera phone is you...,Spam,congrats nokia video camera phone call call co...
3,4,URGENT! Your Mobile number has been awarded wi...,Spam,urgent mobile number award prize guarantee cal...
4,5,Someone has contacted our dating service and e...,Spam,someone contact date service enter phone fancy...


In [35]:
#vectorizing the data
test_vec=loaded_vec.transform(test_data['filtered_text']) 

In [36]:
#predicting the output
pred=model_lr.predict(test_vec)
print(pred)

[2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 1 2 2 2
 2 2 2 2 2 2 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2
 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1
 2 2 2 2 2 1 2 2 2 2 2 2 2 2]


# Predicting the output

In [37]:
#prediction
# model_mnb.predict(x_val)
from collections import Counter
import numpy as np
# sen1=["hi i am excited a good news with you",'wish you good luck today','fine get on with it']
# sen1=tfidf.transform(sen1[0].split())
# model_mnb.predict(sen1)
b = Counter(model_lr.predict(test_vec))
if(b.most_common()[0][0]==0):
    print('Crime')
elif(b.most_common()[0][0]==1):
    print('Politics')
else:
    print('Science')

Science


In [22]:
text_1 = """
With visa office closures and international travel restrictions still in place, experts say the economic benefits of President Donald Trump’s immigration suspension last week are uncertain, at best, while thousands of prospective immigrants still stand to suffer the consequences.

Although the suspension was filled with broad exemptions and does not include current visa holders or those already in the U.S., the Migration Policy Institute estimates 26,000 would-be green card applicants would be blocked each month.

Over the weekend, a coalition of advocacy groups filed an emergency request to halt Trump’s directive. It was the latest move in a 2019 case, stemming from when the White House sought to ban immigrants who couldn’t prove their ability to acquire.
Attorneys with the American Immigration Lawyers Association requested a hold on the recent suspension order while the courts continue to decide the fate of the health insurance restrictions.

The White House framed the immigration suspension as a vital component of helping the economy and American workers recover, but it could have the opposite effect.

Immigrants already in the U.S. and those who already have certain visas can still obtain permanent residency. The suspension applies only to green card applicants from outside the country and primarily targets to those looking to settle down in the U.S. permanently through a family connection, according to MPI estimates.

That leaves the possibility of employment-based visa slots opening up as a result of the suspension.

"President Trump claims he signed it in order to protect the economy in the wake of the coronavirus outbreak, but the reality is that our economic recovery will depend on immigrants," said Esther Sung, a senior attorney involved in the 2019 case.

That view is shared, in part, by Rutgers economics professor Jennifer Hunt. She told ABC News that an influx of workers into critical sectors of the economy could boost per capita GDP.

"It would be kind of a shot in the arm," Hunt said.

On-going travel restrictions and the closure of foreign visa offices still pose the greatest barrier to legal immigration in the age of coronavirus. Economists and labor experts, including Hunt, say those factors would override any attempt to assess the economic impacts of the restrictions.

"It’s very hard for me to see the direct correlation in terms of immediate and practical positive impact on displaced U.S. workers," said Caroline Tang, an Austin-based attorney who advises companies on work authorization for immigrants.

Tang said the bar for employers to obtain work authorization for visa holders is already high enough, referring to specific requirements for prioritizing U.S. citizens.

Researchers have identified some connection between more immigrants settled in the country and economic growth under normal circumstances, but the results are mixed.

For example, a 2017 report by the economics firm Moody Analytics and ProPublica found that for every 1% increase in the U.S. population, the gross domestic product rises by 1.5%. While a 2018 study from Stanford University’s Hoover Institution confirms that "positive relationship" generally, but found variations and some negative impacts to U.S.-born employment rates in its state-by-state review.

Researchers studying the large population of foreign-born workers in South Africa said it likely had a positive impact on per capita GDP.

"Foreign-born workers also generated additional employment for native-born workers," according to the 2018 Organisation for Economic Co-operation and Development report.

"""


In [38]:
#vectorizing the test data
test_vec1=loaded_vec.transform([text_1]) 

In [40]:
#prediction
# model_mnb.predict(x_val)
from collections import Counter
import numpy as np
# sen1=["hi i am excited a good news with you",'wish you good luck today','fine get on with it']
# sen1=tfidf.transform(sen1[0].split())
# model_mnb.predict(sen1)
b = Counter(model_lr.predict(test_vec1))
if(b.most_common()[0][0]==0):
    print('Crime')
elif(b.most_common()[0][0]==1):
    print('Politics')
else:
    print('Science')

Politics
