In [1]:
import pandas as pd
import numpy as np
import time
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [61]:
#read the files form the folder
input_path = "C:/Users/ramna/DataScience/IR/Search_Engine-7071CEM/subject_Classification"
df_cat = pd.DataFrame(columns=['Details','File Name','Info'])

category_details =[]
files_details =[]
data_details = []

for details in os.listdir(input_path):
    print("\n",details)
    subfolder_path = os.path.join(input_path,details)
    for files in os.listdir(subfolder_path):
        file_path = os.path.join(subfolder_path,files)
        
        #print(file_path)
        category_details.append(details)
        files_details.append(files)
        file_ptr = open(file_path, encoding="utf8")
        data = file_ptr.read().split('\n')
        data=list(filter(None, data))
        #data = data.split(' ')
        data_details.append(data)
        
df_cat['Details'] = category_details
df_cat['File Name'] = files_details
df_cat['Info']  = data_details    


 business

 entertainment

 food

 graphics

 historical

 medical

 politics

 space

 sport

 technologie


In [62]:
df_cat.head()

Unnamed: 0,Details,File Name,Info
0,business,business_1.txt,"[Lufthansa flies back to profit, German airlin..."
1,business,business_10.txt,"[Winn-Dixie files for bankruptcy, US supermark..."
2,business,business_100.txt,"[US economy still growing says Fed, Most areas..."
3,business,business_11.txt,"[Saab to build Cadillacs in Sweden, General Mo..."
4,business,business_12.txt,"[Bank voted 8-1 for no rate change, The decisi..."


In [63]:
df_cat['Details'].value_counts()

politics         100
food             100
historical       100
entertainment    100
business         100
technologie      100
sport            100
space            100
graphics         100
medical          100
Name: Details, dtype: int64

In [64]:
df_cat.Info[0][:1000]

['Lufthansa flies back to profit',
 'German airline Lufthansa has returned to profit in 2004 after posting huge losses in 2003.',
 'In a preliminary report, the airline announced net profits of 400m euros ($527.61m; £274.73m), compared with a loss of 984m euros in 2003. Operating profits were at 380m euros, ten times more than in 2003. Lufthansa was hit in 2003 by tough competition and a dip in demand following the Iraq war and the killer SARS virus. It was also hit by troubles at its US catering business. Last year, Lufthansa showed signs of recovery even as some European and US airlines were teetering on the brink of bankruptcy. The board of Lufthansa has recommended paying a 2004 dividend of 0.30 euros per share. In 2003, shareholders did not get a dividend. The company said that it will give all the details of its 2004 results on 23 March.']

In [66]:
label_encode = LabelEncoder()
df_cat['Class'] = label_encode.fit_transform(df_cat['Details'])
df_cat.sample(5)

Unnamed: 0,Details,File Name,Info,Class
623,politics,politics_153.txt,"[Clarke to unveil immigration plan, New contro...",6
868,sport,sport_70.txt,"[Record fails to lift lacklustre meet, Yelena ...",8
78,business,business_8.txt,"[Weak dollar trims Cadbury profits, The world'...",0
588,medical,medical_644.txt,"[------------- cut here -----------------, Vol...",5
943,technologie,technologie_48.txt,[Startup Deploying AI Chatbots With “Conversat...,9


In [67]:
#convert data into an array
info_array = np.array(df_cat['Info'])

In [68]:
stop_words = stopwords.words('english')
ps = PorterStemmer()

In [69]:
#convert into tokens, remove stop words and stem the tokens
tokenizer = RegexpTokenizer('[A-Za-z]\w+')
for idx in range(len(info_array)):   
    info_array[idx] = tokenizer.tokenize(str(info_array[idx]))
    
info_array = [[ps.stem(token) for token in doc if token not in stop_words] for doc in info_array]


In [70]:
df_cat['Tokenized_Info']=info_array
df_cat.sample(10)
            

Unnamed: 0,Details,File Name,Info,Class,Tokenized_Info
441,historical,historical_46.txt,"[World War II Ends (1945), At the Potsdam Conf...",4,"[world, war, II, end, At, potsdam, confer, jul..."
857,sport,sport_60.txt,"[McIlroy wins 800m indoor title, James McIlroy...",8,"[mcilroy, win, indoor, titl, jame, mcilroy, mo..."
621,politics,politics_15.txt,"[Talks held on Gibraltar's future, Two days of...",6,"[talk, held, gibraltar, futur, two, day, talk,..."
803,sport,sport_11.txt,"[Radcliffe yet to answer GB call, Paula Radcli...",8,"[radcliff, yet, answer, GB, call, paula, radcl..."
349,graphics,graphics_53.txt,"[Hi:, I am digitizing a NTSC signal and displa...",3,"[Hi, digit, ntsc, signal, display, PC, video, ..."
318,graphics,graphics_25.txt,[I require BGI drivers for Super VGA Displays ...,3,"[requir, bgi, driver, super, vga, display, sup..."
12,business,business_2.txt,"[Japanese growth grinds to a halt, Growth in J...",0,"[japanes, growth, grind, halt, growth, japan, ..."
451,historical,historical_55.txt,"[Algerian nationalism, 1954 film about French ...",4,"[algerian, nation, film, french, algeria, both..."
864,sport,sport_67.txt,"[Lewis-Francis eyeing world gold, Mark Lewis-F...",8,"[lewi, franci, eye, world, gold, mark, lewi, f..."
300,graphics,graphics_1.txt,"[\t\t\tTuesday, June 22, 1993, \t Carderock...",3,"[ttuesday, june, carderock, divis, naval, surf..."


In [71]:
#replace , with space in token list
df_cat['Tokenized_Info_Details'] = [ ' '.join(map(str,tok)) for tok in df_cat['Tokenized_Info']]

df_cat.head(2)

Unnamed: 0,Details,File Name,Info,Class,Tokenized_Info,Tokenized_Info_Details
0,business,business_1.txt,"[Lufthansa flies back to profit, German airlin...",0,"[lufthansa, fli, back, profit, german, airlin,...",lufthansa fli back profit german airlin luftha...
1,business,business_10.txt,"[Winn-Dixie files for bankruptcy, US supermark...",0,"[winn, dixi, file, bankruptci, US, supermarket...",winn dixi file bankruptci US supermarket group...


In [72]:
x_train,x_test,y_train,y_test = train_test_split(df_cat['Tokenized_Info_Details'].values,df_cat['Class'].values)

In [73]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape


((750,), (250,), (750,), (250,))

In [74]:
x_train[0]

'imag edit tip graphic design publish graphic design what imag edit whi import well us may know answer nowaday peopl click pictur everywher profession camera also smartphon camera who want look beauti everyon want look gorgeou want imag click look beauti pictur straight camera may look okay even okay expens camera perfect To make imag perfect imag edit post process necessari graphic design photoshop expert profession knowledg imag edit softwar like adob photoshop cc lightroom illustr etc most photograph also learn photoshop edit imag slow product photograph So today articl editor design also photograph speed edit process basic edit imag edit tip for graphic design and photograph the imag edit tip go talk today speed edit process focu import critic thing Of cours graphic design photoshop expert import thing edit photograph click pictur import edit the photo edit tip go share combin basic edit eas workflow So let get start without delay'

In [75]:
x_test.shape

(250,)

In [76]:
# Form tf-idf vector
vectorizer = TfidfVectorizer()

In [77]:
test_input = ["This is sports column"]
test_input = np.array(test_input)
x_train_vector = vectorizer.fit_transform(x_train)
x_test_vector =vectorizer.transform(x_test)
test_vector = vectorizer.transform(test_input)
pickle.dump(x_train, open("NB_train_data.npy", 'wb'))

In [78]:
x_train_vector.shape, x_test_vector.shape,test_vector.shape

((750, 16599), (250, 16599), (1, 16599))

In [79]:
print(list(label_encode.classes_))

['business', 'entertainment', 'food', 'graphics', 'historical', 'medical', 'politics', 'space', 'sport', 'technologie']


In [80]:
# Use multiple classifiers and grid search for prediction
def sub_model(models, params, X_train, X_test, y_train, y_test):    
    
    if not set(models.keys()).issubset(set(params.keys())):
        raise ValueError('parameters missing')

    for key in models.keys():
    
        model = models[key]
        param = params[key]
        gs = GridSearchCV(model, param, cv=10, error_score=0, refit=True)
        gs.fit(X_train, y_train)
        y_pred = gs.predict(X_test)
        
        # Print scores for the classifier
        print(key, ':', gs.best_params_)
        print("Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='macro'), recall_score(y_test, y_pred, average='macro'), f1_score(y_test, y_pred, average='macro')))
    
    return gs

In [81]:
# Preparing to make a pipeline 
models = {
    'Naive Bayes': MultinomialNB(), 
}

In [82]:
#params
params = {
    'Naive Bayes': { 'alpha': [0.5, 1], 'fit_prior': [True, False] }, 
}


In [83]:
trained_model_NB = sub_model(models, params, x_train_vector, x_test_vector, y_train, y_test)
## ML_modeling method also prints performance scores for each classifier

Naive Bayes : {'alpha': 0.5, 'fit_prior': True}
Accuracy: 0.936 	Precision: 0.937 	Recall: 0.937 		F1: 0.932



In [84]:
trained_model_NB.get_params

<bound method BaseEstimator.get_params of GridSearchCV(cv=10, error_score=0,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.5, 1], 'fit_prior': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)>

In [85]:
# Test the subject classification model:
def vectorize(test_input):
    #stem and  stop words
    global vectorizer
    stop_words = stopwords.words('english')
    ps = PorterStemmer()
    
    #convert into tokens, remove stop words and stem the tokens
    tokenizer = RegexpTokenizer('[A-Za-z]\w+')
    test_input = tokenizer.tokenize(str(test_input))
    
    test_input = [ps.stem(token) for token in test_input if token not in stop_words]      
#     print(test_input)

    test_input =  [' '.join(map(str,test_input))]
    print(test_input)
    test_input = np.array(test_input)
#     print(test_input)
    test_vector = vectorizer.transform(test_input)
#     print(test_vector)    
    
    return test_vector


In [86]:
test_input = "virus spreading"
# test_input= test_input.replace('"', ' ')
print(type(test_input))
print(test_input)

# test_input = np.array(test_input)
# print(test_input)
# test_vector = vectorizer.transform(test_input)
# print(test_vector)

<class 'str'>
virus spreading


In [88]:
#predict the subject classifiction
test_vector= vectorize(test_input)
y_pred = trained_model_NB.predict(test_vector)
y_prob = trained_model_NB.predict_proba(test_vector)
y_pred,y_prob,y_prob[0][np.argmax(y_prob)]

['viru spread']


(array([9]),
 array([[0.09653965, 0.08104756, 0.09881052, 0.07939591, 0.09255726,
         0.0897472 , 0.09499243, 0.07741363, 0.0922975 , 0.19719834]]),
 0.19719834287549579)

In [89]:
# save the model to disk
filename = 'NB_Model.sav'
pickle.dump(trained_model_NB, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.predict(test_vector)
print(label_encode.inverse_transform(result))

['technologie']
