In [76]:
import pandas as pd
import numpy as np
import json
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import StanfordNERTagger

In [77]:
data = dict()
content, label = [], []

In [78]:
with open('final_data.json', 'r') as f:
    data = json.load(f)
    
for each in data:
    content.append(each)
    label.append(data[each])

In [33]:
#text is a string ex) "This is a SE project". Returns string of only NN and NNP
def POS_remove(text):
    tokens = word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    st = ""
    for i in tagged:
        if i[1] == "NN" or i[1] == "NNP":
            st = st + " " + str(i[0])
    st = st.strip()    
    return st

'''def NER_remove(text):
    st = StanfordNERTagger('/home/saksham/Documents/Sem 7/SE/Project/stanford_ner/classifiers/english.all.3class.distsim.crf.ser.gz','/home/saksham/Documents/Sem 7/SE/Project/stanford_ner/stanford-ner.jar',encoding='utf-8')
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    st = ""
    for i in classified_text:
        if i[1] != "PERSON" or i[1] != "LOCATION" or i[1] != "ORGANIZATION":
            st = st + " " + str(i[0])
    st = st.strip()    
    return st
'''
    
#stop words removal, lower case, punctutations
def pre_process(text):
    text = POS_remove(text)  
    text=text.lower()
    text = re.sub(r'\d+', '', text)
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    no_punct = ""
    for char in text:
        if char not in punctuations:
            no_punct = no_punct + char
    
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(no_punct)
    text = [i for i in tokens if not i in stop_words]
    #lemmatizer=WordNetLemmatizer()
    st = ""
    for word in text:
        #st = st + " " +lemmatizer.lemmatize(word)
        st = st + " " +word
    text = st.strip()
    st =""
    for ch in text:
        if(ch.isalpha() or ch == ' '):
            st = st + ch
    return st


In [35]:
f = open("test.txt", "r")
test = f.read()

test = pre_process(test)

content.append(test)
label.append('CC')

In [36]:
content[3477]

'web developer web developer bangalore karnataka challenging role career part organization scope experience knowledge world web technology experience web developer cigma india may november system admin present cigma india  design develop manage wordpress update website content seo develop scratch php mysql server system management registration dashboards reportschartsgraphs assess client architecture manage programs events resume achievements personal strengths  pursued web development course aspire gate institute  attitude ability pressure familiar secure web designing client secure website minority welfare department govt karnataka url http  coordinated ngo csr  coordinated various development  leadership training programs declaration knowledge belief education bachelor engineering engineering city engineering college wordpress design national college holy mother english high school links http http googlecmab'

In [37]:
df = pd.DataFrame([content, label]).T
df.columns= ['content', 'label']
df.head()

Unnamed: 0,content,label
0,software engineer implementation software engi...,CC
1,web designer developer class hl web span desig...,WT
2,software engineer software engineer software e...,CN
3,cloud engineer consultant class hl cloud span ...,CC
4,linux system admin linux engineer linux system...,CN


In [38]:
#Preparing the dataframes

#Splitting the df into the different categories
#df_WT = df.loc[df['label'] == 'WT']  # WEB TECHNOLOGY
#df_ML = df.loc[df['label'] =='ML']  #MACHINE LEARNING
#df_CN = df.loc[df['label'] == 'CN']  #COMPUTER NETWORKS
#df_CC = df.loc[df['label'] =='CC']  #CLOUD COMPUTING
#df_CG = df.loc[df['label'] =='CG']  #COMPUTER GRAPHICS

'''
#Randomly sampling to create imbalanced classes
df_WT = df_WT.sample(n=1005, random_state=3)
df_ML = df_ML.sample(n=805, random_state=3)
'''

#Holding out 5 articles from each class for prediction at the end
#df_WT_holdout = df_WT.iloc[:5]
#df_ML_holdout = df_ML.iloc[:5]
#df_CN_holdout = df_CN.iloc[:5]
#df_CC_holdout = df_CC.iloc[:5]
#df_CG_holdout = df_CG.iloc[:5]

#df_WT = df_WT.iloc[5:]
#df_ML = df_ML.iloc[5:]
#df_CN = df_CN.iloc[5:]
#df_CC = df_CC.iloc[5:]
#df_CG = df_CG.iloc[5:]

#Appending the dfs back together
#df = pd.concat([df_WT, df_ML, df_CN, df_CC, df_CG])
#df_holdout = pd.concat([df_WT_holdout, df_ML_holdout, df_CN_holdout, df_CC_holdout, df_CG_holdout ])

#Turning the labels into numbers
LE = LabelEncoder()
df['label_num'] = LE.fit_transform(df['label'])

print(df['label'].unique())
print(df['label_num'].unique())

In [72]:
lab = df['label'].unique()
lab_num = df['label_num'].unique()

In [73]:
print(lab)
print(lab_num)

['CC' 'WT' 'CN' 'ML' 'CG']
[0 4 2 3 1]


In [40]:
#Creating the features (tf-idf weights) for the processed text


texts = df['content'].astype('str')



print(texts)

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                   min_df = 2, 
                                   max_df = .95)

X = tfidf_vectorizer.fit_transform(texts) #features

#print(X[5589][0])

y = df['label_num'].values #target

print (X.shape)
print(y.shape)

0       software engineer implementation software engi...
1       web designer developer class hl web span desig...
2       software engineer software engineer software e...
3       cloud engineer consultant class hl cloud span ...
4       linux system admin linux engineer linux system...
5       support gss global support gss global support ...
6       technology consultant technology consultant te...
7       hr executive senior hr executive pune maharash...
8       project support associate project support asso...
9       finance finance intern finance udipi karnataka...
10      span class hl graphics span designer ui develo...
11      senior project assistant senior project assist...
12      perception engineer senior perception engineer...
13      c final year mtech project c final year mtech ...
14      operator programmer operator programmer vmc hm...
15      designer class hl graphics span designer gurga...
16      store manager store manager graduation mba tim...
17      accout

In [43]:
#print(X[3477])

test = X[3477]

In [48]:
y = y[:-1] 

3477

In [50]:
#Dimenionality reduction. Only using the 100 best features er category

lsa = TruncatedSVD(n_components=100, 
                   n_iter=10, 
                   random_state=3)

X = lsa.fit_transform(X)
X.shape

(3478, 100)

In [52]:
test = X[-1]
X = X[:-1]

In [55]:
X.shape

(3477, 100)

In [63]:
#Preliminary model evaluation using default parameters

#Creating a dict of the models
#model_dict = {'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log')}

model_dict = {'Dummy' : DummyClassifier(random_state=3),
              'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log'),
              'Random Forest': RandomForestClassifier(random_state=3),
              'Decsision Tree': DecisionTreeClassifier(random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'Gaussian Naive Bayes': GaussianNB(),
              'K Nearest Neighbor': KNeighborsClassifier()}

#Train test split with stratified sampling for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = .3, 
                                                    shuffle = True, 
                                                    stratify = y, 
                                                    random_state = 3)

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in model_dict.items():   
        model_name.append(k)
        v.fit(X_train, y_train)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df

model_score_df(model_dict)



Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
1,Stochastic Gradient Descent,0.868726,0.879766,0.872786,0.872537
2,Random Forest,0.765444,0.77613,0.768873,0.771417
0,Gaussian Naive Bayes,0.741313,0.750546,0.744836,0.746727
4,K Nearest Neighbor,0.730695,0.751536,0.734088,0.741052
3,AdaBoost,0.73166,0.743789,0.739539,0.740442
6,Decsision Tree,0.706564,0.713001,0.711591,0.711788
5,Dummy,0.206564,0.201074,0.201324,0.201014


In [57]:
#Preliminary model evaluation using default parameters

#Creating a dict of the models
#model_dict = {'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log')}

model = SGDClassifier(random_state=3, loss='log')

#Train test split with stratified sampling for evaluation
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, shuffle = True, stratify = y, random_state = 3)

#Function to get the scores for each model in a df
model.fit(X, y)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=3, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [68]:
#test = test.reshape(1, -1)
y_pred = model.predict(test)
#test.shape

In [70]:
print(y_pred)

[4]


In [74]:
#['CC' 'WT' 'CN' 'ML' 'CG']
#[0 4 2 3 1]

mp = {0:"Cloud computing", 1:"Computer Graphics", 2:"Computer Networks", 3:"Machine Learning", 4:"Web Technology"}

output = mp[y_pred[0]]

In [75]:
output

'Web Technology'

In [30]:
import pickle 
  
# Save the trained model as a pickle string. 
saved_model = pickle.dumps(model) 
  
# Load the pickled model 
#knn_from_pickle = pickle.loads(saved_model) 
  
# Use the loaded pickled model to make predictions 
#knn_from_pickle.predict(X_test) 

In [43]:
print(type(model))

<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>


In [39]:
filename = 'finalized_model.txt'


In [37]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.868725868726


In [44]:
result = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,eta0=0.0, fit_intercept=True, l1_ratio=0.15,learning_rate='optimal', loss='log', max_iter=None, n_iter=None,n_jobs=1, penalty='l2', power_t=0.5, random_state=3, shuffle=True,tol=None, verbose=0, warm_start=False).score(X_test, y_test)

NotFittedError: This SGDClassifier instance is not fitted yet

In [42]:
result = result = loaded_model.score(X_test, y_test)
print(result).score(X_test, y_test)
print(result)

AttributeError: 'str' object has no attribute 'score'

In [64]:
X

array([[ 0.31076791,  0.13827592, -0.03877879, ...,  0.00144493,
        -0.01419858, -0.08643185],
       [ 0.29361776,  0.04444615, -0.02087508, ..., -0.0127362 ,
        -0.01455517, -0.02466081],
       [ 0.11628839,  0.0523266 ,  0.00205743, ...,  0.11528785,
         0.04087778,  0.08524027],
       ..., 
       [ 0.11100088, -0.00074102, -0.02486677, ...,  0.04963457,
         0.03647644, -0.00526828],
       [ 0.24851592,  0.23894643, -0.02627308, ...,  0.02956519,
         0.00769423, -0.0044162 ],
       [ 0.06897526,  0.06796924, -0.00611521, ...,  0.03972011,
         0.00509244, -0.01459564]])