<a href="https://colab.research.google.com/github/Olayile/Research_paper_classifier/blob/master/Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
!pip install nltk

import pandas as pd
import sklearn
import numpy as np
import nltk
import re

nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn import tree
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2

from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest


from sklearn.metrics import precision_recall_fscore_support

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
from google.colab import files
uploaded = files.upload()

Saving PoPCites.csv to PoPCites.csv


In [12]:
import io
df2 = pd.read_csv(io.BytesIO(uploaded['PoPCites.csv']))

In [67]:
# load data to only get the title and the journal name

def Data(data):
  return data[["Title" ,"Source"]]

def preProcessing(titles):
    stops = set(stopwords.words("english"))
    words = titles.lower().split()
    words = [w.lower() for w in words if not w in stops]
    return words
    
def preProcessing2(features):
    num_titles = features.size
    clean_wordlist = []
    clean_titles = []
    stops = set(stopwords.words('english'))
    for i in range( 0, num_titles):
        #letters_only = re.sub("[^a-zA-Z]", " ", features[i]) 
        words = features[i].lower().split()
        words = [w.lower() for w in words if not w in stops]  
        clean_wordlist.append(words)
        clean_titles.append(" ".join(words))
    return clean_titles, clean_wordlist


In [56]:
Data_new= df2[["Title" ,"Source"]]
Data_new.dropna(inplace=True)
Data_new['Title'].apply(preProcessing)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


1      [chapter, 1., introduction, sulfur, chemical, ...
2      [part, ii:, chemical, biology, organosulfur, m...
3       [part, i:, inorganic, sulfur, chemical, biology]
4      [created, green, chemical, engineering, (green...
5           [chapter, 10., n–p, bond, chemical, biology]
                             ...                        
195              [physical, methods, chemical, analysis]
196                        [physics, chemical, industry]
197             [chemical, checkers, chemical, dominoes]
198    ["chemical, age", chemical, dictionary:, chemi...
199    [“chemical, age”, chemical, dictionary:, chemi...
Name: Title, Length: 191, dtype: object

In [57]:
Data_new

Unnamed: 0,Title,Source
1,Chapter 1. Introduction to Sulfur Chemical Bio...,The Chemical Biology of Sulfur
2,Part II: Chemical Biology of Organosulfur Meta...,The Chemical Biology of Sulfur
3,Part I: Inorganic Sulfur Chemical Biology,The Chemical Biology of Sulfur
4,Why we created Green Chemical Engineering (Gre...,Green Chemical Engineering
5,Chapter 10. N–P Bond Chemical Biology,The Chemical Biology of Phosphorus
...,...,...
195,Physical Methods in Chemical Analysis,Chemical Engineering Science
196,Physics in Chemical Industry,Chemical Engineering Science
197,Chemical Checkers and Chemical Dominoes,Chemical & Engineering News
198,"The ""Chemical Age"" Chemical Dictionary: Chemic...",Journal of Chemical Education


In [58]:
# Convert our text to numeric form using erm frequency-inverse document frequency or TF-IDF.

def getDTMByTFIDF(titles,nfeatures):
    tfIdf_vectorizer = TfidfVectorizer(max_features=nfeatures)
    dtm = tfIdf_vectorizer.fit_transform(titles).toarray()
    return dtm, tfIdf_vectorizer

In [59]:
def featuresByChiSq(features,labels,nFeature=5000):
    chi2_model = SelectKBest(chi2,k=nFeature)
    dtm = chi2_model.fit_transform(features,labels)
    return dtm,chi2_model

def featuresByInformationGain(features,labels):
    treeCL = tree.DecisionTreeClassifier(criterion="entropy")
    treeCL = treeCL.fit(features,labels)
    transformed_features = SelectFromModel(treeCL,prefit=True).transform(features)
    return transformed_features

def featuresByLSA(features,ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer =  Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa

In [63]:
def makeFeatureVec(words, model, num_features):
    feature_vec = np.zeros((num_features,),dtype="float32")
    nwords = 0.
    index2word_set = set(model.index2word)
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model[word]) 

    feature_vec = np.divide(feature_vec,nwords)
   
    return feature_vec

def getAvgFeatureVecs(title, model, num_features):
    counter = 0.
    titleFeatureVecs = np.zeros((len(title), num_features),dtype="float32")
    for t in title:
        titleFeatureVecs[counter] = makeFeatureVec(t, model,num_features)
        counter = counter + 1.
    return titleFeatureVecs

In [None]:

def crossValidate(document_term_matrix,labels,classifier="SVM",nfold=2):
    clf = None
    precision = []
    recall = []
    fscore = []
    
    if classifier == "RF":
        clf = RandomForestClassifier()
    elif classifier == "NB":
        clf = MultinomialNB()
    elif classifier == "SVM":
        clf = LinearSVC()
    
    skf = StratifiedKFold(labels, n_folds=nfold)

    for train_index, test_index in skf:
        X_train, X_test = document_term_matrix[train_index], document_term_matrix[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        model = clf.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        p,r,f,s = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        precision.append(p)
        recall.append(r)
        fscore.append(f)
        
    return np.mean(precision),np.mean(recall),np.mean(fscore)

In [68]:
titles = Data_new['Title']
labels = Data_new['Source']

dtm,vect = getDTMByTFIDF(titles,None)

chisqDtm, chisqModel = featuresByChiSq(dtm,labels,200)

processed_titles, processed_titles_wordlist = preProcessing2(titles)

KeyError: ignored

In [62]:
# We will use three classification algorithms to categorize research papers. 1) Support vector machine with linear kernel 2) Random forest and 3) Multinomial Naïve Bayes.
num_features = 300    # Word vector dimensionality                      
min_word_count = 1    # Minimum word count                        
num_workers = 1       # Number of threads to run in parallel
context = 8           # Context window size                                                                                    
downsampling = 1e-5   # Downsample setting for frequent words

word2vec_model = Word2Vec(processed_titles_wordlist, workers=num_workers, 
            size=num_features, min_count = min_word_count, 
            window = context, sample = downsampling)
word2vec_model.init_sims(replace=True)


NameError: ignored