# Step 1: Imports and data loading

In [1]:
%matplotlib inline
import re

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

stop_words = set(stopwords.words('english'))

In [2]:
# Load the labelled training set into the working frame used by the later cells.
tdf = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Basic sanity checks: frame dimensions, column names and one sample title.
print(f"Train data shape-- {tdf.shape}")
print(f"Train data columns-- {tdf.columns}")
sample_title = tdf.loc[3, 'TITLE']
print(f"Row of the text---- {sample_title}")
# Last expression of the cell: rich display of the first rows.
tdf.head()

Train data shape-- (20972, 9)
Train data columns-- Index(['ID', 'TITLE', 'ABSTRACT', 'Computer Science', 'Physics', 'Mathematics',
       'Statistics', 'Quantitative Biology', 'Quantitative Finance'],
      dtype='object')
Row of the text---- A finite element approximation for the stochastic Maxwell--Landau--Lifshitz--Gilbert system


Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


# Exploratory data analysis (EDA)

In [4]:
# 1. Noise check: list the rows that carry no label at all
#    (every one of the six category flags equals 0).
label_cols = ['Computer Science', 'Physics', 'Mathematics', 'Statistics',
              'Quantitative Biology', 'Quantitative Finance']
has_no_label = (tdf[label_cols] == 0).all(axis=1)
tdf[has_no_label]

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance


In [5]:

# Class balance per category: print a heading followed by the 0/1 counts.
# The headings are kept exactly as originally printed (two are lower-case).
heading_and_column = (
    ("computer science", 'Computer Science'),
    ("physics", 'Physics'),
    ("Mathematics", 'Mathematics'),
    ("Statistics", 'Statistics'),
    ("Quantitative Biology", 'Quantitative Biology'),
    ("Quantitative Finance", 'Quantitative Finance'),
)
for heading, column in heading_and_column:
    print(heading)
    print(tdf[column].value_counts())

computer science
0    12378
1     8594
Name: Computer Science, dtype: int64
physics
0    14959
1     6013
Name: Physics, dtype: int64
Mathematics
0    15354
1     5618
Name: Mathematics, dtype: int64
Statistics
0    15766
1     5206
Name: Statistics, dtype: int64
Quantitative Biology
0    20385
1      587
Name: Quantitative Biology, dtype: int64
Quantitative Finance
0    20723
1      249
Name: Quantitative Finance, dtype: int64


In [6]:
# 2. Concatenate the title and abstract into a single 'text' feature, then
# 3. drop the columns that are no longer needed for modelling.
# NOTE(review): the two strings are joined without a separator, so the last
# title word and the first abstract word only stay apart if ABSTRACT begins
# with whitespace -- confirm against the raw data.
combined_text = tdf['TITLE'] + tdf['ABSTRACT']
tdf = tdf.assign(text=combined_text).drop(columns=['ID', 'TITLE', 'ABSTRACT'])

In [7]:
# 4. Function for text cleaning / noise removal

# Ordered (pattern, replacement) rules for expanding English contractions.
# Order matters: the specific "'scuse" rule has to run before the generic
# "'s" rule, otherwise "'s" strips the apostrophe-s first and "'scuse" can
# never match (the text would end up as "cuse" instead of "excuse").
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Normalise a piece of raw text for vectorisation.

    Steps, in order: lower-case the text, expand common English
    contractions, replace every non-word character with a space, collapse
    runs of whitespace into a single space and strip surrounding spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned, lower-cased text with single-space separators.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Raw strings: '\W' and '\s' are invalid escapes in a normal string literal.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [8]:
# Clean every document, then show the first cleaned document as a spot check.
tdf['text'] = tdf['text'].map(clean_text)
tdf['text'].iloc[0]

'reconstructing subject specific effect maps predictive models allow subject specific inference when analyzing disease related alterations in neuroimaging data given a subject data inference can be made at two levels global i e identifiying condition presence for the subject and local i e detecting condition effect on each individual measurement extracted from the subject data while global inference is widely used local inference which can be used to form subject specific effect maps is rarely used because existing models often yield noisy detections composed of dispersed isolated islands in this article we propose a reconstruction method named rsm to improve subject specific detections of predictive modeling approaches and in particular binary classifiers rsm specifically aims to reduce noise due to sampling error associated with using a finite sample of examples to train classifiers the proposed method is a wrapper type algorithm that can be used with different binary classifiers in 

In [9]:
# Train one binary Linear SVC per subject category (on SMOTE-balanced data),
# report hold-out metrics, and predict the competition test set.
categories = ['Computer Science', 'Physics', 'Mathematics', 'Statistics','Quantitative Biology', 'Quantitative Finance']

# ****************************** Loop-invariant preparation ******************************
# The test file, its cleaned text, the train/hold-out split and the TF-IDF
# vocabulary do not depend on the category, so they are computed once here
# instead of once per loop iteration.
testdf = pd.read_csv('test.csv')
ids_test = testdf[['ID']]
test_text = (testdf['TITLE'] + testdf['ABSTRACT']).map(clean_text)

# shuffle=False keeps the rows in file order, so the split is the same for
# every category (random_state has no effect on an unshuffled split).
train_part, holdout_part = train_test_split(tdf, test_size=0.33, shuffle=False)

# converting text to numeric
# stop_words is passed as a list: recent scikit-learn validates this parameter
# and does not accept a set.
tfidf = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2), max_df=.8)
X_train = tfidf.fit_transform(train_part['text'])
X_test = tfidf.transform(holdout_part['text'])
X_submit = tfidf.transform(test_text)

# category name -> predicted 0/1 labels for the competition test set
test_predictions = {}

for category in categories:
    print(category)
    y_train = train_part[category].values
    y_test = holdout_part[category].values

#     ********************  over sampling with smote ****************************************
    print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
    print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))
    sm = SMOTE(random_state = 2)
    # fit_resample replaces the old fit_sample alias, which newer
    # imbalanced-learn releases no longer provide.
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
    print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
    print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
    print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

#     ******************************Model building -classification *****************************************
    # Linear SVC
    cs_clf = OneVsRestClassifier(LinearSVC(), n_jobs=1)
    cs_clf.fit(X_train_res, y_train_res)
    prediction = cs_clf.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))
    print(classification_report(y_test, prediction))

#     **********************************  Prediction with text data ****************************************
    test_predictions[category] = cs_clf.predict(X_submit)

# Attach all prediction columns in one step (ID first, then one column per
# category in the order of `categories`) instead of growing the frame with
# pd.concat inside the loop.
ids_test = ids_test.assign(**test_predictions)

Computer Science
Before OverSampling, counts of label '1': 5742
Before OverSampling, counts of label '0': 8309 

After OverSampling, the shape of train_X: (16618, 935125)
After OverSampling, the shape of train_y: (16618,) 

After OverSampling, counts of label '1': 8309
After OverSampling, counts of label '0': 8309
Test accuracy is 0.874873573183066
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      4069
           1       0.82      0.89      0.85      2852

   micro avg       0.87      0.87      0.87      6921
   macro avg       0.87      0.88      0.87      6921
weighted avg       0.88      0.87      0.88      6921

Physics
Before OverSampling, counts of label '1': 4031
Before OverSampling, counts of label '0': 10020 

After OverSampling, the shape of train_X: (20040, 935125)
After OverSampling, the shape of train_y: (20040,) 

After OverSampling, counts of label '1': 10020
After OverSampling, counts of label '0': 10020
Test accurac

In [10]:
# Give the prediction frame its final header: ID followed by the six categories.
submission_columns = [
    'ID',
    'Computer Science',
    'Physics',
    'Mathematics',
    'Statistics',
    'Quantitative Biology',
    'Quantitative Finance',
]
ids_test.columns = submission_columns
# Write the submission file without the pandas row index.
ids_test.to_csv(path_or_buf='Text_Classification_LinearSVC_ovr_smt_n2.csv', index=False)