In [1]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [6]:
# Load the labelled training data; the relative path is resolved against the working directory.
df = pd.read_csv("train.csv")

In [7]:
# Build one free-text field per row as "<TITLE> <ABSTRACT>".
# Missing values are replaced by an empty string first: NaN is a float, so the
# previous row-wise ' '.join(...) raised TypeError on any row with a missing
# title or abstract. The column-wise concatenation also avoids a Python-level
# call per row. For rows where both fields are strings the result is unchanged.
df['Text'] = (
    df['TITLE'].fillna('').astype(str)
    + ' '
    + df['ABSTRACT'].fillna('').astype(str)
)

In [8]:
# The raw title/abstract columns are no longer needed once 'Text' exists;
# remove both from the frame in place.
df.drop(columns=['TITLE', 'ABSTRACT'], inplace=True)

In [9]:
def clean_text(text):
    """Normalise a string to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Delete apostrophes outright (the pattern "\'" is a plain apostrophe,
         so "don't" becomes "dont" rather than "don t").
      2. Replace every character that is not an ASCII letter with a space.
      3. Collapse runs of whitespace and trim both ends.
      4. Lowercase the result.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text; an empty string if no letters remain.
    """
    without_apostrophes = re.sub("\'", "", text)
    letters_only = re.sub("[^a-zA-Z]", " ", without_apostrophes)
    single_spaced = ' '.join(letters_only.split())
    return single_spaced.lower()

In [10]:
# Normalise every training document (letters only, single-spaced, lowercase).
df['Text'] = df['Text'].apply(clean_text)


In [11]:
# Make sure the NLTK 'stopwords' corpus is available locally before it is loaded below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ranit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
# English stopwords held in a set so each membership check is a hash lookup.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stop_words``.

    The text is split on whitespace, tokens found in the module-level
    ``stop_words`` set are dropped, and the remaining tokens are re-joined
    with single spaces in their original order. Matching is exact
    (case-sensitive).
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

In [13]:
# Strip English stopwords from every cleaned training document.
df['Text'] = df['Text'].apply(remove_stopwords)

In [14]:
# TF-IDF vectorizer with library-default settings; it is fitted further down on the training split only.
tfidf_vectorizer = TfidfVectorizer()

In [15]:
# Hold out 20% of the rows for validation. Features are the cleaned text;
# targets are the six subject-indicator columns. The fixed random_state keeps
# the split reproducible across runs.
xtrain, xval, ytrain, yval = train_test_split(
    df['Text'],
    df[[
        'Computer Science',
        'Physics',
        'Mathematics',
        'Statistics',
        'Quantitative Biology',
        'Quantitative Finance',
    ]],
    test_size=0.2,
    random_state=9,
)

In [16]:
# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer unchanged on the validation split (transform, no refit).
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [17]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [18]:
# Base binary classifier with default hyperparameters.
lr = LogisticRegression()
# One-vs-rest wrapper around the base estimator (the "Binary Relevance" setup named in the imports cell).
clf = OneVsRestClassifier(lr)

In [19]:
# Fit the one-vs-rest model on the TF-IDF training features and the training label columns.
clf.fit(xtrain_tfidf, ytrain)



OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [20]:
# Hard label predictions for the validation split.
# NOTE(review): y_pred is not referenced by any later visible cell; scoring
# uses the thresholded probabilities (y_pred_new) instead.
y_pred = clf.predict(xval_tfidf)


In [21]:
# Predicted probabilities for the validation split; thresholded in the next cell.
y_pred_prob = clf.predict_proba(xval_tfidf)

In [22]:
# Turn probabilities into 0/1 decisions with a single cut-off shared by all labels:
# an entry becomes 1 when its probability is >= t, otherwise 0.
t = 0.40 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)

In [23]:
# evaluate performance: micro-averaged F1 of the thresholded predictions against the validation labels
f1_score(yval, y_pred_new, average="micro")

0.8265081039608707

In [24]:
# Load the unlabelled test data; the relative path is resolved against the working directory.
dftest = pd.read_csv("test.csv")

In [25]:
# Build one free-text field per test row as "<TITLE> <ABSTRACT>".
# Missing values are replaced by an empty string first: NaN is a float, so the
# previous row-wise ' '.join(...) raised TypeError on any row with a missing
# title or abstract. The column-wise concatenation also avoids a Python-level
# call per row. For rows where both fields are strings the result is unchanged.
dftest['Text'] = (
    dftest['TITLE'].fillna('').astype(str)
    + ' '
    + dftest['ABSTRACT'].fillna('').astype(str)
)

In [26]:
# The raw title/abstract columns are no longer needed once 'Text' exists;
# remove both from the test frame in place.
dftest.drop(columns=['TITLE', 'ABSTRACT'], inplace=True)

In [27]:
# Run the test text through the same two steps as the training text:
# normalise with clean_text, then drop stopwords.
dftest['Text'] = dftest['Text'].apply(clean_text).apply(remove_stopwords)

In [28]:
# Vectorize the test text with the already-fitted vectorizer (transform only, no refit).
dftest_tfidf = tfidf_vectorizer.transform(dftest['Text'])


In [29]:
# Hard label predictions for the test set.
# NOTE(review): `predictions` is not referenced by any later visible cell; the
# submission is built from the thresholded probabilities instead.
predictions=clf.predict(dftest_tfidf)


In [30]:
# Predicted probabilities for the test set; thresholded in the next cell.
predictions_prob = clf.predict_proba(dftest_tfidf)

In [31]:
# Apply the same 0.4 cut-off that was used on the validation split.
# NOTE(review): this rebinds y_pred_new, which previously held the validation
# predictions; after this cell it holds the test-set predictions.
t = 0.4 # threshold value
y_pred_new = (predictions_prob >= t).astype(int)

In [None]:
# Wrap the 0/1 test predictions in a frame whose columns carry the label names,
# then write it to disk.
# NOTE(review): to_csv is called without index=False, so the first column of
# the file is the positional row index (0..n-1), not an identifier taken from
# test.csv. Confirm this matches the expected submission format.
submission = pd.DataFrame(
    y_pred_new,
    columns=[
        'Computer Science',
        'Physics',
        'Mathematics',
        'Statistics',
        'Quantitative Biology',
        'Quantitative Finance',
    ],
)
submission.to_csv('submission.csv')
