In [None]:
import re

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

import matplotlib.pyplot as plt

import spacy
from nltk.stem import PorterStemmer

# Load spaCy's English pipeline. In the cells shown here it is only read for its
# stop-word list (nlp.Defaults.stop_words) inside clean_abstract.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Colab-only step: mount Google Drive so the CSV files under /content/drive/MyDrive
# can be read by the loading cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the labelled training split, the labelled validation split and the
# unlabelled test split from the mounted Drive folder.
df_train, df_valid, test = (
    pd.read_csv("/content/drive/MyDrive/locus.csv"),
    pd.read_csv("/content/drive/MyDrive/validation.csv"),
    pd.read_csv("/content/drive/MyDrive/test.csv"),
)

In [None]:
# Report the size of each labelled split, then stack them into one training frame.
print(df_train.shape)
print(df_valid.shape)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. ignore_index=True gives the combined frame a fresh 0..n-1
# index instead of carrying duplicate row labels over from the two inputs.
train = pd.concat([df_train, df_valid], ignore_index=True)
train.shape

(390603, 4)
(48824, 4)


(439427, 4)

In [None]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,id,abstract,category,category_num
0,271675,Bacteria are often exposed to multiple stimu...,q-bio-QM,138
1,412276,Accurate knowledge of the thermodynamic prop...,hep-ph-,68
2,256956,The largest X9.3 solar flare in solar cycle ...,astro-ph-SR,7
3,427612,We say that a random integer variable $X$ is...,math-PR,93
4,113852,We derive a formula expressing the joint dis...,math-CO,76


In [None]:
# Single Porter stemmer instance, used by clean_abstract to stem every token.
ps = PorterStemmer()

In [None]:
def clean_abstract(text, stop_words=None, stemmer=None):
  """Normalise an abstract into a space-separated string of stemmed tokens.

  Steps: lower-case the text, replace every character other than a-z and
  whitespace with a space, split on whitespace, drop stop words, stem each
  remaining token and re-join the tokens with single spaces.

  Args:
    text: Raw abstract. Non-string values (e.g. NaN for a missing abstract)
      are treated as empty.
    stop_words: Optional collection of lower-case words to drop. Defaults to
      the notebook-level spaCy list ``nlp.Defaults.stop_words``.
    stemmer: Optional object exposing ``stem(word)``. Defaults to the
      notebook-level ``ps``.

  Returns:
    The cleaned abstract as one string ("" when nothing survives).
  """
  if not isinstance(text, str):
    return ''
  if stop_words is None:
    stop_words = nlp.Defaults.stop_words
  if stemmer is None:
    stemmer = ps
  # Raw string: '\s' inside a plain literal is an invalid escape sequence.
  text = re.sub(r'[^a-z\s]', ' ', text.lower())
  # split() with no argument already swallows newlines and repeated blanks, so the
  # original trailing text.replace('\n', ' ') (whose result was discarded) is dropped.
  tokens = [stemmer.stem(tok) for tok in text.split() if tok not in stop_words]
  return ' '.join(tokens)

In [None]:
# Replace each raw training abstract with its cleaned, stemmed form (element-wise).
train['abstract'] = train['abstract'].map(clean_abstract)

In [None]:
# Give the test abstracts exactly the same cleaning as the training abstracts.
test['abstract'] = test['abstract'].map(clean_abstract)

In [None]:
# Model inputs are the cleaned abstracts; the target is the category_num column.
X_train = train['abstract']
y_train = train['category_num']
# The test split has no label column, only text to predict on.
test_data = test['abstract']

In [None]:
# TF-IDF over unigrams and bigrams, capped at 20,000 features.
# The abstracts reaching this vectorizer have already had stop words removed and
# tokens stemmed by clean_abstract, so stop_words='english' is matched against
# stemmed tokens.
tfidf = TfidfVectorizer(
    max_features=20000,
    stop_words='english',
    ngram_range=(1, 2),
)

In [None]:
# Learn the vocabulary / IDF weights and vectorise the training abstracts in one pass.
# fit_transform yields the same matrix as fit() followed by transform() without
# analysing the whole corpus a second time.
# tfidf_vectorizer stays bound to the fitted vectorizer (fit() returned this same
# `tfidf` object) because later cells use it to transform the test abstracts.
tfidf_vectorizer = tfidf
X_train = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# Vectorise the test abstracts with the already-fitted vectorizer (transform only,
# so the vocabulary learned from the training abstracts is reused as-is).
test_data = tfidf_vectorizer.transform(test_data)

In [None]:
# Show the repr of the vectorised training matrix (shape and number of stored values).
X_train

<439427x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 26222805 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [None]:
def sampling_strategy(y, n_samples, t='majority', min_samples=10):
    """Build a ``{class_label: n_samples}`` resampling target dictionary.

    Args:
        y: Iterable of class labels (anything ``collections.Counter`` accepts).
        n_samples: Number of samples every selected class should end up with.
        t: ``'majority'`` selects the classes that have more than ``n_samples``
            samples; ``'minority'`` selects the classes that have fewer than
            ``n_samples`` but more than ``min_samples`` samples.
        min_samples: Only used for ``'minority'``. Classes with this many
            samples or fewer are left out, because SMOTE builds synthetic
            points from nearest neighbours and needs enough real samples.

    Returns:
        dict mapping each selected class label to ``n_samples``.

    Raises:
        ValueError: if ``t`` is neither ``'majority'`` nor ``'minority'``.
    """
    counts = Counter(y)
    if t == 'majority':
        selected = [label for label, count in counts.items() if count > n_samples]
    elif t == 'minority':
        selected = [label for label, count in counts.items()
                    if min_samples < count < n_samples]
    else:
        # The original fell through with an empty string and failed later with an
        # opaque AttributeError on str.items(); fail early with a clear message.
        raise ValueError("t must be 'majority' or 'minority', got {!r}".format(t))
    return {label: n_samples for label in selected}

In [None]:
# Over-sampling targets: every class with more than 10 but fewer than 1000 training
# samples is mapped to a target count of 1000.
over_sampler_ss = sampling_strategy(y_train,1000,t='minority')

In [None]:
# SMOTE draws random neighbours when synthesising samples; pin random_state so the
# resampled data (and every score computed from it) is reproducible across runs.
over = SMOTE(sampling_strategy=over_sampler_ss, random_state=42)

In [None]:
# Over-sample the selected minority classes with SMOTE.
X_over, y_over = over.fit_resample(X_train, y_train)

# Under-sample the majority classes. X_under / y_under are consumed by the Naive
# Bayes and Logistic Regression experiments further down but were never defined in
# the notebook, so a fresh "Restart & Run All" stopped there with a NameError.
# NOTE(review): the cap originally used is not recoverable from the notebook; 1000
# mirrors the over-sampling target -- confirm or tune.
under_sampler_ss = sampling_strategy(y_train, 1000, t='majority')
under = RandomUnderSampler(sampling_strategy=under_sampler_ss, random_state=42)
X_under, y_under = under.fit_resample(X_train, y_train)

**SVC**

In [None]:
# Linear SVM with an iteration cap of 2000. class_weight='balanced' asks scikit-learn
# to weight classes inversely to their frequency in the training labels.
svc_s = LinearSVC(max_iter=2000, class_weight='balanced')

In [None]:
# Fit the SVC on the TF-IDF training matrix and score it on those same rows.
# This is a training-set macro-F1, not a held-out estimate.
y_pred = svc_s.fit(X_train, y_train).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

F1 Score : 0.8188600585129954


In [None]:
# Refit the same SVC on the SMOTE-resampled data, then score it against the
# original (un-resampled) training rows.
y_pred = svc_s.fit(X_over, y_over).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

F1 Score : 0.8112695078457807


In [None]:
# Micro-averaged F1 for whatever training-set predictions y_pred currently holds.
micro_f1 = f1_score(y_train, y_pred, average='micro')
print('F1 Score : {}'.format(micro_f1))

F1 Score : 0.804333825641119


In [None]:
# Display the test frame; its abstract column now holds the cleaned, stemmed text.
test

Unnamed: 0,id,abstract
0,430065,depth map obtain commerci depth sensor low res...
1,75226,lambda express introduc java program languag r...
2,301990,propos demonstr gamma gamma collid w gg gev ad...
3,301001,physic lab student experi wide rang equit ineq...
4,280179,exist local minima hidden layer relu network i...
...,...,...
48821,465386,recent approach onlin action detect tend appli...
48822,163074,calcul spectrum b c meson non relativist quark...
48823,70019,si photon immens potenti develop compact low l...
48824,157781,depth sens applic rang home robot ar vr common...


In [None]:
# Predict the test categories with the SVC as last fitted and write the
# submission file (one row per test id).
y_pred = svc_s.predict(test_data)
output = pd.DataFrame({'id': test.id,
                       'category_num': y_pred})
output.to_csv('solution.csv', index=False)
# Preview goes last: a notebook only displays the final expression of a cell, so the
# original mid-cell output.head() was evaluated and silently discarded.
output.head()

**Naive Bayes**

In [None]:
# Multinomial Naive Bayes with the library's default settings.
nb = MultinomialNB()

In [None]:
# Naive Bayes trained and scored on the same TF-IDF training matrix
# (a training-set macro-F1).
y_pred = nb.fit(X_train, y_train).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

F1 Score : 0.23938505162227913


In [None]:
# Naive Bayes refitted on the under-sampled data (X_under, y_under) and scored
# against the original training rows.
y_pred = nb.fit(X_under, y_under).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

F1 Score : 0.26553298579643286


In [None]:
# Naive Bayes refitted on the SMOTE-resampled data and scored against the
# original training rows.
y_pred = nb.fit(X_over, y_over).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

F1 Score : 0.3105136339101684


**Logistic Regression**

In [None]:
# Logistic regression, single job, at most 200 solver iterations.
# NOTE(review): C is the inverse regularisation strength, so C=1e5 leaves the model
# almost unregularised. The recorded outputs of the fits on X_under and X_over show
# the solver stopping at the iteration limit -- consider a smaller C, a larger
# max_iter or a different solver.
logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter = 200)

In [None]:
# Logistic regression trained and scored on the same TF-IDF training matrix
# (a training-set macro-F1).
y_pred = logreg.fit(X_train, y_train).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

F1 Score : 0.5624722231388912


In [None]:
# Logistic regression refitted on the under-sampled data (X_under, y_under) and
# scored against the original training rows.
y_pred = logreg.fit(X_under, y_under).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


F1 Score : 0.5995019024694178


In [None]:
# Logistic regression refitted on the SMOTE-resampled data and scored against the
# original training rows.
y_pred = logreg.fit(X_over, y_over).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


F1 Score : 0.7632795797709603


In [None]:
# Baseline for comparison: LogisticRegression with library defaults, trained on the
# SMOTE-resampled data and scored against the original training rows.
plain = LogisticRegression()
y_pred = plain.fit(X_over, y_over).predict(X_train)
macro_f1 = f1_score(y_train, y_pred, average='macro')
print('F1 Score : {}'.format(macro_f1))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


F1 Score : 0.613139997156404


In [None]:
# Micro-averaged F1 for whatever training-set predictions y_pred currently holds.
micro_f1 = f1_score(y_train, y_pred, average='micro')
print('F1 Score : {}'.format(micro_f1))

F1 Score : 0.6804725244466089


In [None]:
# Test-set predictions come from the SVC (svc_s) as last fitted, not from the
# Naive Bayes or Logistic Regression models tried above.
y_pred = svc_s.predict(test_data)

In [None]:
# Inspect the predicted category numbers.
y_pred 

array([ 25,  54,  66, ..., 110,  25, 104])

In [None]:
# The test ids that get paired with the predictions in the submission frame.
test.id

0        430065
1         75226
2        301990
3        301001
4        280179
          ...  
48821    465386
48822    163074
48823     70019
48824    157781
48825    333324
Name: id, Length: 48826, dtype: int64

In [None]:
# Pair every test id with its predicted category and write the submission file.
output = pd.DataFrame({'id': test.id,
                       'category_num': y_pred})
output.to_csv('solution.csv', index=False)
# Preview goes last: a notebook only displays the final expression of a cell, so the
# original mid-cell output.head() was evaluated and silently discarded.
output.head()

In [None]:
# Preview the first rows of the submission frame.
output.head()

Unnamed: 0,id,category_num
0,430065,25
1,75226,50
2,301990,108
3,301001,118
4,280179,40
