Import Required Libraries

In [5]:
import re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\obhim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\obhim\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\obhim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\obhim\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Import the dataset and clean the datasets

In [6]:
train= pd.read_csv("train.csv")
test= pd.read_csv("test.csv")
train.head(2)
print(train.shape)
print(test.shape)
col = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']
test = test.drop(['ID'],axis=1)

X = train.loc[:,['TITLE','ABSTRACT']]
y = train.loc[:,col]

(20972, 9)
(8989, 3)


Prepare the dataset for fitting in models

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=True)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

y_test.reset_index(drop=True,inplace=True)
X_test.reset_index(drop=True,inplace=True)

y1 = np.array(y_train)
y2 = np.array(y_test)
#Removing Punctuations

X_train.replace('[^a-zA-Z]',' ', regex=True, inplace=True)
X_test.replace('[^a-zA-Z]',' ', regex=True, inplace=True)

test.replace('[^a-zA-Z]',' ', regex=True, inplace=True)

(18874, 2) (2098, 2)
(18874, 6) (2098, 6)


In [8]:
#Converting to lower case characters

for index in X_train.columns:
  X_train[index] = X_train[index].str.lower()

for index in X_test.columns:
  X_test[index] = X_test[index].str.lower()

for index in test.columns:
  test[index] = test[index].str.lower()

#Removing one letter words

X_train['ABSTRACT'] = X_train['ABSTRACT'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')
X_test['ABSTRACT'] = X_test['ABSTRACT'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')

test['ABSTRACT'] = test['ABSTRACT'].str.replace(r'\b\w\b', '').str.replace(r'\s+', ' ')

#Removing multiple blank spaces

X_train = X_train.replace(r's+', ' ', regex=True)
X_test = X_test.replace(r's+', ' ', regex=True)

test = test.replace(r's+', ' ', regex=True)

In [9]:
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back to form a preprocessed text
    processed_text = ' '.join(tokens)
    
    return processed_text

In [10]:
def convert_to_lines(data):
    lines = []
    for row in range(data.shape[0]):
        lines.append(' '.join(str(x) for x in data.iloc[row, :]))
    return lines
stop_words = set(stopwords.words('english')) 
X_train['combined'] = X_train['TITLE']+' '+X_train['ABSTRACT']
X_test['combined'] = X_test['TITLE']+' '+X_test['ABSTRACT']

test['combined'] = test['TITLE']+' '+test['ABSTRACT']

X_train = X_train.drop(['TITLE','ABSTRACT'],axis=1)
X_test = X_test.drop(['TITLE','ABSTRACT'],axis=1)

test = test.drop(['TITLE','ABSTRACT'],axis=1)

X_train.head()
X_lines = []
for row in range(0,X.shape[0]):
  X_lines.append(' '.join(str(x) for x in X.iloc[row,:]))

train_lines = []
for row in range(0,X_train.shape[0]):
  train_lines.append(' '.join(str(x) for x in X_train.iloc[row,:]))

test_lines = []
for row in range(0,X_test.shape[0]):
  test_lines.append(' '.join(str(x) for x in X_test.iloc[row,:]))

predtest_lines = []
for row in range(0,test.shape[0]):
  predtest_lines.append(' '.join(str(x) for x in test.iloc[row,:]))
from sklearn.feature_extraction.text import CountVectorizer

countvector = CountVectorizer(ngram_range=(1,2))
X_train_cv = countvector.fit_transform(train_lines)
X_test_cv = countvector.transform(test_lines)

test_cv = countvector.transform(predtest_lines)

In [11]:
#Using TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

tfidfvector = TfidfTransformer()
X_train_tf = tfidfvector.fit_transform(X_train_cv)
X_test_tf = tfidfvector.fit_transform(X_test_cv)

test_tf = tfidfvector.fit_transform(test_cv)

X_cv = countvector.transform(X_lines)

X_tf = tfidfvector.fit_transform(X_cv) 

Preparing Model and train the model

In [12]:
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier

model = LinearSVC(C=0.5, class_weight='balanced', random_state=42)
models = MultiOutputClassifier(model)

models.fit(X_train_tf, y1)
preds = models.predict(X_test_tf)
preds
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#print(confusion_matrix(y2,preds))
print(classification_report(y2,preds))
print(accuracy_score(y2,preds))
predssv = models.predict(test_tf)
predssv
test = pd.read_csv('test.csv')

submit = pd.DataFrame({'ID': test.ID, 'Computer Science': predssv[:,0],'Physics':predssv[:,1],
                       'Mathematics':predssv[:,2],'Statistics':predssv[:,3],'Quantitative Biology':predssv[:,4],
                       'Quantitative Finance':predssv[:,5]})
submit.head()
submit.to_csv('obhi_data.csv', index=False)

              precision    recall  f1-score   support

           0       0.80      0.90      0.85       853
           1       0.89      0.88      0.89       623
           2       0.83      0.83      0.83       580
           3       0.73      0.85      0.78       516
           4       0.49      0.40      0.44        58
           5       0.81      0.65      0.72        26

   micro avg       0.80      0.86      0.83      2656
   macro avg       0.76      0.75      0.75      2656
weighted avg       0.81      0.86      0.83      2656
 samples avg       0.84      0.88      0.84      2656

0.6601525262154433


In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier

# Define the extended parameter grid
param_grid = {
    'estimator__C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100],  # Regularization parameter
    'estimator__class_weight': [None, 'balanced'],
    'estimator__loss': ['hinge', 'squared_hinge'],  # Loss function
    'estimator__tol': [1e-5, 1e-4, 1e-3],  # Tolerance for stopping criteria
    'estimator__max_iter': [1000, 2000, 3000],  # Maximum number of iterations
}

# Create the base model
base_model = LinearSVC(random_state=42)

# Create the MultiOutputClassifier
model = MultiOutputClassifier(base_model)

# Create the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train_tf, y1)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
preds = best_model.predict(X_test_tf)

# Print evaluation metrics
from sklearn.metrics import accuracy_score, classification_report
print(classification_report(y2, preds))
print("Accuracy:", accuracy_score(y2, preds))

# Make predictions on the final test set
predssv = best_model.predict(test_tf)

# Create the submission dataframe
submit = pd.DataFrame({'ID': test.ID, 'Computer Science': predssv[:, 0],
                       'Physics': predssv[:, 1], 'Mathematics': predssv[:, 2],
                       'Statistics': predssv[:, 3], 'Quantitative Biology': predssv[:, 4],
                       'Quantitative Finance': predssv[:, 5]})

# Save the submission to a CSV file
submit.to_csv('obhi_data_tuned_extended.csv', index=False)


Best Parameters: {'estimator__C': 1, 'estimator__class_weight': 'balanced', 'estimator__loss': 'squared_hinge', 'estimator__max_iter': 1000, 'estimator__tol': 1e-05}
              precision    recall  f1-score   support

           0       0.80      0.90      0.85       853
           1       0.90      0.88      0.89       623
           2       0.85      0.82      0.84       580
           3       0.73      0.83      0.77       516
           4       0.56      0.34      0.43        58
           5       0.79      0.58      0.67        26

   micro avg       0.81      0.85      0.83      2656
   macro avg       0.77      0.72      0.74      2656
weighted avg       0.82      0.85      0.83      2656
 samples avg       0.84      0.87      0.84      2656

Accuracy: 0.6625357483317446


In [19]:
import pickle
# Save the trained MultiOutputClassifier model to a file
with open('multi_output_classifier_model.pkl', 'wb') as file:
    pickle.dump(best_model, file) 
# Loading the MultiOutputClassifier model
with open('multi_output_classifier_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Save the CountVectorizer
with open('countvectorizer.pkl', 'wb') as file:
    pickle.dump(countvector, file)
# Loading the CountVectorizer
with open('countvectorizer.pkl', 'rb') as file:
    loaded_countvectorizer = pickle.load(file)

# Save the TfidfTransformer
with open('tfidftransformer.pkl', 'wb') as file:
    pickle.dump(tfidfvector, file)
# Loading the TfidfTransformer
with open('tfidftransformer.pkl', 'rb') as file:
    loaded_tfidftransformer = pickle.load(file)