In [1]:
import pandas as pd
import numpy as np
import json
import joblib
import gensim
import re
import pickle
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the dataset CSV (path is relative to this notebook) into a pandas DataFrame.
data = pd.read_csv(filepath_or_buffer='../data/processedasksciencefinal.csv')

In [3]:
# Flair names, one entry per line so additions/removals diff cleanly.
flairs = [
    'Physics',
    'Astronomy',
    'Mathematics',
    'Computing',
    'Engineering',
    'Chemistry',
    'Earth Sciences',
    'Planetary Sci.',
    'Biology',
    'Paleontology',
    'Medicine',
    'Human Body',
    'Neuroscience',
    'na',
    'Psychology',
]

In [4]:
# Let's split our data into train and test
# We will use 75:25 rule, 75% for train and 25% for test
# Let's create the function for it
# we will use train_test_split function for it which comes with sklearn
# setting random state to 10, we can set it to any fixed value

def train_test(X, y, test_size=0.25, random_state=10):
    """Split the data once and print the hold-out accuracy of every classifier.

    The same train/test split is handed to each classifier so that their
    accuracies are directly comparable.

    Parameters
    ----------
    X : feature column passed to ``train_test_split`` (the callers in this
        notebook pass a text column of the DataFrame).
    y : label column aligned with ``X``.
    test_size : fraction of the samples held out for testing. Defaults to
        0.25, i.e. the 75:25 split used throughout this notebook.
    random_state : seed for the shuffling done by ``train_test_split``.
        Defaults to 10; any fixed value keeps the split reproducible.

    Returns
    -------
    None. Each classifier function prints its own accuracy.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # (heading printed before the run, function that trains and scores the model)
    evaluators = (
        ("Naive Bayes Classifier ", naivebayes_classifier),
        ("Linear Support Vector Machine ", lsvm),
        ("Logistic Regression ", logistic_regression),
        ("Random Forest ", random_forest),
    )
    for heading, evaluate in evaluators:
        print(heading)
        evaluate(X_train, X_test, y_train, y_test)

In [5]:
# Let's create a function for each classifier

# Naive Bayes
def naivebayes_classifier(X_train, X_test, y_train, y_test):
    """Train a multinomial Naive Bayes text classifier and print its test accuracy.

    The pipeline turns raw text into token counts (``CountVectorizer``),
    re-weights them with TF-IDF (``TfidfTransformer``) and fits
    ``MultinomialNB`` on the result.

    Parameters
    ----------
    X_train, X_test : text samples used for fitting / evaluation.
    y_train, y_test : labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None. The accuracy on the test split is printed.
    """
    nb = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    # accuracy_score expects (y_true, y_pred); keep that order so the call stays
    # correct if it is ever swapped for an asymmetric metric.
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: " + str(accuracy))

# Linear Support Vector Machine
def lsvm(X_train, X_test, y_train, y_test):
    """Train a linear SVM (hinge-loss ``SGDClassifier``) and print its test accuracy.

    The pipeline turns raw text into token counts (``CountVectorizer``),
    re-weights them with TF-IDF (``TfidfTransformer``) and fits an
    L2-penalised, hinge-loss ``SGDClassifier`` on the result.

    Parameters
    ----------
    X_train, X_test : text samples used for fitting / evaluation.
    y_train, y_test : labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None. The accuracy on the test split is printed.
    """
    sgd = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # tol=None disables the early-stopping criterion, so exactly max_iter
        # epochs are run; random_state fixes the shuffling between epochs.
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              random_state=10, max_iter=5, tol=None)),
    ])
    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)
    # accuracy_score expects (y_true, y_pred); keep that order so the call stays
    # correct if it is ever swapped for an asymmetric metric.
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: " + str(accuracy))

# Logistic Regression
def logistic_regression(X_train, X_test, y_train, y_test):
    """Train a logistic-regression text classifier and print its test accuracy.

    The pipeline turns raw text into token counts (``CountVectorizer``),
    re-weights them with TF-IDF (``TfidfTransformer``) and fits
    ``LogisticRegression`` on the result.

    Parameters
    ----------
    X_train, X_test : text samples used for fitting / evaluation.
    y_train, y_test : labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None. The accuracy on the test split is printed.
    """
    logreg = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # C=1e30 makes the regularisation term negligible. With the default
        # iteration cap this configuration stops with "TOTAL NO. of ITERATIONS
        # REACHED LIMIT", so the solver is given a larger budget.
        ('clf', LogisticRegression(n_jobs=1, C=1e30, max_iter=1000)),
    ])
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    # accuracy_score expects (y_true, y_pred); keep that order so the call stays
    # correct if it is ever swapped for an asymmetric metric.
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: " + str(accuracy))
    
# Random Forest
def random_forest(X_train, X_test, y_train, y_test):
    """Train a random-forest text classifier and print its test accuracy.

    The pipeline turns raw text into token counts (``CountVectorizer``),
    re-weights them with TF-IDF (``TfidfTransformer``) and fits a
    ``RandomForestClassifier`` with 1000 trees on the result.

    Parameters
    ----------
    X_train, X_test : text samples used for fitting / evaluation.
    y_train, y_test : labels aligned with ``X_train`` / ``X_test``.

    Returns
    -------
    None. The accuracy on the test split is printed.
    """
    ranfor = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # random_state is fixed so repeated runs build the same forest.
        ('clf', RandomForestClassifier(n_estimators=1000, random_state=10)),
    ])
    ranfor.fit(X_train, y_train)
    y_pred = ranfor.predict(X_test)
    # accuracy_score expects (y_true, y_pred); keep that order so the call stays
    # correct if it is ever swapped for an asymmetric metric.
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: " + str(accuracy))

In [6]:
# Silence library warnings. Note: this filter is process-wide and stays active
# for every cell executed after this one.
warnings.filterwarnings('ignore')

# Compare all classifiers using only the post title as the input text.
print("Flair Detection using Title as a Feature:",
      "---------------------------------------",
      sep="\n")
train_test(data['title'], data['flair'])

Flair Detection using Title as a Feature:
---------------------------------------
Naive Bayes Classifier 
Accuracy: 0.9795918367346939
Linear Support Vector Machine 
Accuracy: 0.9925265881000287
Logistic Regression 
Accuracy: 0.999137683242311
Random Forest 
Accuracy: 0.999137683242311


In [7]:
# Compare all classifiers using only the post body as the input text.
print("Flair Detection using Body as a Feature:",
      "---------------------------------------",
      sep="\n")
train_test(data['body'], data['flair'])

Flair Detection using Body as a Feature:
---------------------------------------
Naive Bayes Classifier 
Accuracy: 0.7936188559931014
Linear Support Vector Machine 
Accuracy: 0.7755102040816326
Logistic Regression 
Accuracy: 0.8177637252083932
Random Forest 
Accuracy: 0.8177637252083932


In [8]:
# Compare all classifiers using the combined `feature_tbu` column as the input text.
print("Flair Detection using Title+Body+URL as a Feature:",
      "---------------------------------------",
      sep="\n")
train_test(data['feature_tbu'], data['flair'])

Flair Detection using Title+Body+URL as a Feature:
---------------------------------------
Naive Bayes Classifier 
Accuracy: 0.9833285426846795
Linear Support Vector Machine 
Accuracy: 0.9971256108077033
Logistic Regression 
Accuracy: 1.0
Random Forest 
Accuracy: 1.0


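### Summary of the accuracy comparison above (25% hold-out split)
### The combined Title + Body + URL feature performed better than Title or Body alone.
### Logistic Regression and Random Forest tied for the highest accuracy on every feature set; Logistic Regression is used for the final model below.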

In [6]:
# Train the final model: logistic regression on the combined `feature_tbu`
# column, with a 75:25 train/test split and a fixed seed for reproducibility.
X = data.feature_tbu
y = data.flair
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10)
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # With the default iteration cap the solver stopped with "TOTAL NO. of
    # ITERATIONS REACHED LIMIT", i.e. the fitted (and later pickled) model had
    # not converged. Give the solver a larger iteration budget.
    ('clf', LogisticRegression(n_jobs=1, C=1e30, max_iter=1000)),
])
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
# Persist the fitted pipeline. The context manager flushes and closes the file
# handle as soon as the dump finishes instead of leaving it open until it is
# garbage-collected.
with open('../models/final_model_askscience.pkl', 'wb') as model_file:
    pickle.dump(logreg, model_file)