## IMPORT REQUIRED LIBRARIES

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import joblib
import pickle
import pandas as pd
import warnings
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings('ignore')

### define the flairs and read data. Afterwards define the features to be used and split into training and testing.

In [14]:
flairs = ["AskIndia", "Non-Political", "[R]eddiquette", "Scheduled", "Photography", "Science/Technology", "Politics", "Business/Finance", "Policy/Economy", "Sports", "Food", "AMA", "CAA-NRC", "Coronavirus"]

In [15]:
data = pd.read_csv('final.csv')
data.fillna("",inplace = True)

In [16]:
y = data.flair
X = data.feature_combine

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 42)

### define functions for logistic regression, stochastic gradient descent classifier and random forest classifier.
### using Pipeline, we used countvectorizer and tfidf transforms and for each function a different estimator.

In [18]:
def LogiReg(X_train, X_test, y_train, y_test):
    logreg = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(n_jobs=1, C=1e5))])
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))
    
def SGDC(X_train, X_test, y_train, y_test):
    sgd = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

def RandomForest(X_train, X_test, y_train, y_test):
    ranfor = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),])
    ranfor.fit(X_train, y_train)
    y_pred = ranfor.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

LogiReg(X_train, X_test, y_train, y_test)
SGDC(X_train, X_test, y_train, y_test)
RandomForest(X_train, X_test, y_train, y_test)


accuracy 0.6988636363636364
                    precision    recall  f1-score   support

          AskIndia       0.72      0.65      0.68        43
     Non-Political       0.40      0.44      0.42        36
     [R]eddiquette       0.69      0.69      0.69        42
         Scheduled       0.80      0.92      0.85        38
       Photography       0.75      0.90      0.82        42
Science/Technology       0.68      0.89      0.77        36
          Politics       0.53      0.52      0.52        33
  Business/Finance       0.93      0.84      0.88        45
    Policy/Economy       0.53      0.56      0.55        41
            Sports       0.68      0.53      0.60        43
              Food       0.80      0.58      0.67        48
               AMA       0.72      0.79      0.76        39
           CAA-NRC       0.84      0.73      0.78        37
       Coronavirus       0.80      0.80      0.80         5

          accuracy                           0.70       528
         m

### since random forest and SGD classifiers have given higher accuracy we will build final models with them.

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state = 42)

ranfor = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),])
RM = ranfor.fit(X_train, y_train)
pickle.dump(RM,open("RandomForest.pkl",'wb'))
y_pred = ranfor.predict(X_test)

sgd = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
sgd.fit(X_train, y_train)
pickle.dump(sgd,open("SGDC.pkl",'wb'))
y_pred = sgd.predict(X_test)
