#### Importing necessary libraries and reading input data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:

# Flair names used as the display labels in every classification report below.
flairs = [
    "AskIndia",
    "Non-Political",
    "[R]eddiquette",
    "Scheduled",
    "Photography",
    "Science/Technology",
    "Politics",
    "Business/Finance",
    "Policy/Economy",
    "Sports",
    "Food",
    "AMA",
]


In [None]:
# Use a forward slash as the separator: it is accepted on Windows as well as
# on Linux/macOS, whereas a backslash is treated as part of the file name on
# POSIX systems and the file is never found.
inputData = 'csv_data/redditIndia2.csv'
data = pd.read_csv(inputData)
# Last expression of the cell: displays the first rows in the notebook.
data.head()

In [None]:
# Replace every missing value with an empty string, modifying `data` in place.
data.fillna(value="", inplace=True)

In [None]:
# Target labels passed to train_test_split in classify().
flair = data["flair"]

# Candidate input features, one Series per CSV column. Bracket indexing is
# used instead of attribute access so that a column can never be shadowed by
# a DataFrame attribute or method that happens to share its name.
# NOTE(review): the combined columns (e.g. title_comments) are presumably
# concatenations of the named fields built upstream - confirm against the CSV.
title_comments = data["title_comments"]
title_body = data["title_body"]
body_comments = data["body_comments"]
comment_x = data["comments"]
title_x = data["title"]
body_x = data["body"]
url_x = data["url"]
title_comments_body = data["title_comments_body"]
title_comments_url = data["title_comments_url"]
all_features = data["all_features"]


#### Naive Bayes Classifier

In [None]:
def nb_classifier(X_train, X_test, y_train, y_test):
    """Fit a Multinomial Naive Bayes text pipeline and print its test scores.

    The pipeline converts raw text to token counts, re-weights them with
    TF-IDF and fits ``MultinomialNB`` on the result.

    Args:
        X_train: Text documents used for fitting.
        X_test: Text documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Prints the accuracy and a per-flair classification report; returns None.
    """
    print('Using NB Classifier')

    nb = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    # scikit-learn metrics take the ground truth first, predictions second.
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # target_names on its own is matched positionally against the *sorted*
    # labels found in the data: since `flairs` is not sorted the rows would be
    # mislabelled, and the call raises when a flair is missing from the split.
    # Passing labels=flairs ties every row to its own name.
    # NOTE(review): assumes the label values are the strings in `flairs` - confirm.
    print(classification_report(y_test, y_pred, labels=flairs, target_names=flairs))

#### Linear SVM

In [None]:
def LinearSVM(X_train, X_test, y_train, y_test):
    """Fit a linear SVM (SGD with hinge loss) text pipeline and print its test scores.

    The pipeline converts raw text to token counts, re-weights them with
    TF-IDF and fits an ``SGDClassifier`` with hinge loss and L2 penalty.

    Args:
        X_train: Text documents used for fitting.
        X_test: Text documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Prints the accuracy and a per-flair classification report; returns None.
    """
    print('Linear SVM')
    sgd = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # tol=None disables the convergence check, so training always runs
        # for the full max_iter=5 passes over the data.
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              random_state=42, max_iter=5, tol=None)),
    ])
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)

    # scikit-learn metrics take the ground truth first, predictions second.
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # target_names on its own is matched positionally against the *sorted*
    # labels found in the data: since `flairs` is not sorted the rows would be
    # mislabelled, and the call raises when a flair is missing from the split.
    # Passing labels=flairs ties every row to its own name.
    # NOTE(review): assumes the label values are the strings in `flairs` - confirm.
    print(classification_report(y_test, y_pred, labels=flairs, target_names=flairs))

#### Multi Layer Perceptron

In [None]:
def mlpClassifier(X_train, X_test, y_train, y_test):
    """Fit a multi-layer perceptron text pipeline and print its test scores.

    The pipeline converts raw text to token counts, re-weights them with
    TF-IDF and fits an ``MLPClassifier`` with three hidden layers of 30 units.

    Args:
        X_train: Text documents used for fitting.
        X_test: Text documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Prints the accuracy and a per-flair classification report; returns None.
    """
    print('MLP Classifier')
    MLP = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MLPClassifier(hidden_layer_sizes=(30, 30, 30))),
    ])

    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)

    # scikit-learn metrics take the ground truth first, predictions second.
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # target_names on its own is matched positionally against the *sorted*
    # labels found in the data: since `flairs` is not sorted the rows would be
    # mislabelled, and the call raises when a flair is missing from the split.
    # Passing labels=flairs ties every row to its own name.
    # NOTE(review): assumes the label values are the strings in `flairs` - confirm.
    print(classification_report(y_test, y_pred, labels=flairs, target_names=flairs))

#### Random Forest

In [None]:
def RandomForest(X_train, X_test, y_train, y_test):
    """Fit a random-forest text pipeline and print its test scores.

    The pipeline builds TF-IDF features (English stop words removed) and fits
    a ``RandomForestClassifier`` with 1000 trees and a fixed random seed.

    Args:
        X_train: Text documents used for fitting.
        X_test: Text documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Prints the accuracy and a per-flair classification report; returns None.
    """
    print('Using Random Forest')

    RNF = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('rnf', RandomForestClassifier(n_estimators=1000, random_state=34)),
    ])

    RNF.fit(X_train, y_train)

    y_pred = RNF.predict(X_test)

    # scikit-learn metrics take the ground truth first, predictions second.
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # target_names on its own is matched positionally against the *sorted*
    # labels found in the data: since `flairs` is not sorted the rows would be
    # mislabelled, and the call raises when a flair is missing from the split.
    # Passing labels=flairs ties every row to its own name.
    # NOTE(review): assumes the label values are the strings in `flairs` - confirm.
    print(classification_report(y_test, y_pred, labels=flairs, target_names=flairs))

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
import pickle
def logisticRegression(X_train, X_test, y_train, y_test):
    """Fit a logistic-regression text pipeline, save it, and print its test scores.

    The pipeline builds TF-IDF features (English stop words removed) and fits
    a multinomial ``LogisticRegression``. The fitted pipeline is pickled to
    ``model_logreg_2.sav`` in the current working directory, overwriting any
    existing file of that name.

    Args:
        X_train: Text documents used for fitting.
        X_test: Text documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Prints the accuracy and a per-flair classification report; returns None.
    """
    print('Using Logistic Regression')

    logreg = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('lr', LogisticRegression(penalty='l2', random_state=34,
                                  solver='newton-cg', multi_class='multinomial',
                                  warm_start=True)),
    ])

    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails; a bare open() left the handle to the garbage collector.
    with open("model_logreg_2.sav", 'wb') as model_file:
        pickle.dump(logreg, model_file)

    # scikit-learn metrics take the ground truth first, predictions second.
    print('accuracy %s' % accuracy_score(y_test, y_pred))
    # target_names on its own is matched positionally against the *sorted*
    # labels found in the data: since `flairs` is not sorted the rows would be
    # mislabelled, and the call raises when a flair is missing from the split.
    # Passing labels=flairs ties every row to its own name.
    # NOTE(review): assumes the label values are the strings in `flairs` - confirm.
    print(classification_report(y_test, y_pred, labels=flairs, target_names=flairs))

#### Function that takes a text feature as input and prints the classification report of each classifier it runs (Logistic Regression, Random Forest, Linear SVM and MLP)

In [None]:
def classify(x):
    """Evaluate several classifiers on one text feature.

    Splits ``x`` and the module-level ``flair`` labels into a 90/10
    train/test partition, then runs Logistic Regression, Random Forest,
    Linear SVM and the MLP on that partition. Each classifier prints its own
    accuracy and classification report.

    Args:
        x: The text feature to train on, aligned row-for-row with ``flair``.
    """
    # The split is seeded, so it is identical for every classifier: compute it
    # once instead of repeating the same work on each loop iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        x, flair, test_size=0.1, random_state=49)

    for classifier in (logisticRegression, RandomForest, LinearSVM, mlpClassifier):
        classifier(X_train, X_test, y_train, y_test)


In [None]:
# Run the evaluation using the `title_comments` column as the input feature.
classify(x=title_comments)