#### Importing necessary libraries and reading input data

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')



In [2]:

flairs = ["AskIndia", "Non-Political", "[R]eddiquette", 
          "Scheduled", "Photography", "Science/Technology",
          "Politics", "Business/Finance", "Policy/Economy",
          "Sports", "Food", "AMA"]


In [3]:
inputData = '..\\csv_data\\redditIndia2.csv'
data = pd.read_csv(inputData)
data.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,author,flair,over_18,comments,authors,title_comments_body,title_comments_url,all_features,title_comments,title_body,body_comments
0,lost job sick mother paralysed dad lockdown ea...,1053,g014wc,https://www.reddit.com/r/india/comments/g014wc...,134,1586742000.0,hiits really tough time everyone recently lost...,sanand_satwik,AskIndia,False,im freelancer dont listen idiots cant freelanc...,hashedram diabapp xataari Aashayrao sarcrasti...,lost job sick mother paralysed dad lockdown ea...,lost job sick mother paralysed dad lockdown ea...,lost job sick mother paralysed dad lockdown ea...,lost job sick mother paralysed dad lockdown ea...,lost job sick mother paralysed dad lockdown ea...,im freelancer dont listen idiots cant freelanc...
1,government come begging bowl every crisis,649,fxofyu,https://www.reddit.com/r/india/comments/fxofyu...,204,1586448000.0,floods terrorist attacks famines due lack rain...,TWO-WHEELER-MAFIA,AskIndia,False,dont understand dont use money contingency fun...,Kinky-Monk ak32009 fools_eye None DwncstSheep...,government come begging bowl every crisis dont...,government come begging bowl every crisis dont...,government come begging bowl every crisis dont...,government come begging bowl every crisis dont...,government come begging bowl every crisis floo...,dont understand dont use money contingency fun...
2,mothers condition going worse due hepatitis b ...,762,g0zlly,https://www.reddit.com/r/india/comments/g0zlly...,94,1586871000.0,hi folks really appreciate warm response previ...,sanand_satwik,AskIndia,False,anyone knows influential twitter bangalore ple...,AlternativeDrop6 TheRobotsHaveCome lanky32 pl...,mothers condition going worse due hepatitis b ...,mothers condition going worse due hepatitis b ...,mothers condition going worse due hepatitis b ...,mothers condition going worse due hepatitis b ...,mothers condition going worse due hepatitis b ...,anyone knows influential twitter bangalore ple...
3,people stuck family lockdown family falling apart,159,g4lrhm,https://www.reddit.com/r/india/comments/g4lrhm...,117,1587384000.0,dont think weve spend much time family long ti...,GauGau24,AskIndia,False,yesterday major fight wife mom mominlaw father...,Best-Economist Srthak_ ppccbba tb33296 damnji...,people stuck family lockdown family falling ap...,people stuck family lockdown family falling ap...,people stuck family lockdown family falling ap...,people stuck family lockdown family falling ap...,people stuck family lockdown family falling ap...,yesterday major fight wife mom mominlaw father...
4,prominent caste system india nowadays,111,g6tldd,https://www.reddit.com/r/india/comments/g6tldd...,107,1587700000.0,caste still exist india people still know cast...,Oomada9,AskIndia,False,much intact know girl threatened honor killing...,Cierno Vpee26 ppccbba merlin318 nou_kar Buns4...,prominent caste system india nowadays much int...,prominent caste system india nowadays much int...,prominent caste system india nowadays much int...,prominent caste system india nowadays much int...,prominent caste system india nowadays caste st...,much intact know girl threatened honor killing...


In [4]:
data.fillna("",inplace = True)

In [5]:
flair = data.flair

title_comments = data.title_comments
title_body = data.title_body
body_comments = data.body_comments
comment_x = data.comments
title_x = data.title
body_x = data.body
url_x = data.url
title_comments_body = data.title_comments_body
title_comments_url = data.title_comments_url
all_features = data.all_features


#### Naive Bayes Classifier

In [6]:
def naive_bayes_classifier(X_train, X_test, y_train, y_test):
    print('Using NB Classifier')

    nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

#### Linear SVM

In [7]:
def LinearSVM(X_train, X_test, y_train, y_test):
    print('Linear SVM')
    sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                 ])
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

#### Multi Layer Perceptron

In [8]:
def mlpClassifier(X_train, X_test, y_train, y_test):
    print('MLP Classifier')
    MLP = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                 ])
    
    MLP.fit(X_train, y_train)
    y_pred = MLP.predict(X_test)
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

#### Random Forest

In [9]:
def RandomForest(X_train, X_test, y_train, y_test):
    print('Using Random Forest')

    RNF = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('rnf',RandomForestClassifier(n_estimators=1000, random_state=34))])
    
    RNF.fit(X_train, y_train)

    y_pred = RNF.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

#### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
import pickle
def logisticRegression(X_train, X_test, y_train, y_test):
    print('Using Logistic Regression')
    
    logreg = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), ('lr', LogisticRegression(penalty='l2',random_state=34, solver='newton-cg', multi_class='multinomial',warm_start=True))])

    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

#### Function that takes text feature as input and prints classification report of all classifiers

In [11]:
def classify(x):
    for classifier in [logisticRegression,LinearSVM,RandomForest,mlpClassifier]:   

        X_train, X_test, y_train, y_test = train_test_split(x, flair, test_size=0.1, random_state = 34)
        classifier(X_train,X_test,y_train,y_test)


In [12]:
classify(title_comments_body)

Using Logistic Regression
accuracy 0.7666666666666667
                    precision    recall  f1-score   support

          AskIndia       1.00      1.00      1.00         7
     Non-Political       0.78      1.00      0.88         7
     [R]eddiquette       0.75      0.60      0.67        10
         Scheduled       0.93      1.00      0.96        13
       Photography       0.25      0.14      0.18         7
Science/Technology       0.62      1.00      0.77        10
          Politics       0.64      0.78      0.70         9
  Business/Finance       0.79      0.79      0.79        14
    Policy/Economy       0.80      1.00      0.89        12
            Sports       0.88      0.54      0.67        13
              Food       0.73      1.00      0.84         8
               AMA       1.00      0.30      0.46        10

          accuracy                           0.77       120
         macro avg       0.76      0.76      0.73       120
      weighted avg       0.78      0.77     

In [13]:
classify(title_comments)

Using Logistic Regression
accuracy 0.7333333333333333
                    precision    recall  f1-score   support

          AskIndia       1.00      1.00      1.00         7
     Non-Political       0.70      1.00      0.82         7
     [R]eddiquette       0.71      0.50      0.59        10
         Scheduled       0.93      1.00      0.96        13
       Photography       0.00      0.00      0.00         7
Science/Technology       0.62      1.00      0.77        10
          Politics       0.50      0.56      0.53         9
  Business/Finance       0.79      0.79      0.79        14
    Policy/Economy       0.80      1.00      0.89        12
            Sports       0.70      0.54      0.61        13
              Food       0.73      1.00      0.84         8
               AMA       1.00      0.30      0.46        10

          accuracy                           0.73       120
         macro avg       0.71      0.72      0.69       120
      weighted avg       0.73      0.73     

In [14]:
classify(comment_x)

Using Logistic Regression
accuracy 0.6416666666666667
                    precision    recall  f1-score   support

          AskIndia       0.88      1.00      0.93         7
     Non-Political       0.64      1.00      0.78         7
     [R]eddiquette       0.71      0.50      0.59        10
         Scheduled       0.90      0.69      0.78        13
       Photography       0.00      0.00      0.00         7
Science/Technology       0.60      0.60      0.60        10
          Politics       0.50      0.44      0.47         9
  Business/Finance       0.79      0.79      0.79        14
    Policy/Economy       0.80      1.00      0.89        12
            Sports       1.00      0.38      0.56        13
              Food       0.31      1.00      0.47         8
               AMA       0.75      0.30      0.43        10

          accuracy                           0.64       120
         macro avg       0.66      0.64      0.61       120
      weighted avg       0.70      0.64     

In [15]:
classify(body_x)

Using Logistic Regression
accuracy 0.3
                    precision    recall  f1-score   support

          AskIndia       0.50      0.57      0.53         7
     Non-Political       0.47      1.00      0.64         7
     [R]eddiquette       0.00      0.00      0.00        10
         Scheduled       0.00      0.00      0.00        13
       Photography       0.00      0.00      0.00         7
Science/Technology       0.13      1.00      0.22        10
          Politics       1.00      0.22      0.36         9
  Business/Finance       0.00      0.00      0.00        14
    Policy/Economy       0.80      1.00      0.89        12
            Sports       1.00      0.08      0.14        13
              Food       0.00      0.00      0.00         8
               AMA       0.00      0.00      0.00        10

          accuracy                           0.30       120
         macro avg       0.32      0.32      0.23       120
      weighted avg       0.33      0.30      0.22       120

In [16]:
classify(title_x)

Using Logistic Regression
accuracy 0.5666666666666667
                    precision    recall  f1-score   support

          AskIndia       0.75      0.86      0.80         7
     Non-Political       0.38      0.43      0.40         7
     [R]eddiquette       0.50      0.30      0.37        10
         Scheduled       0.64      0.54      0.58        13
       Photography       0.25      0.29      0.27         7
Science/Technology       0.80      0.80      0.80        10
          Politics       0.38      0.56      0.45         9
  Business/Finance       0.53      0.57      0.55        14
    Policy/Economy       0.92      1.00      0.96        12
            Sports       0.50      0.46      0.48        13
              Food       0.40      0.50      0.44         8
               AMA       0.67      0.40      0.50        10

          accuracy                           0.57       120
         macro avg       0.56      0.56      0.55       120
      weighted avg       0.58      0.57     