In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
data = pd.read_csv('Cleaned Reddit Data AS.csv')
data.drop("Unnamed: 0", axis=1, inplace=True)
data.Content = data.Content.astype(str)
data.Title = data.Title.astype(str)
data.head()

Unnamed: 0,RID,Title,URL,Score,Comment_Score,Author,Content,Adult,Flair,Length_Title,Length_Content
0,g89s9t,Website find getting paid fairly comparing sal...,/r/india/comments/g89s9t/website_to_find_out_i...,2,0,ngranja19,,False,Business/Finance,84,0
1,g89rnl,Happy Akshaya Tritiya Akshaya Trititya 2020 Fe...,/r/india/comments/g89rnl/happy_akshaya_tritiya...,1,0,vaultuptechnologies,removed,False,AskIndia,74,7
2,g89ni7,A new turf war erupted Pakistan sponsored terr...,/r/india/comments/g89ni7/a_new_turf_war_has_er...,8,6,aviakki1,,False,Non-Political,166,0
3,g89j51,Special flights hospital beds Centre preps fly...,/r/india/comments/g89j51/special_flights_hospi...,13,6,silentr3b31,,False,Politics,75,0
4,g89iup,24 Vijayawada contract virus man hosts games h...,/r/india/comments/g89iup/24_in_vijayawada_cont...,65,8,drgnfly61,,False,Coronavirus,50,0


# 1. Logistic Regression

In [3]:
def log_classifier(xtrain, xtest, ytrain, ytest):

    from sklearn.linear_model import LogisticRegression

    model_log = Pipeline([('vect', CountVectorizer()), 
                          ('tfidf', TfidfTransformer()), 
                          ('clf', LogisticRegression(n_jobs=1, C=1e5))
                         ])
    model_log.fit(xtrain, ytrain)

    y_pred = model_log.predict(xtest)

    print('accuracy %s' % accuracy_score(ytest, y_pred))
    print(classification_report(ytest, y_pred))

# 2. Multinomial Naive Bayes Classifier

In [4]:
def nb_classifier(xtrain, xtest, ytrain, ytest):
    model_nb = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),
                        ])
    model_nb.fit(xtrain, ytrain)

    y_pred = model_nb.predict(xtest)

    print('accuracy %s' % accuracy_score(ytest, y_pred))
    print(classification_report(ytest, y_pred))

# 3. Multi Layer Perceptron Classifier

In [5]:
def mlp_classifier(xtrain, xtest, ytrain, ytest):  
    model_mlp = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                         ])
    model_mlp.fit(xtrain, ytrain)

    y_pred = model_mlp.predict(xtest)

    print('accuracy %s' % accuracy_score(ytest, y_pred))
    print(classification_report(ytest, y_pred))

# 4. Random Forest Classifier

In [6]:
def rforest_classifier(xtrain, xtest, ytrain, ytest):
    model_rforest = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                             ])
    model_rforest.fit(xtrain, ytrain)

    y_pred = model_rforest.predict(xtest)

    print('accuracy %s' % accuracy_score(ytest, y_pred))
    print(classification_report(ytest, y_pred))

# 5. Linear Support Vector Machine Classifier

In [7]:
def linear_svm_classifier(xtrain, xtest, ytrain, ytest):
    model_linearsvc = Pipeline([('vect', CountVectorizer()),
                                ('tfidf', TfidfTransformer()),
                                ('clf', LinearSVC()),
                               ])
    model_linearsvc.fit(xtrain, ytrain)

    y_pred = model_linearsvc.predict(xtest)

    print('accuracy %s' % accuracy_score(ytest, y_pred))
    print(classification_report(ytest, y_pred))

# 6. Stochastic Gradient Descent Classifier

In [8]:
def sgd_classifier(xtrain, xtest, ytrain, ytest):
    model_sgd = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', SGDClassifier()),
                         ])
    model_sgd.fit(xtrain, ytrain)

    y_pred = model_sgd.predict(xtest)

    print('accuracy %s' % accuracy_score(ytest, y_pred))
    print(classification_report(ytest, y_pred))

# 7. XGBoost Classifier

In [9]:
def xgb_classifier(xtrain, xtest, ytrain, ytest):
    model_xgb = Pipeline([('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('clf', XGBClassifier()),
                         ])
    model_xgb.fit(xtrain, ytrain)

    y_pred = model_xgb.predict(xtest)

    print('accuracy %s' % accuracy_score(ytest, y_pred))
    print(classification_report(ytest, y_pred))

In [10]:
def train_test(x,y):
 
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2)
   
    print("Results of Logistic Regression Classifier")
    log_classifier(xtrain, xtest, ytrain, ytest)
    print("Results of Multinomial Naive Bayes Classifier")
    nb_classifier(xtrain, xtest, ytrain, ytest)
    print("Results of Multi Layer Perceptron Classifier")
    mlp_classifier(xtrain, xtest, ytrain, ytest)
    print("Results of Random Forest Classifier")
    rforest_classifier(xtrain, xtest, ytrain, ytest)
    print("Results of Linear Support Vector Machine Classifier")
    linear_svm_classifier(xtrain, xtest, ytrain, ytest)
    print("Results of Stochastic Gradient Descent Classifier")
    sgd_classifier(xtrain, xtest, ytrain, ytest)
    print("Results of XGBoost Classifier")
    xgb_classifier(xtrain, xtest, ytrain, ytest)

In [11]:
flr = data.Flair
W = data.Title
X = data.URL
Y = data.Content
Z = W+X+Y

print("Flair Detection using Title as Feature")
train_test(W,flr)
print("Flair Detection using URL as Feature")
train_test(X,flr)
print("Flair Detection using Content as Features")
train_test(Y,flr)
print("Flair Detection using Combined Features")
train_test(Z,flr)

Flair Detection using Title as Feature
Results of Logistic Regression Classifier




accuracy 0.45673299398426653
                    precision    recall  f1-score   support

          AskIndia       0.41      0.35      0.38       666
  Business/Finance       0.23      0.26      0.25       140
       CAA-NRC-NPR       0.43      0.21      0.29        14
       Coronavirus       0.57      0.64      0.60      1523
              Food       0.67      0.30      0.41        40
     Non-Political       0.34      0.35      0.35       855
       Photography       0.51      0.47      0.49        88
    Policy/Economy       0.36      0.21      0.26       169
          Politics       0.41      0.46      0.43       597
         Scheduled       0.76      0.63      0.69        35
Science/Technology       0.28      0.18      0.22       155
            Sports       0.74      0.35      0.47        40

          accuracy                           0.46      4322
         macro avg       0.48      0.37      0.40      4322
      weighted avg       0.45      0.46      0.45      4322

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.43614067561314207
                    precision    recall  f1-score   support

          AskIndia       0.37      0.36      0.37       666
  Business/Finance       0.26      0.32      0.29       140
       CAA-NRC-NPR       0.14      0.14      0.14        14
       Coronavirus       0.59      0.60      0.59      1523
              Food       0.27      0.30      0.28        40
     Non-Political       0.33      0.36      0.35       855
       Photography       0.35      0.32      0.33        88
    Policy/Economy       0.23      0.24      0.23       169
          Politics       0.44      0.41      0.42       597
         Scheduled       0.50      0.66      0.57        35
Science/Technology       0.26      0.15      0.20       155
            Sports       0.48      0.25      0.33        40

          accuracy                           0.44      4322
         macro avg       0.35      0.34      0.34      4322
      weighted avg       0.44      0.44      0.43      4322

Results 



accuracy 0.37760296159185563
                    precision    recall  f1-score   support

          AskIndia       0.74      0.04      0.07       648
  Business/Finance       0.93      0.09      0.16       159
       CAA-NRC-NPR       0.00      0.00      0.00        19
       Coronavirus       0.36      0.98      0.53      1497
              Food       0.33      0.02      0.04        48
     Non-Political       0.66      0.04      0.08       884
       Photography       0.83      0.10      0.18        98
    Policy/Economy       0.60      0.07      0.13       167
          Politics       0.61      0.07      0.12       565
         Scheduled       0.93      0.33      0.48        43
Science/Technology       0.58      0.04      0.08       163
            Sports       0.50      0.03      0.06        31

          accuracy                           0.38      4322
         macro avg       0.59      0.15      0.16      4322
      weighted avg       0.56      0.38      0.25      4322

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.1008792225821379
                    precision    recall  f1-score   support

          AskIndia       0.73      0.04      0.07       648
  Business/Finance       0.04      0.98      0.08       159
       CAA-NRC-NPR       0.00      0.00      0.00        19
       Coronavirus       0.86      0.09      0.17      1497
              Food       1.00      0.02      0.04        48
     Non-Political       0.63      0.05      0.08       884
       Photography       0.88      0.07      0.13        98
    Policy/Economy       0.50      0.06      0.11       167
          Politics       0.56      0.07      0.12       565
         Scheduled       1.00      0.30      0.46        43
Science/Technology       0.42      0.03      0.06       163
            Sports       1.00      0.03      0.06        31

          accuracy                           0.10      4322
         macro avg       0.63      0.15      0.12      4322
      weighted avg       0.69      0.10      0.12      4322

Results o

  'precision', 'predicted', average, warn_for)


Results of Linear Support Vector Machine Classifier
accuracy 0.37760296159185563
                    precision    recall  f1-score   support

          AskIndia       0.71      0.04      0.07       648
  Business/Finance       0.93      0.09      0.16       159
       CAA-NRC-NPR       0.00      0.00      0.00        19
       Coronavirus       0.36      0.98      0.53      1497
              Food       0.50      0.02      0.04        48
     Non-Political       0.65      0.04      0.08       884
       Photography       0.83      0.10      0.18        98
    Policy/Economy       0.60      0.07      0.13       167
          Politics       0.62      0.07      0.12       565
         Scheduled       0.93      0.33      0.48        43
Science/Technology       0.50      0.03      0.06       163
            Sports       0.50      0.03      0.06        31

          accuracy                           0.38      4322
         macro avg       0.60      0.15      0.16      4322
      weighted av



accuracy 0.41369736233225357
                    precision    recall  f1-score   support

          AskIndia       0.40      0.60      0.48       620
  Business/Finance       0.00      0.00      0.00       165
       CAA-NRC-NPR       0.00      0.00      0.00        17
       Coronavirus       0.41      0.87      0.56      1527
              Food       0.00      0.00      0.00        49
     Non-Political       0.32      0.03      0.05       851
       Photography       0.00      0.00      0.00       106
    Policy/Economy       0.63      0.07      0.12       182
          Politics       0.67      0.03      0.06       543
         Scheduled       1.00      0.58      0.73        52
Science/Technology       0.00      0.00      0.00       176
            Sports       0.00      0.00      0.00        34

          accuracy                           0.41      4322
         macro avg       0.29      0.18      0.17      4322
      weighted avg       0.39      0.41      0.30      4322

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.4055992596020361
                    precision    recall  f1-score   support

          AskIndia       0.40      0.57      0.47       620
  Business/Finance       0.00      0.00      0.00       165
       CAA-NRC-NPR       0.00      0.00      0.00        17
       Coronavirus       0.41      0.86      0.56      1527
              Food       0.00      0.00      0.00        49
     Non-Political       0.20      0.04      0.06       851
       Photography       0.00      0.00      0.00       106
    Policy/Economy       0.75      0.05      0.09       182
          Politics       0.35      0.02      0.04       543
         Scheduled       1.00      0.58      0.73        52
Science/Technology       0.00      0.00      0.00       176
            Sports       0.00      0.00      0.00        34

          accuracy                           0.41      4322
         macro avg       0.26      0.18      0.16      4322
      weighted avg       0.33      0.41      0.29      4322

Results o

  'precision', 'predicted', average, warn_for)


accuracy 0.40791300323924107
                    precision    recall  f1-score   support

          AskIndia       0.38      0.72      0.50       620
  Business/Finance       0.00      0.00      0.00       165
       CAA-NRC-NPR       0.00      0.00      0.00        17
       Coronavirus       0.41      0.84      0.55      1527
              Food       0.00      0.00      0.00        49
     Non-Political       0.00      0.00      0.00       851
       Photography       0.00      0.00      0.00       106
    Policy/Economy       0.90      0.05      0.09       182
          Politics       1.00      0.00      0.01       543
         Scheduled       1.00      0.58      0.73        52
Science/Technology       0.00      0.00      0.00       176
            Sports       0.00      0.00      0.00        34

          accuracy                           0.41      4322
         macro avg       0.31      0.18      0.16      4322
      weighted avg       0.38      0.41      0.28      4322

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.4178621008792226
                    precision    recall  f1-score   support

          AskIndia       0.40      0.64      0.50       620
  Business/Finance       0.00      0.00      0.00       165
       CAA-NRC-NPR       0.00      0.00      0.00        17
       Coronavirus       0.41      0.87      0.56      1527
              Food       0.00      0.00      0.00        49
     Non-Political       0.36      0.02      0.03       851
       Photography       0.00      0.00      0.00       106
    Policy/Economy       0.65      0.06      0.11       182
          Politics       0.74      0.03      0.06       543
         Scheduled       1.00      0.58      0.73        52
Science/Technology       0.00      0.00      0.00       176
            Sports       0.00      0.00      0.00        34

          accuracy                           0.42      4322
         macro avg       0.30      0.18      0.17      4322
      weighted avg       0.41      0.42      0.30      4322

Results o



accuracy 0.5583063396575659
                    precision    recall  f1-score   support

          AskIndia       0.52      0.51      0.52       645
  Business/Finance       0.55      0.40      0.46       151
       CAA-NRC-NPR       0.57      0.20      0.30        20
       Coronavirus       0.61      0.78      0.69      1531
              Food       0.45      0.10      0.17        48
     Non-Political       0.45      0.41      0.43       880
       Photography       0.63      0.46      0.53        89
    Policy/Economy       0.37      0.27      0.31       150
          Politics       0.59      0.52      0.55       582
         Scheduled       0.96      0.67      0.79        39
Science/Technology       0.38      0.23      0.28       146
            Sports       0.82      0.44      0.57        41

          accuracy                           0.56      4322
         macro avg       0.58      0.41      0.47      4322
      weighted avg       0.55      0.56      0.55      4322

Results o

  'precision', 'predicted', average, warn_for)


accuracy 0.466682091624248
                    precision    recall  f1-score   support

          AskIndia       0.53      0.28      0.37       645
  Business/Finance       0.57      0.28      0.38       151
       CAA-NRC-NPR       0.21      0.25      0.23        20
       Coronavirus       0.64      0.62      0.63      1531
              Food       0.13      0.04      0.06        48
     Non-Political       0.31      0.52      0.39       880
       Photography       0.73      0.21      0.33        89
    Policy/Economy       0.24      0.27      0.25       150
          Politics       0.54      0.44      0.49       582
         Scheduled       1.00      0.64      0.78        39
Science/Technology       0.15      0.21      0.18       146
            Sports       0.29      0.24      0.26        41

          accuracy                           0.47      4322
         macro avg       0.45      0.33      0.36      4322
      weighted avg       0.51      0.47      0.47      4322

Results of