In [39]:
# Libraries
import pandas as pd

import re
import praw
import pickle

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

### Fetching the Data

In [40]:
data = pd.read_csv('Final Reddit India Data.csv')

In [41]:
data.shape

(2218, 8)

In [42]:
data.head()

Unnamed: 0,Post_Flair,Post_Title,Post_ID,Post_Score,Post_URL,Post_Body,Number_of_Comments_on_Post,Post_Comments
0,AMA,Healthcare Experts in an AMA on low count of C...,fqbie6,2235,https://i.imgur.com/dII91mh.jpg,,318,"As Physician working in a corporate hospital,..."
1,AMA,I am an exmuslim. AMA!,fw2u16,147,https://www.reddit.com/r/india/comments/fw2u16...,Hi! \n\n\nI am an in-the-closet exmuslim who...,259,What are the attitudes among your friends/fam...
2,AMA,"Hi, I’m Dr. Roshan Radhakrishnan, a doctor, an...",fk649f,813,https://i.redd.it/vf1ak8axs8n41.jpg,,411,Please remember that top level comments are r...
3,AMA,Casual AMA: we are group of friends who met th...,fmxz3j,29,https://www.reddit.com/r/india/comments/fmxz3j...,"Hello, r/india we are few old members of this ...",78,Are here since 2013. _Reddit age: 15D._ **PRE...
4,AMA,AMA. Indian living in North Italy in the coron...,fg3mdn,98,https://www.reddit.com/r/india/comments/fg3mdn...,Hi guys. I live in Milan and thought I'd try d...,48,"According to news, the virus kill rate in Ita..."


#### We cannot directly apply out machine learning or deep learning models directly on raw text. Data needs to be preprocessed that is data should be converted to a cleaner form so that it can be fed to our model.

### Text Cleaning

In [43]:
replace_by_space = re.compile('[/(){}\[\]\|@,;]')
replace_symbol = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Converting text to lowercase
    text = replace_by_space.sub(' ', text)  # Replacing symbols mentioned in replace_by_space by space in text
    text = replace_symbol.sub('', text)  # Deleting symbols mentioned in replace_symbol from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # Removing STOPWORDS from text
    
    return text

# Stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to 
# ignore, both when indexing entries for searching and when retrieving them as the result of a search query.

### Data Preprocessing

In [44]:
# Converting data to string
data['Post_Title'] = data['Post_Title'].apply(str)
data['Post_Body'] = data['Post_Body'].apply(str)
data['Post_Comments'] = data['Post_Comments'].apply(str)

# Applying clean text function on data to clean the data
data['Post_Title'] = data['Post_Title'].apply(clean_text)
data['Post_Body'] = data['Post_Body'].apply(clean_text)
data['Post_Comments'] = data['Post_Comments'].apply(clean_text)

# Combining Title, Comment, URL, Body features of post and creating a combined feature
Combined_Features = data["Post_Title"] + data["Post_Comments"] + data["Post_URL"] + data["Post_Body"]
data_clean = data.assign(Combined_Features = Combined_Features)

In [45]:
data_clean.shape

(2218, 9)

In [46]:
data_clean.head()

Unnamed: 0,Post_Flair,Post_Title,Post_ID,Post_Score,Post_URL,Post_Body,Number_of_Comments_on_Post,Post_Comments,Combined_Features
0,AMA,healthcare experts ama low count covid19 cases...,fqbie6,2235,https://i.imgur.com/dII91mh.jpg,,318,physician working corporate hospital ill clari...,healthcare experts ama low count covid19 cases...
1,AMA,exmuslim ama,fw2u16,147,https://www.reddit.com/r/india/comments/fw2u16...,hi inthecloset exmuslim whos deeply sympatheti...,259,attitudes among friends family towards sunni s...,exmuslim amaattitudes among friends family tow...
2,AMA,hi im dr roshan radhakrishnan doctor avid writ...,fk649f,813,https://i.redd.it/vf1ak8axs8n41.jpg,,411,please remember top level comments reserved qu...,hi im dr roshan radhakrishnan doctor avid writ...
3,AMA,casual ama group friends met r india,fmxz3j,29,https://www.reddit.com/r/india/comments/fmxz3j...,hello r india old members community many since...,78,since 2013 _reddit age 15d_ press x doubt comm...,casual ama group friends met r indiasince 2013...
4,AMA,ama indian living north italy corona virus lock,fg3mdn,98,https://www.reddit.com/r/india/comments/fg3mdn...,hi guys live milan thought id try ama since ne...,48,according news virus kill rate italy high due ...,ama indian living north italy corona virus loc...


#### Removing NaN/NA values from data

In [47]:
data.fillna("", inplace = True)
# inplace = True means values would be updated in the data itself without creating a new copy of the data

In [48]:
reddit_flairs = ['AMA', 'Policy/Economy', 'Non Political', 'Science/Technology', 'Politics', 'Coronavirus', 'AskIndia', 'Business/Finance', 'Food', 'Photography', 'Sports', '[R]eddiquette']

### Trying Different Machine Learning Models

#### We'll be implementing the following machine learning algorithms on our dataset:
 - Logistic Regression
 - Naive Bayes
 - Linear SVM
 - SGDC
 - Random Forest
 - MLP Classifier
 - ADA Boost
 - Gradient Boost
 - KNN

In [49]:
# Logistic Regression
def logistic_reg(X_train, X_test, y_train, y_test):
    logreg = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression(n_jobs = 1, C = 1e5)),])
    logreg.fit(X_train, y_train)
    
    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

In [50]:
# Naive Bayes
def nb_classifier(X_train, X_test, y_train, y_test):
    nb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

In [51]:
# Linear SVM
def linear_svm(X_train, X_test, y_train, y_test):
    svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC(gamma = 'scale', kernel = 'linear')),])
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

In [52]:
# SGDC
def SGDC(X_train, X_test, y_train, y_test):
    sgd = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss = 'hinge', penalty = 'l2', alpha = 1e-3, random_state = 42, max_iter = 5, tol = None)),])
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

In [53]:
# Random Forest
def randomforest(X_train, X_test, y_train, y_test):
    ranfor = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),])
    ranfor.fit(X_train, y_train)

    y_pred = ranfor.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

In [54]:
# MLP Classifier
def mlpclassifier(X_train, X_test, y_train, y_test):  
    mlp = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MLPClassifier(hidden_layer_sizes = (30, 30, 30))),])
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

In [55]:
# ADA Boost
def adaboost(X_train, X_test, y_train, y_test):
    ada_b = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', AdaBoostClassifier(n_estimators = 200, learning_rate = 0.75))])
    ada_b.fit(X_train, y_train)
    
    y_pred = ada_b.predict(X_test)
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

In [56]:
# Gradient Boost
def gradientboost(X_train, X_test, y_train, y_test):
    grad_b = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1))])
    grad_b.fit(X_train, y_train)
    
    y_pred = grad_b.predict(X_test)
    
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

In [57]:
# KNN Classifier
def knnclassifier(X_train, X_test, y_train, y_test):
    knn = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', KNeighborsClassifier(n_neighbors = 5))])
    knn.fit(X_train, y_train)
    
    y_pred = knn.predict(X_test)
        
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names = reddit_flairs))

#### In most machine learning projects the data to be used is highly unlikely to be in the ideal format for producing the best performing model. There are quite often a number of transformational steps such as encoding categorical variables, feature scaling and normalisation that need to be performed. 
#### However, in a typical machine learning workflow we will need to apply all these transformations at least twice. Once when training the model and again on any new data we want to predict on. Scikit-learn pipelines are a tool to simplify this process. They have several key benefits:
 - They make your workflow much easier to read and understand.
 - They enforce the implementation and order of steps in your project.
 - These in turn make your work much more reproducible.

In [58]:
def train_test(X, y):
     
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 10)
    
    print("Results of Naive Bayes Classifier")
    nb_classifier(X_train, X_test, y_train, y_test)
    
    print("Results of Linear Support Vector Machine")
    linear_svm(X_train, X_test, y_train, y_test)
    
    print("Results of SGDC")
    SGDC(X_train, X_test, y_train, y_test)
    
    print("Results of Logistic Regression")
    logistic_reg(X_train, X_test, y_train, y_test)
    
    print("Results of Random Forest")
    randomforest(X_train, X_test, y_train, y_test)
    
    print("Results of MLP Classifier")
    mlpclassifier(X_train, X_test, y_train, y_test)
    
    print("Results of AdaBoost Classifier")
    adaboost(X_train, X_test, y_train, y_test)
    
    print("Results of GradientBoost Classifier")
    gradientboost(X_train, X_test, y_train, y_test)
    
    print("Results of KNN Classifier")
    knnclassifier(X_train, X_test, y_train, y_test)

In [59]:
to_predict = data_clean['Post_Flair']

combined = data_clean['Combined_Features']
comments = data_clean['Post_Comments']
title = data_clean['Post_Title']
body = data_clean['Post_Body']
url = data_clean['Post_URL']

#### The below code prints the test accuracy for all the ML models taken into consideration when only the post title is considered as a feature

In [60]:
print("Flair Detection using Title as Feature")
train_test(title, to_predict)

Flair Detection using Title as Feature
Results of Naive Bayes Classifier
accuracy 0.6774774774774774
                    precision    recall  f1-score   support

               AMA       0.85      0.89      0.87        57
    Policy/Economy       0.52      0.72      0.60        40
     Non Political       0.59      0.35      0.44        54
Science/Technology       0.74      0.89      0.81        54
          Politics       0.70      0.80      0.74        49
       Coronavirus       0.66      0.98      0.78        41
          AskIndia       0.81      0.81      0.81        48
  Business/Finance       0.39      0.38      0.38        53
              Food       0.64      0.56      0.60        52
       Photography       0.68      0.67      0.67        48
            Sports       0.88      0.57      0.69        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.68       555
         macro avg       0.62      0.63      0.62       5

  'precision', 'predicted', average, warn_for)


accuracy 0.7099099099099099
                    precision    recall  f1-score   support

               AMA       0.96      0.91      0.94        57
    Policy/Economy       0.90      0.68      0.77        40
     Non Political       0.44      0.52      0.47        54
Science/Technology       0.91      0.89      0.90        54
          Politics       0.88      0.73      0.80        49
       Coronavirus       0.91      1.00      0.95        41
          AskIndia       0.93      0.83      0.88        48
  Business/Finance       0.34      0.51      0.41        53
              Food       0.69      0.63      0.66        52
       Photography       0.53      0.60      0.56        48
            Sports       0.79      0.62      0.69        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.71       555
         macro avg       0.69      0.66      0.67       555
      weighted avg       0.74      0.71      0.72       555

Results o

  'precision', 'predicted', average, warn_for)


accuracy 0.6990990990990991
                    precision    recall  f1-score   support

               AMA       0.95      0.93      0.94        57
    Policy/Economy       0.83      0.72      0.77        40
     Non Political       0.46      0.46      0.46        54
Science/Technology       0.80      0.89      0.84        54
          Politics       0.84      0.76      0.80        49
       Coronavirus       0.87      1.00      0.93        41
          AskIndia       0.80      0.83      0.82        48
  Business/Finance       0.32      0.34      0.33        53
              Food       0.63      0.63      0.63        52
       Photography       0.54      0.58      0.56        48
            Sports       0.74      0.66      0.70        53
     [R]eddiquette       1.00      0.17      0.29         6

          accuracy                           0.70       555
         macro avg       0.73      0.66      0.67       555
      weighted avg       0.71      0.70      0.70       555

Results o

  'precision', 'predicted', average, warn_for)


accuracy 0.527927927927928
                    precision    recall  f1-score   support

               AMA       0.85      0.58      0.69        57
    Policy/Economy       0.43      0.47      0.45        40
     Non Political       0.39      0.37      0.38        54
Science/Technology       0.69      0.54      0.60        54
          Politics       0.58      0.61      0.59        49
       Coronavirus       0.47      0.83      0.60        41
          AskIndia       0.78      0.73      0.75        48
  Business/Finance       0.27      0.40      0.32        53
              Food       0.50      0.40      0.45        52
       Photography       0.54      0.56      0.55        48
            Sports       0.65      0.45      0.53        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.53       555
         macro avg       0.51      0.50      0.49       555
      weighted avg       0.56      0.53      0.53       555

Results of

  'precision', 'predicted', average, warn_for)


#### The below code prints the test accuracy for all the ML models taken into consideration when only the post body is considered as a feature

In [61]:
print("Flair Detection using Body as Feature")
train_test(body, to_predict)

Flair Detection using Body as Feature
Results of Naive Bayes Classifier
accuracy 0.32432432432432434
                    precision    recall  f1-score   support

               AMA       0.89      0.58      0.70        57
    Policy/Economy       0.28      0.72      0.40        40
     Non Political       0.35      0.61      0.45        54
Science/Technology       0.17      0.83      0.28        54
          Politics       1.00      0.02      0.04        49
       Coronavirus       0.00      0.00      0.00        41
          AskIndia       0.92      0.25      0.39        48
  Business/Finance       0.50      0.19      0.27        53
              Food       1.00      0.02      0.04        52
       Photography       0.67      0.25      0.36        48
            Sports       0.80      0.08      0.14        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.32       555
         macro avg       0.55      0.30      0.26       5

  'precision', 'predicted', average, warn_for)


accuracy 0.37657657657657656
                    precision    recall  f1-score   support

               AMA       0.82      0.63      0.71        57
    Policy/Economy       0.30      0.88      0.45        40
     Non Political       0.71      0.50      0.59        54
Science/Technology       0.17      0.83      0.28        54
          Politics       0.75      0.06      0.11        49
       Coronavirus       1.00      0.02      0.05        41
          AskIndia       1.00      0.38      0.55        48
  Business/Finance       0.47      0.26      0.34        53
              Food       0.60      0.06      0.11        52
       Photography       0.68      0.35      0.47        48
            Sports       1.00      0.19      0.32        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.38       555
         macro avg       0.62      0.35      0.33       555
      weighted avg       0.67      0.38      0.36       555

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.3981981981981982
                    precision    recall  f1-score   support

               AMA       0.62      0.72      0.67        57
    Policy/Economy       0.49      0.60      0.54        40
     Non Political       0.58      0.57      0.58        54
Science/Technology       0.67      0.04      0.07        54
          Politics       0.55      0.12      0.20        49
       Coronavirus       0.44      0.10      0.16        41
          AskIndia       0.92      0.48      0.63        48
  Business/Finance       0.62      0.19      0.29        53
              Food       0.16      0.83      0.27        52
       Photography       0.71      0.42      0.53        48
            Sports       0.71      0.32      0.44        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.40       555
         macro avg       0.54      0.37      0.36       555
      weighted avg       0.59      0.40      0.39       555

Results o

  'precision', 'predicted', average, warn_for)


accuracy 0.4
                    precision    recall  f1-score   support

               AMA       0.67      0.68      0.68        57
    Policy/Economy       0.45      0.65      0.53        40
     Non Political       0.66      0.50      0.57        54
Science/Technology       0.18      0.87      0.29        54
          Politics       0.71      0.10      0.18        49
       Coronavirus       0.28      0.12      0.17        41
          AskIndia       0.88      0.48      0.62        48
  Business/Finance       0.48      0.23      0.31        53
              Food       0.42      0.10      0.16        52
       Photography       0.74      0.42      0.53        48
            Sports       0.76      0.25      0.37        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.40       555
         macro avg       0.52      0.37      0.37       555
      weighted avg       0.56      0.40      0.40       555

Results of Random Forest

  'precision', 'predicted', average, warn_for)


accuracy 0.4072072072072072
                    precision    recall  f1-score   support

               AMA       0.79      0.72      0.75        57
    Policy/Economy       0.33      0.75      0.45        40
     Non Political       0.74      0.52      0.61        54
Science/Technology       0.17      0.83      0.28        54
          Politics       0.75      0.06      0.11        49
       Coronavirus       0.38      0.15      0.21        41
          AskIndia       0.95      0.38      0.54        48
  Business/Finance       0.73      0.36      0.48        53
              Food       0.60      0.06      0.11        52
       Photography       0.78      0.44      0.56        48
            Sports       1.00      0.23      0.37        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.41       555
         macro avg       0.60      0.37      0.37       555
      weighted avg       0.66      0.41      0.41       555

Results o

  'precision', 'predicted', average, warn_for)


accuracy 0.2972972972972973
                    precision    recall  f1-score   support

               AMA       0.78      0.51      0.62        57
    Policy/Economy       0.43      0.38      0.40        40
     Non Political       0.76      0.35      0.48        54
Science/Technology       0.17      0.83      0.28        54
          Politics       0.20      0.02      0.04        49
       Coronavirus       0.33      0.02      0.05        41
          AskIndia       0.69      0.38      0.49        48
  Business/Finance       0.33      0.21      0.26        53
              Food       0.11      0.21      0.14        52
       Photography       0.59      0.27      0.37        48
            Sports       0.67      0.04      0.07        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.30       555
         macro avg       0.42      0.27      0.27       555
      weighted avg       0.46      0.30      0.29       555

Results o

  'precision', 'predicted', average, warn_for)


accuracy 0.2864864864864865
                    precision    recall  f1-score   support

               AMA       0.93      0.47      0.63        57
    Policy/Economy       0.20      0.78      0.32        40
     Non Political       0.85      0.31      0.46        54
Science/Technology       0.17      0.83      0.28        54
          Politics       0.60      0.06      0.11        49
       Coronavirus       0.00      0.00      0.00        41
          AskIndia       0.48      0.23      0.31        48
  Business/Finance       0.50      0.15      0.23        53
              Food       0.11      0.04      0.06        52
       Photography       0.50      0.12      0.20        48
            Sports       0.67      0.15      0.25        53
     [R]eddiquette       1.00      0.17      0.29         6

          accuracy                           0.29       555
         macro avg       0.50      0.28      0.26       555
      weighted avg       0.48      0.29      0.27       555

Results o

  'precision', 'predicted', average, warn_for)


#### The below code prints the test accuracy for all the ML models taken into consideration when only the post url is considered as a feature

In [62]:
print("Flair Detection using URL as Feature")
train_test(url, to_predict)

Flair Detection using URL as Feature
Results of Naive Bayes Classifier
accuracy 0.2954954954954955
                    precision    recall  f1-score   support

               AMA       0.00      0.00      0.00        57
    Policy/Economy       0.13      0.95      0.24        40
     Non Political       0.71      0.19      0.29        54
Science/Technology       0.41      0.74      0.53        54
          Politics       0.58      0.31      0.40        49
       Coronavirus       0.31      0.27      0.29        41
          AskIndia       0.26      0.19      0.22        48
  Business/Finance       0.50      0.17      0.25        53
              Food       0.66      0.40      0.50        52
       Photography       0.64      0.15      0.24        48
            Sports       0.80      0.08      0.14        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.30       555
         macro avg       0.42      0.29      0.26       555

  'precision', 'predicted', average, warn_for)


accuracy 0.3387387387387387
                    precision    recall  f1-score   support

               AMA       0.00      0.00      0.00        57
    Policy/Economy       0.13      0.95      0.24        40
     Non Political       0.54      0.24      0.33        54
Science/Technology       0.74      0.72      0.73        54
          Politics       0.64      0.33      0.43        49
       Coronavirus       0.32      0.24      0.28        41
          AskIndia       0.26      0.21      0.23        48
  Business/Finance       0.43      0.25      0.31        53
              Food       0.66      0.52      0.58        52
       Photography       0.63      0.25      0.36        48
            Sports       0.83      0.19      0.31        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.34       555
         macro avg       0.43      0.32      0.32       555
      weighted avg       0.47      0.34      0.34       555

Results o

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy 0.34234234234234234
                    precision    recall  f1-score   support

               AMA       0.00      0.00      0.00        57
    Policy/Economy       0.14      0.95      0.24        40
     Non Political       0.55      0.22      0.32        54
Science/Technology       0.67      0.72      0.70        54
          Politics       0.70      0.33      0.44        49
       Coronavirus       0.36      0.22      0.27        41
          AskIndia       0.31      0.29      0.30        48
  Business/Finance       0.37      0.21      0.27        53
              Food       0.66      0.52      0.58        52
       Photography       0.73      0.23      0.35        48
            Sports       0.81      0.25      0.38        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.34       555
         macro avg       0.44      0.33      0.32       555
      weighted avg       0.48      0.34      0.35       555

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.31351351351351353
                    precision    recall  f1-score   support

               AMA       0.00      0.00      0.00        57
    Policy/Economy       0.00      0.00      0.00        40
     Non Political       0.12      0.67      0.21        54
Science/Technology       0.59      0.76      0.66        54
          Politics       0.43      0.45      0.44        49
       Coronavirus       0.25      0.10      0.14        41
          AskIndia       0.44      0.25      0.32        48
  Business/Finance       0.40      0.23      0.29        53
              Food       0.66      0.48      0.56        52
       Photography       0.71      0.25      0.37        48
            Sports       0.91      0.19      0.31        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.31       555
         macro avg       0.38      0.28      0.27       555
      weighted avg       0.41      0.31      0.30       555

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.25585585585585585
                    precision    recall  f1-score   support

               AMA       0.00      0.00      0.00        57
    Policy/Economy       0.00      0.00      0.00        40
     Non Political       0.50      0.19      0.27        54
Science/Technology       0.55      0.69      0.61        54
          Politics       0.69      0.18      0.29        49
       Coronavirus       0.42      0.12      0.19        41
          AskIndia       0.57      0.08      0.15        48
  Business/Finance       0.11      0.64      0.19        53
              Food       0.30      0.46      0.36        52
       Photography       0.30      0.19      0.23        48
            Sports       0.77      0.19      0.30        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.26       555
         macro avg       0.35      0.23      0.22       555
      weighted avg       0.38      0.26      0.24       555

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.26846846846846845
                    precision    recall  f1-score   support

               AMA       0.00      0.00      0.00        57
    Policy/Economy       0.08      0.03      0.04        40
     Non Political       0.12      0.67      0.21        54
Science/Technology       0.89      0.61      0.73        54
          Politics       0.83      0.31      0.45        49
       Coronavirus       0.38      0.20      0.26        41
          AskIndia       0.00      0.00      0.00        48
  Business/Finance       0.56      0.09      0.16        53
              Food       0.29      0.73      0.41        52
       Photography       0.18      0.08      0.11        48
            Sports       1.00      0.17      0.29        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.27       555
         macro avg       0.36      0.24      0.22       555
      weighted avg       0.40      0.27      0.24       555

Results 

  'precision', 'predicted', average, warn_for)


accuracy 0.33153153153153153
                    precision    recall  f1-score   support

               AMA       0.00      0.00      0.00        57
    Policy/Economy       0.00      0.00      0.00        40
     Non Political       0.13      0.72      0.22        54
Science/Technology       0.88      0.69      0.77        54
          Politics       0.46      0.49      0.48        49
       Coronavirus       0.30      0.15      0.20        41
          AskIndia       0.42      0.27      0.33        48
  Business/Finance       0.44      0.26      0.33        53
              Food       0.64      0.54      0.58        52
       Photography       0.63      0.25      0.36        48
            Sports       0.85      0.21      0.33        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.33       555
         macro avg       0.40      0.30      0.30       555
      weighted avg       0.43      0.33      0.33       555

Results 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### The below code prints the test accuracy for all the ML models taken into consideration when only the post comments are considered as a feature

In [63]:
print("Flair Detection using Comments as Feature")
train_test(comments, to_predict)

Flair Detection using Comments as Feature
Results of Naive Bayes Classifier
accuracy 0.41621621621621624
                    precision    recall  f1-score   support

               AMA       0.25      0.77      0.38        57
    Policy/Economy       0.19      0.35      0.24        40
     Non Political       0.58      0.39      0.47        54
Science/Technology       0.28      0.28      0.28        54
          Politics       0.74      0.47      0.58        49
       Coronavirus       0.33      0.05      0.09        41
          AskIndia       0.96      0.50      0.66        48
  Business/Finance       0.51      0.49      0.50        53
              Food       0.48      0.52      0.50        52
       Photography       0.71      0.42      0.53        48
            Sports       0.83      0.28      0.42        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.42       555
         macro avg       0.49      0.38      0.39    

  'precision', 'predicted', average, warn_for)


accuracy 0.5135135135135135
                    precision    recall  f1-score   support

               AMA       0.74      0.60      0.66        57
    Policy/Economy       0.27      0.42      0.33        40
     Non Political       0.62      0.46      0.53        54
Science/Technology       0.40      0.61      0.48        54
          Politics       0.65      0.57      0.61        49
       Coronavirus       0.18      0.44      0.26        41
          AskIndia       0.80      0.58      0.67        48
  Business/Finance       0.60      0.47      0.53        53
              Food       0.59      0.46      0.52        52
       Photography       0.77      0.48      0.59        48
            Sports       0.96      0.47      0.63        53
     [R]eddiquette       1.00      0.83      0.91         6

          accuracy                           0.51       555
         macro avg       0.63      0.53      0.56       555
      weighted avg       0.62      0.51      0.54       555

Results o



accuracy 0.5171171171171172
                    precision    recall  f1-score   support

               AMA       0.71      0.72      0.71        57
    Policy/Economy       0.30      0.33      0.31        40
     Non Political       0.53      0.52      0.52        54
Science/Technology       0.36      0.56      0.44        54
          Politics       0.55      0.55      0.55        49
       Coronavirus       0.18      0.20      0.19        41
          AskIndia       0.65      0.62      0.64        48
  Business/Finance       0.57      0.47      0.52        53
              Food       0.57      0.52      0.55        52
       Photography       0.52      0.48      0.50        48
            Sports       0.81      0.57      0.67        53
     [R]eddiquette       0.83      0.83      0.83         6

          accuracy                           0.52       555
         macro avg       0.55      0.53      0.54       555
      weighted avg       0.54      0.52      0.52       555

Results o

  'precision', 'predicted', average, warn_for)


accuracy 0.47387387387387386
                    precision    recall  f1-score   support

               AMA       0.76      0.67      0.71        57
    Policy/Economy       0.22      0.33      0.26        40
     Non Political       0.60      0.52      0.55        54
Science/Technology       0.32      0.48      0.38        54
          Politics       0.54      0.51      0.53        49
       Coronavirus       0.12      0.20      0.15        41
          AskIndia       0.80      0.67      0.73        48
  Business/Finance       0.47      0.36      0.41        53
              Food       0.44      0.35      0.39        52
       Photography       0.59      0.46      0.52        48
            Sports       0.81      0.55      0.65        53
     [R]eddiquette       0.50      0.83      0.62         6

          accuracy                           0.47       555
         macro avg       0.51      0.49      0.49       555
      weighted avg       0.53      0.47      0.49       555

Results 

#### The below code prints the test accuracy for all the ML models taken into consideration when the post title, post body, post url and post comments are considered as a feature

In [64]:
print("Flair Detection using Combined Features")
train_test(combined, to_predict)

Flair Detection using Combined Features
Results of Naive Bayes Classifier
accuracy 0.6252252252252253
                    precision    recall  f1-score   support

               AMA       0.48      0.81      0.61        57
    Policy/Economy       0.31      0.57      0.40        40
     Non Political       0.65      0.72      0.68        54
Science/Technology       0.88      0.65      0.74        54
          Politics       0.79      0.63      0.70        49
       Coronavirus       0.80      0.29      0.43        41
          AskIndia       0.94      0.62      0.75        48
  Business/Finance       0.53      0.79      0.63        53
              Food       0.64      0.65      0.65        52
       Photography       0.73      0.62      0.67        48
            Sports       1.00      0.47      0.64        53
     [R]eddiquette       0.00      0.00      0.00         6

          accuracy                           0.63       555
         macro avg       0.65      0.57      0.58       

  'precision', 'predicted', average, warn_for)


accuracy 0.7567567567567568
                    precision    recall  f1-score   support

               AMA       0.86      0.75      0.80        57
    Policy/Economy       0.52      0.82      0.63        40
     Non Political       0.73      0.69      0.70        54
Science/Technology       0.87      0.85      0.86        54
          Politics       0.88      0.86      0.87        49
       Coronavirus       0.58      0.73      0.65        41
          AskIndia       0.95      0.73      0.82        48
  Business/Finance       0.69      0.70      0.69        53
              Food       0.67      0.75      0.71        52
       Photography       0.81      0.81      0.81        48
            Sports       0.97      0.66      0.79        53
     [R]eddiquette       1.00      0.67      0.80         6

          accuracy                           0.76       555
         macro avg       0.79      0.75      0.76       555
      weighted avg       0.78      0.76      0.76       555

Results o



accuracy 0.781981981981982
                    precision    recall  f1-score   support

               AMA       0.82      0.86      0.84        57
    Policy/Economy       0.59      0.75      0.66        40
     Non Political       0.72      0.67      0.69        54
Science/Technology       0.89      0.87      0.88        54
          Politics       0.86      0.88      0.87        49
       Coronavirus       0.72      0.76      0.74        41
          AskIndia       0.87      0.83      0.85        48
  Business/Finance       0.67      0.64      0.65        53
              Food       0.77      0.77      0.77        52
       Photography       0.73      0.79      0.76        48
            Sports       0.98      0.77      0.86        53
     [R]eddiquette       1.00      0.83      0.91         6

          accuracy                           0.78       555
         macro avg       0.80      0.79      0.79       555
      weighted avg       0.79      0.78      0.78       555

Results of

#### Gradient Boost and Random Forest gave the best accuracy and we have saved those models for future predictions. 

In [65]:
X_train, X_test, y_train, y_test = train_test_split(combined, to_predict, test_size = 0.25, random_state = 42)
ranfor = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),])
ranfor_fit = ranfor.fit(X_train, y_train)
pickle.dump(ranfor_fit, open('Random Forest.pkl', 'wb'))
y_pred = ranfor.predict(X_test)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(combined, to_predict, test_size = 0.25, random_state = 42)
gb_classifier = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.1)),])
gb_classifier_fit = gb_classifier.fit(X_train, y_train)
pickle.dump(gb_classifier_fit, open('Gradient Boosting.pkl', 'wb'))
y_pred = gb_classifier.predict(X_test)

In [68]:
load_model = pickle.load(open('Gradient Boosting.pkl', 'rb'))
result = load_model.score(X_test, y_test)
print(result)

0.836036036036036


#### We have got a testing accuracy of about 84% when we use the Gradient Boost algorithm. Now we'll load the model and predict the flair of a completely unknown Reddit post

#### Loading the Model

In [69]:
model = pickle.load(open('Gradient Boosting.pkl', 'rb'))

#### Predicting Reddit Flair

In [70]:
reddit = praw.Reddit(client_id = '2d0GPjug_U7kaQ', client_secret = 'slwg95MGliJJAFwnh6kK7XziIY8', user_agent = 'Test_API', username = "Shaurya_L", password = "123456")

def prediction(url):
    submission = reddit.submission(url = url)
    data = {}
    data['Post_Title'] = str(submission.title)
    data['Post_URL'] = str(submission.url)
    data['Post_Body'] = str(submission.selftext)

    submission.comments.replace_more(limit = None)
    comment = ''
    count = 0
    
    for top_comment in submission.comments:
        comment = comment + ' ' + top_comment.body
        count+=1
        if(count > 10):
            break

    data["Post_Comments"] = str(comment)

    data['Post_Title'] = clean_text(str(data['Post_Title']))
    data['Post_Body'] = clean_text(str(data['Post_Body']))
    data['Post_Comments'] = clean_text(str(data['Post_Comments']))

    combined_features = data['Post_Title'] + data['Post_Comments'] + data['Post_Body'] + data['Post_URL']

    return model.predict([combined_features])

In [71]:
url = 'https://www.reddit.com/r/politics/comments/g4izpt/kentucky_reports_highest_coronavirus_infection/'
prediction(url)

array(['Coronavirus'], dtype=object)

#### The model was correctly able to predict the flair of the Reddit post as 'Coronavirus' 