<a href="https://colab.research.google.com/github/RachitBansal/RedditFlairDetector/blob/master/3_Modelling_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML Models for Reddit Flair Detection

In this notebook, we will train and test various ML Models on r/India subreddit data [collected](https://colab.research.google.com/drive/18kM91eTDQ7FlJvQ5oZeR_1kW77kc4HM1#scrollTo=7DtIFLibMBiI) and [preprocessed](https://colab.research.google.com/drive/1Ee_jw9_awzBUfF923tx5yfC6hsGIyvaL) previously.

## Importing the necessary dependencies

In [0]:
import pandas as pd
import numpy as np

In [0]:
from google.colab import drive
drive.mount('drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at drive


In [0]:
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/85/41/c3dfd5feb91a8d587ed1a59f553f07c05f95ad4e5d00ab78702fbf8fe48a/contractions-0.0.24-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 2.9MB/s 
[?25hCollecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 50.7MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  

In [0]:
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from nltk.corpus import stopwords
import re
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import contractions
import pickle

  import pandas.util.testing as tm


In [0]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

In [0]:
nltk.download('all')

## Preprocessing the data using RegExp

In [0]:
wnl = WordNetLemmatizer()
remove =set(stopwords.words('english'))

def process(words):
    words =str(words)
    words=re.sub('([.,////])',' ',words)
    words=re.sub('\[.*?\]', '', words)
    words = words.replace('\n', ' ')
    try:
      words = contractions.fix(words)
    except:
      pass
    word_list = nltk.word_tokenize(re.sub(r'([^a-z A-Z])', '', words.lower()))
    comment = ' '.join([wnl.lemmatize(w) for w in word_list if w not in remove])
    
    return comment

def processURL(words):
  # words = words.split('://')[1]
  words = words.split('/')
  seq = ' '.join(words)
  seq = re.sub("[^a-zA-Z]", " ", seq)
  seq = re.sub(" +", " ", seq).strip()
  return seq

In [0]:
import os
os.listdir

<function posix.listdir>

In [0]:
data = pd.read_csv('./drive/My Drive/rMIDAS_bal_2.csv')

In [0]:
data['title'] = data['title'].apply(process)
data['selftext'] = data['selftext'].apply(process)
data['url'] = data['url'].apply(processURL)

## Logistic Regression

In [0]:
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

def lr(X,y):
    from sklearn.linear_model import LogisticRegression
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=100)
    logreg = make_pipeline(CountVectorizer(), TfidfTransformer(), LogisticRegression(C=1, penalty = 'l1', solver = 'liblinear'))
    
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    train_acc = logreg.score(X_train, y_train)
    valid_acc = accuracy_score(y_pred, y_test)

    print(f'\tTrain Acc: {train_acc:.3f}')
    print(f'\tVal. Acc: {valid_acc:.3f}')
    print(confusion_matrix(y_test,y_pred))
    
    print(classification_report(y_test, y_pred))

In [0]:
data['total'] = data['title'] + ' ' + data['url']
X=data['total']
y=data['link_flair_text']

lr(X,y)

	Train Acc: 0.659
	Val. Acc: 0.626
[[ 6990   117    78     9  1185    45    29    99   404   102    25    18]
 [  368  1901     7     4   660    10    11   415   117   157     6     3]
 [  143     3   851     0   228     1    14    20   119    12     1     0]
 [   68     7     0   165   582    13     7     9    50    16     5     0]
 [ 1929   253   137    46  9623    78   187   474  1829   342   120    29]
 [   67    11     3     7   298   347    19    20   150    26    13     1]
 [   35     3     7     2   318     8   385     1    42     5     5     0]
 [  308   266    34     3   845    13     7  2851   935   101    10     7]
 [  661    62    72     7  2139    24    73   497 11384    55    41    22]
 [  374   195    26     5   870    26    19   103   131  1381     8    10]
 [   76     4     1     6   318     4     3     4    51     6   745     0]
 [  611    27     2     8   816     5    12    77   375    30    11   176]]
                    precision    recall  f1-score   support

   

In [0]:
data['total'] = data['title'] 
X=data['total']
y=data['link_flair_text']

lr(X,y)

	Train Acc: 0.637
	Val. Acc: 0.595
[[ 5773   189    75    12  2067    39    24   175   554   146    33    14]
 [  339  1786     6     2   770     5     4   442   128   166     7     4]
 [  107     5   795     0   323     1     2    17   126    14     1     1]
 [   70     7     0   122   615     3     7     8    66    15     9     0]
 [ 1731   307   105    44  9748    31   109   482  1983   363   113    31]
 [   80    14     0     3   346   285     8    26   162    23    15     0]
 [   37     4     6     0   420     5   277     4    40    14     4     0]
 [  262   263    30     2   994     7     3  2748   950   104    11     6]
 [  440    63    52     9  2406    19    14   522 11394    68    37    13]
 [  351   201    29     0  1001    12     7   118   129  1287     7     6]
 [   62     6     2     4   353     6     3     6    64     6   706     0]
 [  474    35     2     9   991     5     8    87   391    28    13   107]]
                    precision    recall  f1-score   support

   

## Naive Bayes

In [0]:
def NaiveBayes(X,y):
  
    from sklearn.naive_bayes import MultinomialNB
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=100)

    nb = make_pipeline(CountVectorizer(),TfidfTransformer(),MultinomialNB(alpha=2))
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    train_acc = nb.score(X_train, y_train)
    valid_acc = accuracy_score(y_pred, y_test)

    print(f'\tTrain Acc: {train_acc:.3f}')
    print(f'\tVal. Acc: {valid_acc:.3f}')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test, y_pred))

    pickle.dump(nb, open("./drive/My Drive/model_nb.pkl", "wb"))

In [0]:
data['total'] = data['title'] + ' ' + data['url']
X=data['total']
y=data['link_flair_text']

NaiveBayes(X,y)

	Train Acc: 0.530
	Val. Acc: 0.496
[[ 6397    14     0     0  5976     7     0    13  1308     2     0     1]
 [  338   788     0     0  3315     0     0   223   777     1     0     0]
 [   88     0     2     0  1354     1     0     2   634     0     0     0]
 [   31     0     0     0  1180     0     0     1   142     1     0     0]
 [ 1254     3     0     0 16230     2     0    92  5008     9     0     0]
 [   90     1     0     0   881    61     0     2   362     0     0     0]
 [    6     0     0     0  1127     3     0     0    84     0     0     0]
 [  181     0     0     0  2670     1     0   900  4349     2     0     0]
 [  225     0     0     0  3073     3     0    45 19223     0     0     0]
 [  320     6     0     0  3818     0     0    14   481   116     0     0]
 [   31     2     0     0  1469     1     0     0   339     0     6     0]
 [  455     2     0     0  1794     1     0     9   892     0     0     1]]


  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

          AskIndia       0.68      0.47      0.55     13718
  Business/Finance       0.97      0.14      0.25      5442
       Coronavirus       1.00      0.00      0.00      2081
     Entertainment       0.00      0.00      0.00      1355
     Non-Political       0.38      0.72      0.50     22598
   Not in English.       0.76      0.04      0.08      1397
       Photography       0.00      0.00      0.00      1220
    Policy/Economy       0.69      0.11      0.19      8103
          Politics       0.57      0.85      0.68     22569
Science/Technology       0.89      0.02      0.05      4755
            Sports       1.00      0.00      0.01      1848
     [R]eddiquette       0.50      0.00      0.00      3154

          accuracy                           0.50     88240
         macro avg       0.62      0.20      0.19     88240
      weighted avg       0.59      0.50      0.43     88240



In [0]:
data['total'] = data['title'] + ' ' + data['selftext'] + ' ' + data['url']
X=data['total']
y=data['link_flair_text']

NaiveBayes(X,y)

	Train Acc: 0.541
	Val. Acc: 0.512
[[ 9202    12     0     0  3649     7     0     6   841     1     0     0]
 [  596   732     0     0  3257     1     0   167   689     0     0     0]
 [  260     0     2     0  1289     1     0     3   526     0     0     0]
 [   59     0     0     0  1170     0     0     0   125     1     0     0]
 [ 2469     2     0     0 15603     4     0    72  4443     5     0     0]
 [   89     1     0     0   898    36     0     1   372     0     0     0]
 [   22     0     0     0  1126     1     0     0    71     0     0     0]
 [  422     0     0     0  2855     1     0   763  4061     1     0     0]
 [  522     0     0     0  3221     3     0    34 18789     0     0     0]
 [  581     3     0     0  3664     0     0     6   411    90     0     0]
 [   49     1     0     0  1493     1     0     0   299     0     5     0]
 [  802     1     0     0  1560     1     0     5   785     0     0     0]]


  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

          AskIndia       0.61      0.67      0.64     13718
  Business/Finance       0.97      0.13      0.24      5442
       Coronavirus       1.00      0.00      0.00      2081
     Entertainment       0.00      0.00      0.00      1355
     Non-Political       0.39      0.69      0.50     22598
   Not in English.       0.64      0.03      0.05      1397
       Photography       0.00      0.00      0.00      1220
    Policy/Economy       0.72      0.09      0.17      8103
          Politics       0.60      0.83      0.70     22569
Science/Technology       0.92      0.02      0.04      4755
            Sports       1.00      0.00      0.01      1848
     [R]eddiquette       0.00      0.00      0.00      3154

          accuracy                           0.51     88240
         macro avg       0.57      0.21      0.19     88240
      weighted avg       0.58      0.51      0.44     88240



## SVM Classifier

In [0]:
def svm(X,y):
    from sklearn.linear_model import SGDClassifier
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=100)

    sgd = make_pipeline(CountVectorizer(),TfidfTransformer(),SGDClassifier(alpha=0.001, penalty='elasticnet'))

    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)
    
    train_acc = sgd.score(X_train, y_train)
    valid_acc = accuracy_score(y_pred, y_test)

    print(f'\tTrain Acc: {train_acc:.3f}')
    print(f'\tVal. Acc: {valid_acc:.3f}')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test, y_pred))

    pickle.dump(sgd, open("./drive/My Drive/model_sgd.pkl", "wb"))

In [0]:
data['total'] = data['title'] + ' ' + data['url']
X=data['total']
y=data['link_flair_text']

svm(X,y)

	Train Acc: 0.497
	Val. Acc: 0.495
[[   29    74     3     4     0     0     0     0   287     0     0     0
      0     0    11    90     0     0     0     0     0     0     0     0]
 [  135  8610    74    96     0     0     0     0  2605     0     0     0
      0     0   286  1861     0     0     0     0     2     0     0     0]
 [  129   686   845    10     0     0     0     0  2054     0     0     0
      0     0   742  1211     0     0     0     0     1     0     0     0]
 [    4   289     2   864     0     0     0     0   407     0     0     0
      0     0     6   527     0     0     0     0     0     0     0     0]
 [   27   116     6     0     0     0     0     0   903     0     0     0
      0     0     9   276     0     0     0     0     0     0     0     0]
 [   31   202     5     7     0     0     0     0   611     0     0     0
      0     0    10   149     0     0     0     0     2     0     0     0]
 [    0   168     0     8     0     0     0     0    38     0     0    

  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

            All CAPS.       0.02      0.06      0.03       498
             AskIndia       0.41      0.63      0.50     13669
     Business/Finance       0.70      0.15      0.25      5678
          Coronavirus       0.64      0.41      0.50      2099
        Entertainment       0.00      0.00      0.00      1337
                 Food       0.00      0.00      0.00      1017
Low-effort self-post.       0.00      0.00      0.00       351
                Meta.       0.00      0.00      0.00       162
        Non-Political       0.51      0.47      0.49     45272
  Not Original Title.       0.00      0.00      0.00       618
     Not about India.       0.00      0.00      0.00       769
      Not in English.       0.00      0.00      0.00      1389
          Photography       0.00      0.00      0.00      1278
               Policy       0.00      0.00      0.00       698
       Policy/Economy       0.42      0.27      0.33  

In [0]:
data['total'] = data['title']
X=data['total']
y=data['link_flair_text']

svm(X,y)

	Train Acc: 0.399
	Val. Acc: 0.396
[[    0   384     2     4     0     0     0     0    47     0     0     0
      0     0     3    58     0     0     0     0     0     0     0     0]
 [    0 10046    67    95     0     4     0     0  1666     0     0     0
      0     0   212  1576     0     0     1     0     2     0     0     0]
 [    0  3573   650    10     0     1     0     0   527     0     0     0
      0     0   236   680     0     0     0     0     1     0     0     0]
 [    0   701     2   863     0     0     0     0   113     0     0     0
      0     0     4   416     0     0     0     0     0     0     0     0]
 [    0   754     3     0     0     0     0     0   390     0     0     0
      0     0     7   183     0     0     0     0     0     0     0     0]
 [    0   749     6     5     0    88     0     0   100     0     0     0
      0     0     1    66     0     0     0     0     2     0     0     0]
 [    0   171     0     8     0     0     0     0    44     0     0    

  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

            All CAPS.       0.00      0.00      0.00       498
             AskIndia       0.16      0.73      0.27     13669
     Business/Finance       0.58      0.11      0.19      5678
          Coronavirus       0.65      0.41      0.50      2099
        Entertainment       0.00      0.00      0.00      1337
                 Food       0.77      0.09      0.16      1017
Low-effort self-post.       0.00      0.00      0.00       351
                Meta.       0.00      0.00      0.00       162
        Non-Political       0.58      0.24      0.34     45272
  Not Original Title.       0.00      0.00      0.00       618
     Not about India.       0.00      0.00      0.00       769
      Not in English.       0.00      0.00      0.00      1389
          Photography       0.00      0.00      0.00      1278
               Policy       0.00      0.00      0.00       698
       Policy/Economy       0.50      0.21      0.30  

In [0]:
data['total'] = data['title'] + ' ' + data['selftext'] + ' ' + data['url']
X=data['total']
y=data['link_flair_text']

svm(X,y)

	Train Acc: 0.486
	Val. Acc: 0.485
[[    0    56     0     4     0     0     0     0   308     0     0     0
      0    22     1   107     0     0     0     0     0     0     0     0]
 [    0  1988    42   117     0     0     0     0  9046     0     0     0
      0    63   146  2266     0     0     0     0     1     0     0     0]
 [    0   255   805    11     0     0     0     0  3134     0     0     0
      0   198   165  1110     0     0     0     0     0     0     0     0]
 [    0    56     1   898     0     0     0     0   516     0     0     0
      0    57     4   567     0     0     0     0     0     0     0     0]
 [    0    15     1     0     0     0     0     0  1018     0     0     0
      0    14     3   286     0     0     0     0     0     0     0     0]
 [    0    50     3     7     0     2     0     0   774     0     0     0
      0    17     0   162     0     0     0     0     2     0     0     0]
 [    0    84     0     8     0     0     0     0   152     0     0    

  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

            All CAPS.       0.00      0.00      0.00       498
             AskIndia       0.45      0.15      0.22     13669
     Business/Finance       0.87      0.14      0.24      5678
          Coronavirus       0.63      0.43      0.51      2099
        Entertainment       0.00      0.00      0.00      1337
                 Food       1.00      0.00      0.00      1017
Low-effort self-post.       0.00      0.00      0.00       351
                Meta.       0.00      0.00      0.00       162
        Non-Political       0.45      0.60      0.52     45272
  Not Original Title.       0.00      0.00      0.00       618
     Not about India.       0.00      0.00      0.00       769
      Not in English.       0.00      0.00      0.00      1389
          Photography       0.00      0.00      0.00      1278
               Policy       0.01      0.02      0.01       698
       Policy/Economy       0.54      0.15      0.24  

In [0]:
data['total'] = data['title']
X=data['total']
y=data['link_flair_text']

svm(X,y)

	Train Acc: 0.496
	Val. Acc: 0.494
[[    0    22     2    11     0     0     0     0   399     0     0     0
      0     0     9    53     0     0     0     0     2     0     0     0]
 [    0  2194   309   446     0     4     0     0  8942     0     0     0
      0     0   242  1520     0     0     1     0    11     0     0     0]
 [    0   234  1004   124     0     1     0     0  3210     0     0     0
      0     0   456   646     0     0     0     0     3     0     0     0]
 [    0    42    11   868     0     0     0     0   749     0     0     0
      0     0     8   421     0     0     0     0     0     0     0     0]
 [    0    68     6    12     0     0     0     0  1070     0     0     0
      0     0     6   174     0     0     0     0     1     0     0     0]
 [    0   108    12    14     0    89     0     0   717     0     0     0
      0     0     9    68     0     0     0     0     0     0     0     0]
 [    0    25     3    10     0     0     0     0   186     0     0    

  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

            All CAPS.       0.00      0.00      0.00       498
             AskIndia       0.31      0.16      0.21     13669
     Business/Finance       0.40      0.18      0.25      5678
          Coronavirus       0.27      0.41      0.32      2099
        Entertainment       0.00      0.00      0.00      1337
                 Food       0.79      0.09      0.16      1017
Low-effort self-post.       0.00      0.00      0.00       351
                Meta.       0.00      0.00      0.00       162
        Non-Political       0.45      0.69      0.55     45272
  Not Original Title.       0.00      0.00      0.00       618
     Not about India.       0.00      0.00      0.00       769
      Not in English.       0.00      0.00      0.00      1389
          Photography       0.00      0.00      0.00      1278
               Policy       0.00      0.00      0.00       698
       Policy/Economy       0.46      0.24      0.32  

## Random Forest

In [0]:
def randomForest(X,y):
  
    from sklearn.ensemble import RandomForestClassifier

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    rf = make_pipeline(CountVectorizer(),TfidfTransformer(), RandomForestClassifier(criterion = "gini", n_estimators=50, bootstrap=True, max_features='sqrt'))
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    
    train_acc = rf.score(X_train, y_train)
    valid_acc = accuracy_score(y_pred, y_test)

    print(f'\tTrain Acc: {train_acc:.3f}')
    print(f'\tVal. Acc: {valid_acc:.3f}')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test, y_pred))

    pickle.dump(rf, open("./drive/My Drive/model_rf.pkl", "wb"))

In [0]:
data['total'] = data['title']
X=data['total']
y=data['link_flair_text']

randomForest(X,y)

	Train Acc: 0.991
	Val. Acc: 0.574
[[ 9433   152    72     7  2905    85    23   193   733    88    23    31]
 [  535  2280    20     4  1561    12     2   585   335   155    10     6]
 [  206     2   993     0   524    10     1    17   267     5     1     7]
 [  141    10     0   147   874    12     5    11   111    11     9     1]
 [ 3074   281   151    30 14866    72    93   554  3018   297   132    53]
 [  104    15     5     5   484   509    12    32   215    16    10     2]
 [  100    11     6     0   770     8   322     5    63    24     0     1]
 [  478   218    41     2  1948    19     4  3702  1718    77     4     8]
 [  957    64    61     9  4558    69    12   509 16019    58    45    30]
 [  682   191    37     3  1972    33     9   153   267  1306    12     8]
 [  126     8     7     7   727     9     5    15   177     5   838     1]
 [  753    24     0     3  1379    13     7    74   549    35     9   231]]
                    precision    recall  f1-score   support

   

In [0]:
data['total'] = data['title'] + ' ' + data['url']
X=data['total']
y=data['link_flair_text']

randomForest(X,y)

	Train Acc: 0.998
	Val. Acc: 0.588
[[11346    81    49     0  1717    52    12    52   438    51    12    17]
 [  634  2328     7     3  1363    13     3   626   357   138     3    12]
 [  367     8   900     1   475     1     4    15   349    10     1     1]
 [  135     8     0   131   889    17     3    14   116     4     2     0]
 [ 3606   219    90    35 13848    71    80   492  3696   277    90    31]
 [   70    18     3     9   539   461    13    15   199    23     7     3]
 [   98     9     8     1   636     8   440     0    87     7     1     0]
 [  614   229    25     5  1607    17     5  3553  1972    79     4     7]
 [ 1354    57    39     7  3822    43    19   536 16535    44    31    38]
 [  690   202    27     5  1910    25     4   146   370  1347     6    10]
 [  158     7     4     2   682     4     2    12   159     7   772     1]
 [ 1038    23     3     1  1056     6     9    59   604    21     4   267]]
                    precision    recall  f1-score   support

   